@elizaos/plugin-local-inference 2.0.0-beta.1 → 2.0.11-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +83 -0
- package/package.json +81 -15
- package/src/actions/generate-media.d.ts +59 -0
- package/src/actions/generate-media.d.ts.map +1 -0
- package/src/actions/generate-media.ts +647 -0
- package/src/actions/identify-speaker.d.ts +23 -0
- package/src/actions/identify-speaker.d.ts.map +1 -0
- package/src/actions/identify-speaker.ts +171 -0
- package/src/adapters/capacitor-llama/__tests__/compat-behavior.test.ts +218 -0
- package/src/adapters/capacitor-llama/__tests__/index.test.ts +68 -0
- package/src/adapters/capacitor-llama/__tests__/structured-output.test.ts +215 -0
- package/src/adapters/capacitor-llama/__tests__/text-streaming.test.ts +174 -0
- package/src/adapters/capacitor-llama/environment.ts +71 -0
- package/src/adapters/capacitor-llama/index.browser.ts +83 -0
- package/src/adapters/capacitor-llama/index.ts +807 -0
- package/src/adapters/capacitor-llama/loader.ts +109 -0
- package/src/adapters/capacitor-llama/structured-output.ts +165 -0
- package/src/adapters/capacitor-llama/text-streaming.ts +227 -0
- package/src/adapters/capacitor-llama/types.ts +374 -0
- package/src/backends/apple-foundation.ts +127 -0
- package/src/index.d.ts +7 -0
- package/src/index.d.ts.map +1 -0
- package/src/index.ts +54 -0
- package/src/local-inference-routes.d.ts +38 -0
- package/src/local-inference-routes.d.ts.map +1 -0
- package/src/local-inference-routes.test.ts +344 -0
- package/src/local-inference-routes.ts +1543 -0
- package/src/provider.d.ts +21 -0
- package/src/provider.d.ts.map +1 -0
- package/src/provider.ts +1171 -0
- package/src/routes/compat-helpers.d.ts +18 -0
- package/src/routes/compat-helpers.d.ts.map +1 -0
- package/src/routes/compat-helpers.ts +274 -0
- package/src/routes/family-member-route.d.ts +62 -0
- package/src/routes/family-member-route.d.ts.map +1 -0
- package/src/routes/family-member-route.ts +353 -0
- package/src/routes/index.d.ts +19 -0
- package/src/routes/index.d.ts.map +1 -0
- package/src/routes/index.ts +60 -0
- package/src/routes/live-diarization-route.d.ts +26 -0
- package/src/routes/live-diarization-route.d.ts.map +1 -0
- package/src/routes/live-diarization-route.test.ts +213 -0
- package/src/routes/live-diarization-route.ts +122 -0
- package/src/routes/local-inference-asr-route.d.ts +4 -0
- package/src/routes/local-inference-asr-route.d.ts.map +1 -0
- package/src/routes/local-inference-asr-route.test.ts +190 -0
- package/src/routes/local-inference-asr-route.ts +213 -0
- package/src/routes/local-inference-compat-routes.d.ts +16 -0
- package/src/routes/local-inference-compat-routes.d.ts.map +1 -0
- package/src/routes/local-inference-compat-routes.test.ts +423 -0
- package/src/routes/local-inference-compat-routes.ts +782 -0
- package/src/routes/local-inference-tts-route.d.ts +7 -0
- package/src/routes/local-inference-tts-route.d.ts.map +1 -0
- package/src/routes/local-inference-tts-route.test.ts +179 -0
- package/src/routes/local-inference-tts-route.ts +230 -0
- package/src/routes/voice-first-run-routes.d.ts +62 -0
- package/src/routes/voice-first-run-routes.d.ts.map +1 -0
- package/src/routes/voice-first-run-routes.ts +524 -0
- package/src/routes/voice-models-routes.d.ts +62 -0
- package/src/routes/voice-models-routes.d.ts.map +1 -0
- package/src/routes/voice-models-routes.ts +554 -0
- package/src/routes/voice-profile-plugin-routes.d.ts +19 -0
- package/src/routes/voice-profile-plugin-routes.d.ts.map +1 -0
- package/src/routes/voice-profile-plugin-routes.ts +138 -0
- package/src/routes/voice-profiles-management-routes.d.ts +52 -0
- package/src/routes/voice-profiles-management-routes.d.ts.map +1 -0
- package/src/routes/voice-profiles-management-routes.ts +476 -0
- package/src/routes/voice-speaker-profile-routes.d.ts +57 -0
- package/src/routes/voice-speaker-profile-routes.d.ts.map +1 -0
- package/src/routes/voice-speaker-profile-routes.ts +199 -0
- package/src/runtime/aosp-llama-loader-selection.test.ts +80 -0
- package/src/runtime/capacitor-llama.d.ts +25 -0
- package/src/runtime/embedding-manager-support.d.ts +77 -0
- package/src/runtime/embedding-manager-support.d.ts.map +1 -0
- package/src/runtime/embedding-manager-support.ts +497 -0
- package/src/runtime/embedding-presets.d.ts +16 -0
- package/src/runtime/embedding-presets.d.ts.map +1 -0
- package/src/runtime/embedding-presets.ts +81 -0
- package/src/runtime/embedding-warmup-policy.d.ts +14 -0
- package/src/runtime/embedding-warmup-policy.d.ts.map +1 -0
- package/src/runtime/embedding-warmup-policy.test.ts +53 -0
- package/src/runtime/embedding-warmup-policy.ts +48 -0
- package/src/runtime/ensure-local-inference-handler.d.ts +53 -0
- package/src/runtime/ensure-local-inference-handler.d.ts.map +1 -0
- package/src/runtime/ensure-local-inference-handler.test.ts +528 -0
- package/src/runtime/ensure-local-inference-handler.ts +1398 -0
- package/src/runtime/index.d.ts +14 -0
- package/src/runtime/index.d.ts.map +1 -0
- package/src/runtime/index.ts +27 -0
- package/src/runtime/mobile-local-inference-gate.d.ts +31 -0
- package/src/runtime/mobile-local-inference-gate.d.ts.map +1 -0
- package/src/runtime/mobile-local-inference-gate.test.ts +69 -0
- package/src/runtime/mobile-local-inference-gate.ts +44 -0
- package/src/runtime/voice-entity-binding.d.ts +103 -0
- package/src/runtime/voice-entity-binding.d.ts.map +1 -0
- package/src/runtime/voice-entity-binding.transcript.test.ts +69 -0
- package/src/runtime/voice-entity-binding.ts +328 -0
- package/src/services/README.md +71 -0
- package/src/services/__tests__/backend-selector.test.ts +101 -0
- package/src/services/__tests__/checkpoint-manager.test.ts +376 -0
- package/src/services/__tests__/gpu-autotune.test.ts +400 -0
- package/src/services/__tests__/llm-streaming-binding.test.ts +85 -0
- package/src/services/__tests__/planner-grammar.test.ts +372 -0
- package/src/services/__tests__/runtime-target.test.ts +176 -0
- package/src/services/active-model-switch-rollback.test.ts +183 -0
- package/src/services/active-model.d.ts +282 -0
- package/src/services/active-model.d.ts.map +1 -0
- package/src/services/active-model.ts +1213 -0
- package/src/services/asr/errors.d.ts +21 -0
- package/src/services/asr/errors.d.ts.map +1 -0
- package/src/services/asr/errors.ts +50 -0
- package/src/services/asr/hash.d.ts +28 -0
- package/src/services/asr/hash.d.ts.map +1 -0
- package/src/services/asr/hash.ts +49 -0
- package/src/services/asr/index.d.ts +76 -0
- package/src/services/asr/index.d.ts.map +1 -0
- package/src/services/asr/index.ts +178 -0
- package/src/services/asr/types.d.ts +91 -0
- package/src/services/asr/types.d.ts.map +1 -0
- package/src/services/asr/types.ts +95 -0
- package/src/services/assignments.d.ts +71 -0
- package/src/services/assignments.d.ts.map +1 -0
- package/src/services/assignments.test.ts +80 -0
- package/src/services/assignments.ts +230 -0
- package/src/services/backend-selector.ts +95 -0
- package/src/services/backend.d.ts +346 -0
- package/src/services/backend.d.ts.map +1 -0
- package/src/services/backend.ts +612 -0
- package/src/services/bundled-models.d.ts +34 -0
- package/src/services/bundled-models.d.ts.map +1 -0
- package/src/services/bundled-models.ts +129 -0
- package/src/services/cache-bridge.d.ts +206 -0
- package/src/services/cache-bridge.d.ts.map +1 -0
- package/src/services/cache-bridge.test.ts +516 -0
- package/src/services/cache-bridge.ts +423 -0
- package/src/services/catalog.d.ts +10 -0
- package/src/services/catalog.d.ts.map +1 -0
- package/src/services/catalog.test.ts +240 -0
- package/src/services/catalog.ts +27 -0
- package/src/services/checkpoint-client.d.ts +109 -0
- package/src/services/checkpoint-client.d.ts.map +1 -0
- package/src/services/checkpoint-client.ts +258 -0
- package/src/services/checkpoint-manager.ts +474 -0
- package/src/services/cloud-fallback.d.ts +102 -0
- package/src/services/cloud-fallback.d.ts.map +1 -0
- package/src/services/cloud-fallback.ts +230 -0
- package/src/services/conversation-registry.d.ts +142 -0
- package/src/services/conversation-registry.d.ts.map +1 -0
- package/src/services/conversation-registry.test.ts +235 -0
- package/src/services/conversation-registry.ts +264 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts +92 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts.map +1 -0
- package/src/services/desktop-fused-ffi-backend-runtime.ts +333 -0
- package/src/services/device-bridge.d.ts +188 -0
- package/src/services/device-bridge.d.ts.map +1 -0
- package/src/services/device-bridge.ts +1237 -0
- package/src/services/device-resource-metrics.d.ts +149 -0
- package/src/services/device-resource-metrics.d.ts.map +1 -0
- package/src/services/device-resource-metrics.test.ts +98 -0
- package/src/services/device-resource-metrics.ts +346 -0
- package/src/services/device-tier.d.ts +115 -0
- package/src/services/device-tier.d.ts.map +1 -0
- package/src/services/device-tier.test.ts +371 -0
- package/src/services/device-tier.ts +410 -0
- package/src/services/downloader.d.ts +82 -0
- package/src/services/downloader.d.ts.map +1 -0
- package/src/services/downloader.test.ts +724 -0
- package/src/services/downloader.ts +899 -0
- package/src/services/engine-direct-bundle.test.ts +58 -0
- package/src/services/engine-streaming.test.ts +80 -0
- package/src/services/engine.d.ts +534 -0
- package/src/services/engine.d.ts.map +1 -0
- package/src/services/engine.ts +1891 -0
- package/src/services/ensure-local-artifacts.integration.test.ts +273 -0
- package/src/services/ensure-local-artifacts.test.ts +368 -0
- package/src/services/ensure-local-artifacts.ts +351 -0
- package/src/services/external-scanner.d.ts +17 -0
- package/src/services/external-scanner.d.ts.map +1 -0
- package/src/services/external-scanner.ts +312 -0
- package/src/services/ffi-llm-mock.ts +354 -0
- package/src/services/ffi-llm-streaming-abi.ts +442 -0
- package/src/services/ffi-streaming-backend.d.ts +180 -0
- package/src/services/ffi-streaming-backend.d.ts.map +1 -0
- package/src/services/ffi-streaming-backend.ts +382 -0
- package/src/services/ffi-streaming-runner.d.ts +122 -0
- package/src/services/ffi-streaming-runner.d.ts.map +1 -0
- package/src/services/ffi-streaming-runner.test.ts +60 -0
- package/src/services/ffi-streaming-runner.ts +354 -0
- package/src/services/ffi-unload-ordering.test.ts +162 -0
- package/src/services/gpu-autotune.ts +534 -0
- package/src/services/gpu-detect.ts +139 -0
- package/src/services/handler-registry.d.ts +72 -0
- package/src/services/handler-registry.d.ts.map +1 -0
- package/src/services/handler-registry.ts +240 -0
- package/src/services/hardware.d.ts +63 -0
- package/src/services/hardware.d.ts.map +1 -0
- package/src/services/hardware.test.ts +183 -0
- package/src/services/hardware.ts +404 -0
- package/src/services/hf-search.d.ts +26 -0
- package/src/services/hf-search.d.ts.map +1 -0
- package/src/services/hf-search.test.ts +69 -0
- package/src/services/hf-search.ts +420 -0
- package/src/services/image-description-runtime.d.ts +14 -0
- package/src/services/image-description-runtime.d.ts.map +1 -0
- package/src/services/image-description-runtime.test.ts +61 -0
- package/src/services/image-description-runtime.ts +118 -0
- package/src/services/imagegen/aosp-unavailable.d.ts +134 -0
- package/src/services/imagegen/aosp-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/aosp-unavailable.ts +229 -0
- package/src/services/imagegen/backend-selector.d.ts +118 -0
- package/src/services/imagegen/backend-selector.d.ts.map +1 -0
- package/src/services/imagegen/backend-selector.ts +281 -0
- package/src/services/imagegen/coreml-unavailable.d.ts +105 -0
- package/src/services/imagegen/coreml-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/coreml-unavailable.ts +237 -0
- package/src/services/imagegen/errors.d.ts +16 -0
- package/src/services/imagegen/errors.d.ts.map +1 -0
- package/src/services/imagegen/errors.ts +40 -0
- package/src/services/imagegen/index.d.ts +58 -0
- package/src/services/imagegen/index.d.ts.map +1 -0
- package/src/services/imagegen/index.ts +144 -0
- package/src/services/imagegen/mflux.d.ts +74 -0
- package/src/services/imagegen/mflux.d.ts.map +1 -0
- package/src/services/imagegen/mflux.ts +313 -0
- package/src/services/imagegen/sd-cpp.d.ts +180 -0
- package/src/services/imagegen/sd-cpp.d.ts.map +1 -0
- package/src/services/imagegen/sd-cpp.ts +718 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts +83 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/tensorrt-unavailable.ts +295 -0
- package/src/services/imagegen/types.d.ts +181 -0
- package/src/services/imagegen/types.d.ts.map +1 -0
- package/src/services/imagegen/types.ts +193 -0
- package/src/services/index.d.ts +30 -0
- package/src/services/index.d.ts.map +1 -0
- package/src/services/index.ts +225 -0
- package/src/services/inference-capabilities.d.ts +132 -0
- package/src/services/inference-capabilities.d.ts.map +1 -0
- package/src/services/inference-capabilities.test.ts +75 -0
- package/src/services/inference-capabilities.ts +204 -0
- package/src/services/inference-telemetry.d.ts +59 -0
- package/src/services/inference-telemetry.d.ts.map +1 -0
- package/src/services/inference-telemetry.ts +143 -0
- package/src/services/ios-llama-streaming.ts +248 -0
- package/src/services/kv-spill.d.ts +189 -0
- package/src/services/kv-spill.d.ts.map +1 -0
- package/src/services/kv-spill.test.ts +222 -0
- package/src/services/kv-spill.ts +356 -0
- package/src/services/latency-trace.d.ts +346 -0
- package/src/services/latency-trace.d.ts.map +1 -0
- package/src/services/latency-trace.test.ts +266 -0
- package/src/services/latency-trace.ts +844 -0
- package/src/services/llama-server-metrics.ts +304 -0
- package/src/services/llm-streaming-binding.d.ts +96 -0
- package/src/services/llm-streaming-binding.d.ts.map +1 -0
- package/src/services/llm-streaming-binding.ts +136 -0
- package/src/services/load-args.d.ts +82 -0
- package/src/services/load-args.d.ts.map +1 -0
- package/src/services/load-args.ts +81 -0
- package/src/services/manifest/eliza-1.manifest.v1.json +708 -0
- package/src/services/manifest/index.d.ts +4 -0
- package/src/services/manifest/index.d.ts.map +1 -0
- package/src/services/manifest/index.ts +66 -0
- package/src/services/manifest/manifest.test.ts +693 -0
- package/src/services/manifest/schema.d.ts +715 -0
- package/src/services/manifest/schema.d.ts.map +1 -0
- package/src/services/manifest/schema.ts +655 -0
- package/src/services/manifest/types.d.ts +30 -0
- package/src/services/manifest/types.d.ts.map +1 -0
- package/src/services/manifest/types.ts +55 -0
- package/src/services/manifest/validator.d.ts +66 -0
- package/src/services/manifest/validator.d.ts.map +1 -0
- package/src/services/manifest/validator.ts +569 -0
- package/src/services/memory-arbiter.d.ts +343 -0
- package/src/services/memory-arbiter.d.ts.map +1 -0
- package/src/services/memory-arbiter.test.ts +419 -0
- package/src/services/memory-arbiter.ts +1000 -0
- package/src/services/memory-monitor.d.ts +119 -0
- package/src/services/memory-monitor.d.ts.map +1 -0
- package/src/services/memory-monitor.test.ts +208 -0
- package/src/services/memory-monitor.ts +296 -0
- package/src/services/memory-pressure.d.ts +127 -0
- package/src/services/memory-pressure.d.ts.map +1 -0
- package/src/services/memory-pressure.ts +413 -0
- package/src/services/mtp-doctor.d.ts +13 -0
- package/src/services/mtp-doctor.d.ts.map +1 -0
- package/src/services/mtp-doctor.ts +78 -0
- package/src/services/network-policy.d.ts +127 -0
- package/src/services/network-policy.d.ts.map +1 -0
- package/src/services/network-policy.ts +346 -0
- package/src/services/paths.d.ts +6 -0
- package/src/services/paths.d.ts.map +1 -0
- package/src/services/paths.ts +25 -0
- package/src/services/planner-skeleton.d.ts +124 -0
- package/src/services/planner-skeleton.d.ts.map +1 -0
- package/src/services/planner-skeleton.ts +175 -0
- package/src/services/providers.d.ts +38 -0
- package/src/services/providers.d.ts.map +1 -0
- package/src/services/providers.ts +507 -0
- package/src/services/ram-budget-cache.test.ts +163 -0
- package/src/services/ram-budget.d.ts +110 -0
- package/src/services/ram-budget.d.ts.map +1 -0
- package/src/services/ram-budget.ts +0 -0
- package/src/services/readiness.d.ts +9 -0
- package/src/services/readiness.d.ts.map +1 -0
- package/src/services/readiness.test.ts +87 -0
- package/src/services/readiness.ts +238 -0
- package/src/services/recommendation.d.ts +111 -0
- package/src/services/recommendation.d.ts.map +1 -0
- package/src/services/recommendation.ts +672 -0
- package/src/services/registry.d.ts +35 -0
- package/src/services/registry.d.ts.map +1 -0
- package/src/services/registry.ts +151 -0
- package/src/services/router-handler.d.ts +92 -0
- package/src/services/router-handler.d.ts.map +1 -0
- package/src/services/router-handler.test.ts +45 -0
- package/src/services/router-handler.ts +376 -0
- package/src/services/routing-policy.d.ts +55 -0
- package/src/services/routing-policy.d.ts.map +1 -0
- package/src/services/routing-policy.ts +228 -0
- package/src/services/routing-preferences.d.ts +8 -0
- package/src/services/routing-preferences.d.ts.map +1 -0
- package/src/services/routing-preferences.ts +15 -0
- package/src/services/runtime-target.d.ts +98 -0
- package/src/services/runtime-target.d.ts.map +1 -0
- package/src/services/runtime-target.ts +154 -0
- package/src/services/service.d.ts +128 -0
- package/src/services/service.d.ts.map +1 -0
- package/src/services/service.test.ts +223 -0
- package/src/services/service.ts +735 -0
- package/src/services/session-pool.d.ts +72 -0
- package/src/services/session-pool.d.ts.map +1 -0
- package/src/services/session-pool.ts +153 -0
- package/src/services/structured-output/deterministic-repair.d.ts +23 -0
- package/src/services/structured-output/deterministic-repair.d.ts.map +1 -0
- package/src/services/structured-output/deterministic-repair.test.ts +169 -0
- package/src/services/structured-output/deterministic-repair.ts +443 -0
- package/src/services/structured-output/index.ts +4 -0
- package/src/services/structured-output.d.ts +311 -0
- package/src/services/structured-output.d.ts.map +1 -0
- package/src/services/structured-output.test.ts +483 -0
- package/src/services/structured-output.ts +712 -0
- package/src/services/transcription-priority.test.ts +211 -0
- package/src/services/tts/errors.ts +46 -0
- package/src/services/tts/index.ts +214 -0
- package/src/services/tts/tts-audio-cache.ts +235 -0
- package/src/services/tts/types.ts +157 -0
- package/src/services/types.d.ts +19 -0
- package/src/services/types.d.ts.map +1 -0
- package/src/services/types.ts +55 -0
- package/src/services/verify-on-device.d.ts +34 -0
- package/src/services/verify-on-device.d.ts.map +1 -0
- package/src/services/verify-on-device.test.ts +87 -0
- package/src/services/verify-on-device.ts +127 -0
- package/src/services/verify.d.ts +8 -0
- package/src/services/verify.d.ts.map +1 -0
- package/src/services/verify.ts +13 -0
- package/src/services/vision/aosp-unavailable.d.ts +115 -0
- package/src/services/vision/aosp-unavailable.d.ts.map +1 -0
- package/src/services/vision/aosp-unavailable.ts +163 -0
- package/src/services/vision/capacitor-llama.d.ts +99 -0
- package/src/services/vision/capacitor-llama.d.ts.map +1 -0
- package/src/services/vision/capacitor-llama.ts +255 -0
- package/src/services/vision/cloud-fallback.d.ts +47 -0
- package/src/services/vision/cloud-fallback.d.ts.map +1 -0
- package/src/services/vision/cloud-fallback.test.ts +243 -0
- package/src/services/vision/cloud-fallback.ts +268 -0
- package/src/services/vision/fallback-chain.test.ts +86 -0
- package/src/services/vision/hash.d.ts +71 -0
- package/src/services/vision/hash.d.ts.map +1 -0
- package/src/services/vision/hash.ts +157 -0
- package/src/services/vision/index.d.ts +95 -0
- package/src/services/vision/index.d.ts.map +1 -0
- package/src/services/vision/index.ts +251 -0
- package/src/services/vision/llama-server.d.ts +73 -0
- package/src/services/vision/llama-server.d.ts.map +1 -0
- package/src/services/vision/llama-server.ts +177 -0
- package/src/services/vision/types.d.ts +153 -0
- package/src/services/vision/types.d.ts.map +1 -0
- package/src/services/vision/types.ts +154 -0
- package/src/services/vision/vast-fallback.d.ts +18 -0
- package/src/services/vision/vast-fallback.d.ts.map +1 -0
- package/src/services/vision/vast-fallback.ts +127 -0
- package/src/services/vision-embedding-cache.d.ts +98 -0
- package/src/services/vision-embedding-cache.d.ts.map +1 -0
- package/src/services/vision-embedding-cache.ts +189 -0
- package/src/services/voice/VOICE_WORKBENCH.md +88 -0
- package/src/services/voice/__test-helpers__/fake-ffi.ts +92 -0
- package/src/services/voice/__test-helpers__/synthetic-speech.ts +124 -0
- package/src/services/voice/__tests__/checkpoint-manager.test.ts +241 -0
- package/src/services/voice/__tests__/checkpoint-policy.test.ts +270 -0
- package/src/services/voice/__tests__/eager-context-builder.test.ts +257 -0
- package/src/services/voice/__tests__/eliza1-eot-scorer.test.ts +288 -0
- package/src/services/voice/__tests__/eot-classifier.test.ts +431 -0
- package/src/services/voice/__tests__/optimistic-rollback.test.ts +312 -0
- package/src/services/voice/__tests__/prefill-client.test.ts +266 -0
- package/src/services/voice/__tests__/prefix-preserving-queue.test.ts +208 -0
- package/src/services/voice/__tests__/streaming-asr.test.ts +450 -0
- package/src/services/voice/__tests__/streaming-transcriber.test.ts +339 -0
- package/src/services/voice/__tests__/turn-detector-resolver.test.ts +197 -0
- package/src/services/voice/__tests__/voice-state-machine-prefill.test.ts +275 -0
- package/src/services/voice/__tests__/voice-state-machine.test.ts +354 -0
- package/src/services/voice/audio-frame-consumer.d.ts +212 -0
- package/src/services/voice/audio-frame-consumer.d.ts.map +1 -0
- package/src/services/voice/audio-frame-consumer.test.ts +343 -0
- package/src/services/voice/audio-frame-consumer.ts +491 -0
- package/src/services/voice/barge-in.d.ts +112 -0
- package/src/services/voice/barge-in.d.ts.map +1 -0
- package/src/services/voice/barge-in.test.ts +244 -0
- package/src/services/voice/barge-in.ts +336 -0
- package/src/services/voice/cancellation-coordinator.d.ts +127 -0
- package/src/services/voice/cancellation-coordinator.d.ts.map +1 -0
- package/src/services/voice/cancellation-coordinator.test.ts +196 -0
- package/src/services/voice/cancellation-coordinator.ts +269 -0
- package/src/services/voice/checkpoint-manager.d.ts +199 -0
- package/src/services/voice/checkpoint-manager.d.ts.map +1 -0
- package/src/services/voice/checkpoint-manager.ts +401 -0
- package/src/services/voice/checkpoint-policy.ts +336 -0
- package/src/services/voice/composite-eot-classifier.test.ts +59 -0
- package/src/services/voice/e2e-harness.test.ts +182 -0
- package/src/services/voice/e2e-harness.ts +743 -0
- package/src/services/voice/eager-context-builder.d.ts +170 -0
- package/src/services/voice/eager-context-builder.d.ts.map +1 -0
- package/src/services/voice/eager-context-builder.ts +262 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts +124 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/eliza1-eot-scorer.ts +242 -0
- package/src/services/voice/embedding-server.ts +200 -0
- package/src/services/voice/embedding.d.ts +133 -0
- package/src/services/voice/embedding.d.ts.map +1 -0
- package/src/services/voice/embedding.test.ts +148 -0
- package/src/services/voice/embedding.ts +244 -0
- package/src/services/voice/emotion-attribution.d.ts +68 -0
- package/src/services/voice/emotion-attribution.d.ts.map +1 -0
- package/src/services/voice/emotion-attribution.test.ts +129 -0
- package/src/services/voice/emotion-attribution.ts +361 -0
- package/src/services/voice/engine-bridge-cancellation.test.ts +422 -0
- package/src/services/voice/engine-bridge.d.ts +746 -0
- package/src/services/voice/engine-bridge.d.ts.map +1 -0
- package/src/services/voice/engine-bridge.test.ts +384 -0
- package/src/services/voice/engine-bridge.ts +2226 -0
- package/src/services/voice/eot-classifier-ggml.d.ts +179 -0
- package/src/services/voice/eot-classifier-ggml.d.ts.map +1 -0
- package/src/services/voice/eot-classifier-ggml.ts +566 -0
- package/src/services/voice/eot-classifier.d.ts +214 -0
- package/src/services/voice/eot-classifier.d.ts.map +1 -0
- package/src/services/voice/eot-classifier.ts +533 -0
- package/src/services/voice/errors.d.ts +20 -0
- package/src/services/voice/errors.d.ts.map +1 -0
- package/src/services/voice/errors.ts +32 -0
- package/src/services/voice/expressive-tags.d.ts +158 -0
- package/src/services/voice/expressive-tags.d.ts.map +1 -0
- package/src/services/voice/expressive-tags.ts +405 -0
- package/src/services/voice/ffi-bindings.d.ts +636 -0
- package/src/services/voice/ffi-bindings.d.ts.map +1 -0
- package/src/services/voice/ffi-bindings.test.ts +671 -0
- package/src/services/voice/ffi-bindings.ts +3050 -0
- package/src/services/voice/first-line-cache.d.ts +181 -0
- package/src/services/voice/first-line-cache.d.ts.map +1 -0
- package/src/services/voice/first-line-cache.ts +725 -0
- package/src/services/voice/fused-eot-scorer.d.ts +51 -0
- package/src/services/voice/fused-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/fused-eot-scorer.ts +135 -0
- package/src/services/voice/index.d.ts +91 -0
- package/src/services/voice/index.d.ts.map +1 -0
- package/src/services/voice/index.ts +481 -0
- package/src/services/voice/kokoro/__tests__/kokoro-backend.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.real.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.test.ts +60 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-discovery.test.ts +277 -0
- package/src/services/voice/kokoro/__tests__/kokoro-ffi-runtime.test.ts +235 -0
- package/src/services/voice/kokoro/__tests__/kokoro-runtime.test.ts +95 -0
- package/src/services/voice/kokoro/__tests__/phonemizer.test.ts +53 -0
- package/src/services/voice/kokoro/__tests__/runtime-selection.test.ts +231 -0
- package/src/services/voice/kokoro/__tests__/voices.test.ts +57 -0
- package/src/services/voice/kokoro/index.ts +79 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts +72 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-backend.ts +207 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts +58 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.ts +177 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts +75 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.ts +233 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts +100 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-runtime.ts +170 -0
- package/src/services/voice/kokoro/phoneme-stream.ts +123 -0
- package/src/services/voice/kokoro/phonemizer.d.ts +50 -0
- package/src/services/voice/kokoro/phonemizer.d.ts.map +1 -0
- package/src/services/voice/kokoro/phonemizer.ts +344 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts +61 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/pick-runtime.test.ts +91 -0
- package/src/services/voice/kokoro/pick-runtime.ts +130 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts +92 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts.map +1 -0
- package/src/services/voice/kokoro/runtime-selection.ts +237 -0
- package/src/services/voice/kokoro/types.d.ts +82 -0
- package/src/services/voice/kokoro/types.d.ts.map +1 -0
- package/src/services/voice/kokoro/types.ts +95 -0
- package/src/services/voice/kokoro/voice-presets.d.ts +23 -0
- package/src/services/voice/kokoro/voice-presets.d.ts.map +1 -0
- package/src/services/voice/kokoro/voice-presets.ts +129 -0
- package/src/services/voice/kokoro/voices.d.ts +30 -0
- package/src/services/voice/kokoro/voices.d.ts.map +1 -0
- package/src/services/voice/kokoro/voices.ts +64 -0
- package/src/services/voice/lifecycle.d.ts +135 -0
- package/src/services/voice/lifecycle.d.ts.map +1 -0
- package/src/services/voice/lifecycle.test.ts +315 -0
- package/src/services/voice/lifecycle.ts +301 -0
- package/src/services/voice/live-diarization-session.d.ts +96 -0
- package/src/services/voice/live-diarization-session.d.ts.map +1 -0
- package/src/services/voice/live-diarization-session.ts +289 -0
- package/src/services/voice/mic-source.d.ts +136 -0
- package/src/services/voice/mic-source.d.ts.map +1 -0
- package/src/services/voice/mic-source.test.ts +210 -0
- package/src/services/voice/mic-source.ts +503 -0
- package/src/services/voice/optimistic-policy.d.ts +109 -0
- package/src/services/voice/optimistic-policy.d.ts.map +1 -0
- package/src/services/voice/optimistic-policy.test.ts +101 -0
- package/src/services/voice/optimistic-policy.ts +192 -0
- package/src/services/voice/optimistic-rollback.ts +343 -0
- package/src/services/voice/partial-stabilizer.d.ts +73 -0
- package/src/services/voice/partial-stabilizer.d.ts.map +1 -0
- package/src/services/voice/partial-stabilizer.test.ts +68 -0
- package/src/services/voice/partial-stabilizer.ts +140 -0
- package/src/services/voice/phoneme-tokenizer.d.ts +49 -0
- package/src/services/voice/phoneme-tokenizer.d.ts.map +1 -0
- package/src/services/voice/phoneme-tokenizer.ts +158 -0
- package/src/services/voice/phrase-cache.d.ts +76 -0
- package/src/services/voice/phrase-cache.d.ts.map +1 -0
- package/src/services/voice/phrase-cache.test.ts +242 -0
- package/src/services/voice/phrase-cache.ts +186 -0
- package/src/services/voice/phrase-chunker.d.ts +62 -0
- package/src/services/voice/phrase-chunker.d.ts.map +1 -0
- package/src/services/voice/phrase-chunker.test.ts +239 -0
- package/src/services/voice/phrase-chunker.ts +281 -0
- package/src/services/voice/pipeline-impls.d.ts +151 -0
- package/src/services/voice/pipeline-impls.d.ts.map +1 -0
- package/src/services/voice/pipeline-impls.l6.test.ts +110 -0
- package/src/services/voice/pipeline-impls.test.ts +292 -0
- package/src/services/voice/pipeline-impls.ts +315 -0
- package/src/services/voice/pipeline.d.ts +216 -0
- package/src/services/voice/pipeline.d.ts.map +1 -0
- package/src/services/voice/pipeline.ts +505 -0
- package/src/services/voice/prefill-client.d.ts +123 -0
- package/src/services/voice/prefill-client.d.ts.map +1 -0
- package/src/services/voice/prefill-client.ts +316 -0
- package/src/services/voice/prefix-preserving-queue.d.ts +113 -0
- package/src/services/voice/prefix-preserving-queue.d.ts.map +1 -0
- package/src/services/voice/prefix-preserving-queue.ts +162 -0
- package/src/services/voice/profile-store.d.ts +248 -0
- package/src/services/voice/profile-store.d.ts.map +1 -0
- package/src/services/voice/profile-store.ts +887 -0
- package/src/services/voice/ring-buffer.d.ts +40 -0
- package/src/services/voice/ring-buffer.d.ts.map +1 -0
- package/src/services/voice/ring-buffer.ts +105 -0
- package/src/services/voice/rollback-queue.d.ts +24 -0
- package/src/services/voice/rollback-queue.d.ts.map +1 -0
- package/src/services/voice/rollback-queue.ts +74 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts +67 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-placeholder.test.ts +97 -0
- package/src/services/voice/samantha-preset-placeholder.ts +148 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts +87 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-regenerator.ts +393 -0
- package/src/services/voice/scheduler.d.ts +146 -0
- package/src/services/voice/scheduler.d.ts.map +1 -0
- package/src/services/voice/scheduler.t2.test.ts +141 -0
- package/src/services/voice/scheduler.ts +927 -0
- package/src/services/voice/shared-resources.d.ts +190 -0
- package/src/services/voice/shared-resources.d.ts.map +1 -0
- package/src/services/voice/shared-resources.ts +320 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts +74 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts.map +1 -0
- package/src/services/voice/speaker/attribution-pipeline.ts +386 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts +59 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer-fused.real.test.ts +100 -0
- package/src/services/voice/speaker/diarizer-fused.ts +154 -0
- package/src/services/voice/speaker/diarizer.d.ts +75 -0
- package/src/services/voice/speaker/diarizer.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer.ts +218 -0
- package/src/services/voice/speaker/encoder-fused.d.ts +60 -0
- package/src/services/voice/speaker/encoder-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-fused.real.test.ts +113 -0
- package/src/services/voice/speaker/encoder-fused.ts +138 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts +33 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-ggml.ts +79 -0
- package/src/services/voice/speaker/encoder.d.ts +37 -0
- package/src/services/voice/speaker/encoder.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder.ts +105 -0
- package/src/services/voice/speaker-imprint.d.ts +83 -0
- package/src/services/voice/speaker-imprint.d.ts.map +1 -0
- package/src/services/voice/speaker-imprint.test.ts +185 -0
- package/src/services/voice/speaker-imprint.ts +312 -0
- package/src/services/voice/speaker-preset-cache.d.ts +77 -0
- package/src/services/voice/speaker-preset-cache.d.ts.map +1 -0
- package/src/services/voice/speaker-preset-cache.test.ts +154 -0
- package/src/services/voice/speaker-preset-cache.ts +195 -0
- package/src/services/voice/streaming-asr/streaming-pipeline-adapter.ts +292 -0
- package/src/services/voice/system-audio-sink.d.ts +73 -0
- package/src/services/voice/system-audio-sink.d.ts.map +1 -0
- package/src/services/voice/system-audio-sink.test.ts +29 -0
- package/src/services/voice/system-audio-sink.ts +366 -0
- package/src/services/voice/transcriber.d.ts +244 -0
- package/src/services/voice/transcriber.d.ts.map +1 -0
- package/src/services/voice/transcriber.test.ts +392 -0
- package/src/services/voice/transcriber.ts +704 -0
- package/src/services/voice/turn-controller.d.ts +183 -0
- package/src/services/voice/turn-controller.d.ts.map +1 -0
- package/src/services/voice/turn-controller.test.ts +575 -0
- package/src/services/voice/turn-controller.ts +596 -0
- package/src/services/voice/types.d.ts +643 -0
- package/src/services/voice/types.d.ts.map +1 -0
- package/src/services/voice/types.ts +699 -0
- package/src/services/voice/vad.d.ts +282 -0
- package/src/services/voice/vad.d.ts.map +1 -0
- package/src/services/voice/vad.test.ts +480 -0
- package/src/services/voice/vad.ts +827 -0
- package/src/services/voice/vad.v1-v4.test.ts +222 -0
- package/src/services/voice/voice-budget.d.ts +241 -0
- package/src/services/voice/voice-budget.d.ts.map +1 -0
- package/src/services/voice/voice-budget.test.ts +420 -0
- package/src/services/voice/voice-budget.ts +656 -0
- package/src/services/voice/voice-duet.test.ts +375 -0
- package/src/services/voice/voice-emotion-classifier.d.ts +95 -0
- package/src/services/voice/voice-emotion-classifier.d.ts.map +1 -0
- package/src/services/voice/voice-emotion-classifier.test.ts +210 -0
- package/src/services/voice/voice-emotion-classifier.ts +273 -0
- package/src/services/voice/voice-preset-format.d.ts +158 -0
- package/src/services/voice/voice-preset-format.d.ts.map +1 -0
- package/src/services/voice/voice-preset-format.ts +700 -0
- package/src/services/voice/voice-preset-generator.test.ts +89 -0
- package/src/services/voice/voice-profile-artifact.d.ts +116 -0
- package/src/services/voice/voice-profile-artifact.d.ts.map +1 -0
- package/src/services/voice/voice-profile-artifact.test.ts +138 -0
- package/src/services/voice/voice-profile-artifact.ts +518 -0
- package/src/services/voice/voice-profile-routes.d.ts +83 -0
- package/src/services/voice/voice-profile-routes.d.ts.map +1 -0
- package/src/services/voice/voice-profile-routes.test.ts +429 -0
- package/src/services/voice/voice-profile-routes.ts +425 -0
- package/src/services/voice/voice-scenario.ts +154 -0
- package/src/services/voice/voice-settings.d.ts +82 -0
- package/src/services/voice/voice-settings.d.ts.map +1 -0
- package/src/services/voice/voice-settings.ts +172 -0
- package/src/services/voice/voice-state-machine.d.ts +364 -0
- package/src/services/voice/voice-state-machine.d.ts.map +1 -0
- package/src/services/voice/voice-state-machine.ts +727 -0
- package/src/services/voice/voice-workbench-report.test.ts +168 -0
- package/src/services/voice/voice-workbench-report.ts +326 -0
- package/src/services/voice/voice-workbench.test.ts +158 -0
- package/src/services/voice/voice.test.ts +1070 -0
- package/src/services/voice/wake-word-ggml.d.ts +101 -0
- package/src/services/voice/wake-word-ggml.d.ts.map +1 -0
- package/src/services/voice/wake-word-ggml.ts +320 -0
- package/src/services/voice/wake-word.d.ts +255 -0
- package/src/services/voice/wake-word.d.ts.map +1 -0
- package/src/services/voice/wake-word.test.ts +298 -0
- package/src/services/voice/wake-word.ts +554 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts +70 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts.map +1 -0
- package/src/services/voice/wrap-with-first-line-cache.ts +267 -0
- package/src/services/voice-model-updater.d.ts +240 -0
- package/src/services/voice-model-updater.d.ts.map +1 -0
- package/src/services/voice-model-updater.ts +724 -0
- package/src/services/voice-prewarm.d.ts +3 -0
- package/src/services/voice-prewarm.d.ts.map +1 -0
- package/src/services/voice-prewarm.ts +51 -0
- package/dist/index.d.ts +0 -37
- package/dist/index.js +0 -1098
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content-hashed cache for projected vision-language tokens (WS1 deliverable).
|
|
3
|
+
*
|
|
4
|
+
* Vision models in the Eliza-1 stack (Qwen3-VL, Florence-2, Apothic-VL) all
|
|
5
|
+
* go through the same expensive projector step: raw pixel
|
|
6
|
+
* bytes → patch embeddings → projector → tokens that the text decoder
|
|
7
|
+
* actually consumes. When the user pastes the same screenshot three times
|
|
8
|
+
* in a row, or when computer-use takes near-duplicate frames of an idle
|
|
9
|
+
* screen, we want to skip the projector entirely and reuse the cached
|
|
10
|
+
* tokens.
|
|
11
|
+
*
|
|
12
|
+
* Contract:
|
|
13
|
+
* - Caller computes a stable hash of the *normalized* input bytes
|
|
14
|
+
* (downscaled to the model's input resolution, then SHA-256 of the
|
|
15
|
+
* packed pixels). The hash is the cache key.
|
|
16
|
+
* - Caller pairs the hash with the projected token tensor (a flat
|
|
17
|
+
`Float32Array` of length `tokenCount * hiddenSize`) AND the geometry
|
|
18
|
+
`{ tokenCount, hiddenSize }` so a reader can reshape on the way out.
|
|
19
|
+
* - `get(hash)` returns `null` on miss or expiry, the entry on hit.
|
|
20
|
+
* A hit also "touches" the entry to keep it warm under LRU.
|
|
21
|
+
* - `set(hash, entry, ttlMs?)` inserts with a TTL (default 5 min); if
|
|
22
|
+
* the LRU is full, the coldest entry is evicted.
|
|
23
|
+
*
|
|
24
|
+
* Why a separate module:
|
|
25
|
+
* - The arbiter owns the *model handle*; the cache holds *per-input
|
|
26
|
+
* projected tokens* that survive across model loads/unloads of the
|
|
27
|
+
* same family. Keeping the cache in a sibling module lets the vision
|
|
28
|
+
* plugin reuse it even when the arbiter swapped the underlying model
|
|
29
|
+
* for memory pressure (the projector tokens are still valid as long
|
|
30
|
+
* as the model family + hash match — we encode the family in the key
|
|
31
|
+
* to be safe).
|
|
32
|
+
*
|
|
33
|
+
* What this is NOT:
|
|
34
|
+
* - A blob cache for the encoder *weights*. Those live in mmap regions
|
|
35
|
+
* owned by the arbiter / SharedResourceRegistry and are evicted via
|
|
36
|
+
* `MmapRegionHandle.evictPages()`.
|
|
37
|
+
* - A cache for downstream LLM generations. Prefix-cache for text is
|
|
38
|
+
* handled by `cache-bridge.ts` and the backend session pool.
|
|
39
|
+
*/
|
|
40
|
+
export interface VisionEmbeddingEntry {
|
|
41
|
+
/** Flat row-major buffer: `tokenCount * hiddenSize` floats. */
|
|
42
|
+
tokens: Float32Array;
|
|
43
|
+
tokenCount: number;
|
|
44
|
+
hiddenSize: number;
|
|
45
|
+
/** True when this entry is still within its TTL. */
|
|
46
|
+
live: boolean;
|
|
47
|
+
}
|
|
48
|
+
export interface VisionEmbeddingCacheConfig {
|
|
49
|
+
/** Max entries retained. LRU evicts beyond this. Default 32. */
|
|
50
|
+
maxEntries: number;
|
|
51
|
+
/** Default TTL when `set()` is called without one. Default 5 min. */
|
|
52
|
+
defaultTtlMs: number;
|
|
53
|
+
}
|
|
54
|
+
export declare class VisionEmbeddingCache {
|
|
55
|
+
private readonly config;
|
|
56
|
+
/**
|
|
57
|
+
* `Map` preserves insertion order; we re-insert on hit to bubble entries
|
|
58
|
+
* to the back, so the first key in iteration order is the LRU candidate.
|
|
59
|
+
*/
|
|
60
|
+
private readonly entries;
|
|
61
|
+
private readonly now;
|
|
62
|
+
constructor(opts?: {
|
|
63
|
+
config?: Partial<VisionEmbeddingCacheConfig>;
|
|
64
|
+
now?: () => number;
|
|
65
|
+
});
|
|
66
|
+
/**
|
|
67
|
+
* Lookup. Returns the entry on hit (and refreshes LRU position), or null
|
|
68
|
+
* on miss / expiry. Expired entries are deleted on read so they don't
|
|
69
|
+
* silently consume the LRU budget.
|
|
70
|
+
*/
|
|
71
|
+
get(hash: string): VisionEmbeddingEntry | null;
|
|
72
|
+
/**
|
|
73
|
+
* Insert. Replaces any existing entry under the same hash. Evicts the
|
|
74
|
+
* coldest entry if we're at capacity. `ttlMs` overrides the configured
|
|
75
|
+
* default; omit it or pass a value <= 0 to use the default.
|
|
76
|
+
*/
|
|
77
|
+
set(hash: string, entry: {
|
|
78
|
+
tokens: Float32Array;
|
|
79
|
+
tokenCount: number;
|
|
80
|
+
hiddenSize: number;
|
|
81
|
+
}, ttlMs?: number): void;
|
|
82
|
+
/** Diagnostic: current entry count. */
|
|
83
|
+
size(): number;
|
|
84
|
+
/** Diagnostic: snapshot of (hash, bytes, expiresAtMs) for each entry. */
|
|
85
|
+
snapshot(): ReadonlyArray<{
|
|
86
|
+
hash: string;
|
|
87
|
+
bytes: number;
|
|
88
|
+
expiresAtMs: number;
|
|
89
|
+
}>;
|
|
90
|
+
/** Drop everything. Cheap; only releases JS-side refs to the Float32Arrays. */
|
|
91
|
+
clear(): void;
|
|
92
|
+
/**
|
|
93
|
+
* Drop entries whose TTL has expired. Returns the number removed. Cheap
|
|
94
|
+
* to call from the arbiter's pressure tick.
|
|
95
|
+
*/
|
|
96
|
+
purgeExpired(nowMs?: number): number;
|
|
97
|
+
}
|
|
98
|
+
//# sourceMappingURL=vision-embedding-cache.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vision-embedding-cache.d.ts","sourceRoot":"","sources":["vision-embedding-cache.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AASH,MAAM,WAAW,oBAAoB;IACpC,+DAA+D;IAC/D,MAAM,EAAE,YAAY,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,oDAAoD;IACpD,IAAI,EAAE,OAAO,CAAC;CACd;AAED,MAAM,WAAW,0BAA0B;IAC1C,gEAAgE;IAChE,UAAU,EAAE,MAAM,CAAC;IACnB,qEAAqE;IACrE,YAAY,EAAE,MAAM,CAAC;CACrB;AAOD,qBAAa,oBAAoB;IAChC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAA6B;IACpD;;;OAGG;IACH,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAiC;IACzD,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAe;gBAGlC,IAAI,GAAE;QACL,MAAM,CAAC,EAAE,OAAO,CAAC,0BAA0B,CAAC,CAAC;QAC7C,GAAG,CAAC,EAAE,MAAM,MAAM,CAAC;KACd;IAYP;;;;OAIG;IACH,GAAG,CAAC,IAAI,EAAE,MAAM,GAAG,oBAAoB,GAAG,IAAI;IAkB9C;;;;OAIG;IACH,GAAG,CACF,IAAI,EAAE,MAAM,EACZ,KAAK,EAAE;QAAE,MAAM,EAAE,YAAY,CAAC;QAAC,UAAU,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAA;KAAE,EACvE,KAAK,CAAC,EAAE,MAAM,GACZ,IAAI;IAsBP,uCAAuC;IACvC,IAAI,IAAI,MAAM;IAId,4EAA4E;IAC5E,QAAQ,IAAI,aAAa,CAAC;QACzB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;KACpB,CAAC;IAYF,+EAA+E;IAC/E,KAAK,IAAI,IAAI;IAIb;;;OAGG;IACH,YAAY,CAAC,KAAK,GAAE,MAAmB,GAAG,MAAM;CAUhD"}
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content-hashed cache for projected vision-language tokens (WS1 deliverable).
|
|
3
|
+
*
|
|
4
|
+
* Vision models in the Eliza-1 stack (Qwen3-VL, Florence-2, Apothic-VL) all
|
|
5
|
+
* go through the same expensive projector step: raw pixel
|
|
6
|
+
* bytes → patch embeddings → projector → tokens that the text decoder
|
|
7
|
+
* actually consumes. When the user pastes the same screenshot three times
|
|
8
|
+
* in a row, or when computer-use takes near-duplicate frames of an idle
|
|
9
|
+
* screen, we want to skip the projector entirely and reuse the cached
|
|
10
|
+
* tokens.
|
|
11
|
+
*
|
|
12
|
+
* Contract:
|
|
13
|
+
* - Caller computes a stable hash of the *normalized* input bytes
|
|
14
|
+
* (downscaled to the model's input resolution, then SHA-256 of the
|
|
15
|
+
* packed pixels). The hash is the cache key.
|
|
16
|
+
* - Caller pairs the hash with the projected token tensor (a flat
|
|
17
|
+
`Float32Array` of length `tokenCount * hiddenSize`) AND the geometry
|
|
18
|
+
`{ tokenCount, hiddenSize }` so a reader can reshape on the way out.
|
|
19
|
+
* - `get(hash)` returns `null` on miss or expiry, the entry on hit.
|
|
20
|
+
* A hit also "touches" the entry to keep it warm under LRU.
|
|
21
|
+
* - `set(hash, entry, ttlMs?)` inserts with a TTL (default 5 min); if
|
|
22
|
+
* the LRU is full, the coldest entry is evicted.
|
|
23
|
+
*
|
|
24
|
+
* Why a separate module:
|
|
25
|
+
* - The arbiter owns the *model handle*; the cache holds *per-input
|
|
26
|
+
* projected tokens* that survive across model loads/unloads of the
|
|
27
|
+
* same family. Keeping the cache in a sibling module lets the vision
|
|
28
|
+
* plugin reuse it even when the arbiter swapped the underlying model
|
|
29
|
+
* for memory pressure (the projector tokens are still valid as long
|
|
30
|
+
* as the model family + hash match — we encode the family in the key
|
|
31
|
+
* to be safe).
|
|
32
|
+
*
|
|
33
|
+
* What this is NOT:
|
|
34
|
+
* - A blob cache for the encoder *weights*. Those live in mmap regions
|
|
35
|
+
* owned by the arbiter / SharedResourceRegistry and are evicted via
|
|
36
|
+
* `MmapRegionHandle.evictPages()`.
|
|
37
|
+
* - A cache for downstream LLM generations. Prefix-cache for text is
|
|
38
|
+
* handled by `cache-bridge.ts` and the backend session pool.
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
/** Internal storage record: the cached tensor, its geometry, and its deadline. */
interface CacheEntry {
	tokens: Float32Array;
	tokenCount: number;
	hiddenSize: number;
	/** Absolute deadline on the cache clock (ms); expired once `now() >= expiresAtMs`. */
	expiresAtMs: number;
}

export interface VisionEmbeddingEntry {
	/** Flat row-major buffer: `tokenCount * hiddenSize` floats. */
	tokens: Float32Array;
	tokenCount: number;
	hiddenSize: number;
	/** True when this entry is still within its TTL. */
	live: boolean;
}

export interface VisionEmbeddingCacheConfig {
	/** Max entries retained. LRU evicts beyond this. Default 32. */
	maxEntries: number;
	/** Default TTL when `set()` is called without one. Default 5 min. */
	defaultTtlMs: number;
}

const DEFAULTS: VisionEmbeddingCacheConfig = {
	maxEntries: 32,
	defaultTtlMs: 5 * 60_000,
};

/**
 * Pick a numeric config override, falling back when it is absent or `NaN`.
 *
 * `Math.max(1, NaN)` is `NaN`, and every comparison against `NaN` is false,
 * so an unchecked `NaN` would silently disable eviction / expiry.
 */
function resolveConfigNumber(
	value: number | null | undefined,
	fallback: number,
): number {
	return value == null || Number.isNaN(value) ? fallback : value;
}

export class VisionEmbeddingCache {
	private readonly config: VisionEmbeddingCacheConfig;
	/**
	 * `Map` preserves insertion order; we re-insert on hit to bubble entries
	 * to the back, so the first key in iteration order is the LRU candidate.
	 */
	private readonly entries = new Map<string, CacheEntry>();
	private readonly now: () => number;

	/**
	 * @param opts.config Partial overrides for {@link VisionEmbeddingCacheConfig}.
	 *   `maxEntries` is floored and clamped to at least 1; `defaultTtlMs` is
	 *   clamped to at least 0. Missing or `NaN` values fall back to the defaults.
	 * @param opts.now Clock returning milliseconds; defaults to `Date.now`.
	 */
	constructor(
		opts: {
			config?: Partial<VisionEmbeddingCacheConfig>;
			now?: () => number;
		} = {},
	) {
		const maxEntries = resolveConfigNumber(
			opts.config?.maxEntries,
			DEFAULTS.maxEntries,
		);
		const defaultTtlMs = resolveConfigNumber(
			opts.config?.defaultTtlMs,
			DEFAULTS.defaultTtlMs,
		);
		this.config = {
			// Entry counts are integers, so flooring keeps the same effective
			// bound a fractional limit already had.
			maxEntries: Math.max(1, Math.floor(maxEntries)),
			defaultTtlMs: Math.max(0, defaultTtlMs),
		};
		this.now = opts.now ?? (() => Date.now());
	}

	/**
	 * Lookup. Returns the entry on hit (and refreshes LRU position), or null
	 * on miss / expiry. Expired entries are deleted on read so they don't
	 * silently consume the LRU budget.
	 *
	 * The returned `tokens` is the stored `Float32Array` itself, not a copy.
	 */
	get(hash: string): VisionEmbeddingEntry | null {
		const found = this.entries.get(hash);
		if (!found) return null;
		if (found.expiresAtMs <= this.now()) {
			this.entries.delete(hash);
			return null;
		}
		// Touch — re-insert so it moves to the back of the iteration order.
		this.entries.delete(hash);
		this.entries.set(hash, found);
		return {
			tokens: found.tokens,
			tokenCount: found.tokenCount,
			hiddenSize: found.hiddenSize,
			live: true,
		};
	}

	/**
	 * Insert. Replaces any existing entry under the same hash. Evicts the
	 * coldest entry if we're at capacity. `ttlMs` overrides the configured
	 * default; omit it or pass a non-positive value to use the default.
	 *
	 * @throws Error when `entry.tokens.length` differs from
	 *   `entry.tokenCount * entry.hiddenSize`.
	 */
	set(
		hash: string,
		entry: { tokens: Float32Array; tokenCount: number; hiddenSize: number },
		ttlMs?: number,
	): void {
		if (entry.tokens.length !== entry.tokenCount * entry.hiddenSize) {
			throw new Error(
				`[vision-embedding-cache] token buffer length ${entry.tokens.length} does not match tokenCount*hiddenSize (${entry.tokenCount}*${entry.hiddenSize})`,
			);
		}
		const ttl =
			ttlMs !== undefined && ttlMs > 0 ? ttlMs : this.config.defaultTtlMs;
		const expiresAtMs = this.now() + ttl;
		// Delete first so a replaced hash moves to the most-recent position.
		this.entries.delete(hash);
		this.entries.set(hash, {
			tokens: entry.tokens,
			tokenCount: entry.tokenCount,
			hiddenSize: entry.hiddenSize,
			expiresAtMs,
		});
		while (this.entries.size > this.config.maxEntries) {
			const firstKey = this.entries.keys().next().value;
			if (firstKey === undefined) break;
			this.entries.delete(firstKey);
		}
	}

	/** Diagnostic: current entry count. */
	size(): number {
		return this.entries.size;
	}

	/** Diagnostic: snapshot of (hash, bytes, expiresAtMs) for each entry, coldest first. */
	snapshot(): ReadonlyArray<{
		hash: string;
		bytes: number;
		expiresAtMs: number;
	}> {
		const out: { hash: string; bytes: number; expiresAtMs: number }[] = [];
		for (const [hash, entry] of this.entries) {
			out.push({
				hash,
				bytes: entry.tokens.byteLength,
				expiresAtMs: entry.expiresAtMs,
			});
		}
		return out;
	}

	/** Drop everything. Cheap; only releases JS-side refs to the Float32Arrays. */
	clear(): void {
		this.entries.clear();
	}

	/**
	 * Drop entries whose TTL has expired. Returns the number removed. Cheap
	 * to call from the arbiter's pressure tick.
	 *
	 * @param nowMs Timestamp to compare deadlines against; defaults to the
	 *   cache clock.
	 */
	purgeExpired(nowMs: number = this.now()): number {
		let removed = 0;
		for (const [hash, entry] of this.entries) {
			if (entry.expiresAtMs <= nowMs) {
				this.entries.delete(hash);
				removed++;
			}
		}
		return removed;
	}
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# Voice Workbench
|
|
2
|
+
|
|
3
|
+
Tracking issue: [elizaOS/eliza#8785](https://github.com/elizaOS/eliza/issues/8785).
|
|
4
|
+
|
|
5
|
+
elizaOS ships a mature voice pipeline (VAD, streaming ASR, EOT classifier,
|
|
6
|
+
barge-in, diarization, speaker imprint/profiles, Kokoro/OmniVoice TTS) but its
|
|
7
|
+
test harnesses were **fragmented** across five families with no shared scenario
|
|
8
|
+
format, no shared corpus, divergent metric definitions, and a headful surface
|
|
9
|
+
that only covered a single-speaker, single-turn round-trip. The Voice Workbench
|
|
10
|
+
unifies them onto **one scenario format, one metric module, and one report**.
|
|
11
|
+
|
|
12
|
+
## Status
|
|
13
|
+
|
|
14
|
+
This directory holds the **pure, framework-level foundation** — the parts that
|
|
15
|
+
can be implemented, tested, and shipped without an audio corpus, native models,
|
|
16
|
+
or a browser. The execution runners that actually drive real services/audio are
|
|
17
|
+
intentionally **gated** (they need a provisioned Eliza-1 local backend + a
|
|
18
|
+
synthesized corpus) and are listed under *Remaining* below.
|
|
19
|
+
|
|
20
|
+
### Implemented (this directory, unit-tested, no native artifacts)
|
|
21
|
+
|
|
22
|
+
| Piece | File | What it is |
|
|
23
|
+
| --- | --- | --- |
|
|
24
|
+
| **Scenario schema** | `voice-scenario.ts` | The declarative `VoiceScenario` format: named `participants` (voice→entity), ordered `turns` (`expectRespond`, `expectedTranscript`, `expectedSpeakerLabel`, `expectedEntity`, `pausesMs`), scenario `assertions` (WER/DER/EOT/latency ceilings), and `classes`. Pure `validateVoiceScenario` reports every consistency error at once. |
|
|
25
|
+
| **Metric module (single source of truth)** | `e2e-harness.ts` | All voice scoring lives here. WER is delegated to `@elizaos/shared/voice-wer` (one definition for headless + headful). Added scorers: `scoreEotDecision` (latency p50/p95 + false-trigger/false-suppression rate), `scoreRespondDecision` (FP/FN split), `scoreDiarization` (DER + confusions/misses), `scoreEntityExtraction` (precision/recall/F1), `scoreVoiceEntityMatch` (recognized-voice→entity accuracy). |
|
|
26
|
+
| **Benchmark report** | `voice-workbench-report.ts` | `buildVoiceWorkbenchReport` rolls a matrix of per-scenario scorer results into one gating report (per-metric mean/worst + percentiles, per-scenario verdict). `formatVoiceWorkbenchMarkdown` renders it; `regressionsAgainstBaseline` flags metrics that worsened past a tolerance. |
|
|
27
|
+
| **WER consolidation** | `@elizaos/shared/voice-wer` | The previously-duplicated `wordErrorRate` (`e2e-harness.ts` **and** `voice-selftest-harness.ts`, with subtly different normalization) is now defined once — Unicode-aware, contraction-preserving — and imported by both. |
|
|
28
|
+
|
|
29
|
+
Tests: `voice-workbench.test.ts`, `voice-workbench-report.test.ts`,
|
|
30
|
+
`e2e-harness.test.ts`.
|
|
31
|
+
|
|
32
|
+
### Honesty contract
|
|
33
|
+
|
|
34
|
+
A scenario whose corpus/backend artifacts are absent is reported `skipped`,
|
|
35
|
+
**never `pass`** — matching the existing self-test contract. A workbench report
|
|
36
|
+
is `skipped` overall only when *every* scenario was skipped; one ran-and-failed
|
|
37
|
+
scenario makes the whole report `fail`.
|
|
38
|
+
|
|
39
|
+
## Execution modes (the three the schema feeds)
|
|
40
|
+
|
|
41
|
+
1. **Headless** — feed corpus audio through the real services without a browser:
|
|
42
|
+
`/api/asr/local-inference`, `LiveDiarizationSession` / `/api/voice/audio-frames`,
|
|
43
|
+
the `ELIZA_VOICE_EOT_BACKEND` classifier, respond/room decisions over a real
|
|
44
|
+
`AgentRuntime` (scenario-runner PGLite boot), `VOICE_TURN_OBSERVED` /
|
|
45
|
+
`VOICE_ENTITY_BOUND` / `IDENTIFY_SPEAKER`, and `/api/tts/local-inference`.
|
|
46
|
+
2. **Headful** — extend `VoiceSelfTestShell` (`packages/ui/src/voice/voice-selftest/`)
|
|
47
|
+
from a single-turn self-test into a scenario player that drives the real
|
|
48
|
+
client pipeline (capture → ASR → SSE → TTS → playback) turn-by-turn, with
|
|
49
|
+
per-turn machine-readable + DOM-mirrored verdicts.
|
|
50
|
+
3. **Benchmark/report** — a single `voice:workbench` entrypoint that runs the
|
|
51
|
+
matrix in both modes and rolls up via `voice-workbench-report.ts` into one
|
|
52
|
+
JSON + Markdown report with regression baselines.
|
|
53
|
+
|
|
54
|
+
All three consume the **same** `VoiceScenario` and the **same** scorers, so a
|
|
55
|
+
metric is defined exactly once regardless of where the audio is driven.
|
|
56
|
+
|
|
57
|
+
## Consolidation map (what converges here)
|
|
58
|
+
|
|
59
|
+
The workbench is the convergence point for these previously-disjoint harnesses:
|
|
60
|
+
|
|
61
|
+
| Legacy harness | Convergence |
|
|
62
|
+
| --- | --- |
|
|
63
|
+
| `e2e-harness.ts:wordErrorRate` + `voice-selftest-harness.ts:wordErrorRate` | **Done** — one `@elizaos/shared/voice-wer`. |
|
|
64
|
+
| Pure scoring lib (`e2e-harness.ts`) | **Promoted** to the single metric module (EOT/diarization/respond/entity scorers added). |
|
|
65
|
+
| `packages/app-core/scripts/voice-duet.mjs` (`voice:duet`), `voice-e2e-hardware.ts`, `voice-vad-smoke.ts`, `voice-attribution-smoke.ts`, `lib/duet-bridge.mjs` | Feed measurements into the shared scorers + report (absorption planned). |
|
|
66
|
+
| `packages/benchmarks/voice/three-voice-scenario.mjs`, `three-voice-e2e-real.mjs` | Corpus-generation precedent the `VoiceScenario` corpus generator extends (planned). |
|
|
67
|
+
| `packages/benchmarks/voicebench/` (TS latency p95/p99) | The report layer mirrors its p95/p99 shape; remains a research bench linked from the workbench. |
|
|
68
|
+
| Per-spec inline `tinyWav()` fixtures (`packages/app/test/ui-smoke/voice-*.spec.ts`) | Replaced by the versioned corpus (planned). |
|
|
69
|
+
|
|
70
|
+
## Remaining (gated — needs corpus + real backend)
|
|
71
|
+
|
|
72
|
+
These are tracked on #8785 and are **not** stubbed here (no LARP):
|
|
73
|
+
|
|
74
|
+
- **Corpus generator + versioned labeled corpus** — TTS-synthesize each turn,
|
|
75
|
+
splice pauses, mix multi-speaker streams; persist labeled WAV + ground-truth
|
|
76
|
+
JSON. Needs the real TTS routes / Kokoro voices. (`__test-helpers__/synthetic-speech.ts`
|
|
77
|
+
is the synthesis seed.)
|
|
78
|
+
- **Headless runner** — wire the scenario through the real ASR/diarization/EOT/
|
|
79
|
+
respond/entity/TTS services + `AgentRuntime`.
|
|
80
|
+
- **scenario-runner audio turn kind** — add an `audio`/`voice` `ScenarioTurnExecution`
|
|
81
|
+
so voice scenarios become first-class `.scenario.ts` files.
|
|
82
|
+
- **Headful scenario player** — `VoiceSelfTestShell` → multi-turn player +
|
|
83
|
+
`packages/app/test/ui-smoke/voice-workbench-*.spec.ts` per scenario class.
|
|
84
|
+
- **`voice:workbench` entrypoint + CI lane** — run the matrix, emit the report
|
|
85
|
+
(`buildVoiceWorkbenchReport`), `skipped` (never `pass`) when artifacts absent.
|
|
86
|
+
- **Multi-agent room semantics** — the canonical ≥3-participant "who responds"
|
|
87
|
+
contract (an open question on the issue) must be settled before the workbench
|
|
88
|
+
can assert against it rather than inventing a rule.
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test-only `ElizaInferenceFfi` stand-in. Only the methods the voice
|
|
3
|
+
* pipeline exercises are non-trivial: `asrTranscribe` returns the supplied
|
|
4
|
+
* fixed transcript; `ttsSynthesize` writes a constant number of samples;
|
|
5
|
+
* `ttsSynthesizeStream` emits the same PCM as two chunks (one body + one
|
|
6
|
+
* `isFinal` tail) and honours `onChunk` returning `true` as a cancel.
|
|
7
|
+
* The ABI-v2 streaming-ASR symbols report "no working decoder" by
|
|
8
|
+
* default (the same as the C unsupported-build path) so the pipeline routes through the v1
|
|
9
|
+
* batch path unless a test opts into `asrStreamSupported`. Everything
|
|
10
|
+
* else is a no-op / identity so a test can wire a "fused" FFI without a
|
|
11
|
+
* real `.dylib`.
|
|
12
|
+
*/
|
|
13
|
+
import type { ElizaInferenceFfi, TtsStreamChunk } from "../ffi-bindings";
|
|
14
|
+
|
|
15
|
+
/**
 * Build an in-memory `ElizaInferenceFfi` double for voice-pipeline tests.
 *
 * @param transcript Text returned by `asrTranscribe` and, wrapped as
 *   `{ partial }`, by `asrStreamPartial` / `asrStreamFinish`.
 * @param opts Optional knobs. Every `*Supported` flag defaults to `false`
 *   except `ttsStreamSupported`, which defaults to `true`.
 * @returns An object whose `*Open` / `create` calls return fixed bigint
 *   handles and whose lifecycle methods are no-ops.
 */
export function fakeFfi(
	transcript: string,
	opts: {
		/** Samples written by `ttsSynthesize` / carried by the streamed body chunk (default 8). */
		ttsSamples?: number;
		ttsStreamSupported?: boolean;
		asrStreamSupported?: boolean;
		vadSupported?: boolean;
		/** Values `vadProcess` returns, one per call; the last one repeats (default `[0]`). */
		vadProbs?: readonly number[];
		speakerSupported?: boolean;
		/** The 256-float embedding `speakerEmbed` returns (defaults to zeros). */
		speakerEmbedding?: Float32Array;
		diarizSupported?: boolean;
		/** The per-frame label sequence `diarizSegment` returns. */
		diarizLabels?: Int8Array;
	} = {},
): ElizaInferenceFfi {
	const sampleCount = opts.ttsSamples ?? 8;
	const canStreamTts = opts.ttsStreamSupported ?? true;
	const canStreamAsr = opts.asrStreamSupported ?? false;
	const hasVad = opts.vadSupported ?? false;
	const probabilities = opts.vadProbs ?? [0];
	const hasSpeaker = opts.speakerSupported ?? false;
	const hasDiariz = opts.diarizSupported ?? false;

	// Number of `vadProcess` calls served so far; indexes into `probabilities`.
	let vadCalls = 0;

	const noop = (): void => {};
	const asrResult = () => ({ partial: transcript });

	return {
		libraryPath: "/fake/libelizainference.so",
		libraryAbiVersion: "3",
		create: () => 1n,
		destroy: noop,
		mmapAcquire: noop,
		mmapEvict: noop,
		ttsSynthesize: ({ out }) => {
			// Never write past the caller's buffer.
			const written = Math.min(sampleCount, out.length);
			out.fill(0.1, 0, written);
			return written;
		},
		asrTranscribe: () => transcript,
		ttsStreamSupported: () => canStreamTts,
		ttsSynthesizeStream: ({ onChunk }) => {
			const pcm = new Float32Array(sampleCount).fill(0.1);
			const cancelRequested = onChunk({
				pcm,
				isFinal: false,
			} as TtsStreamChunk);
			// The empty final tail is always delivered, even after a cancel request.
			onChunk({ pcm: new Float32Array(0), isFinal: true });
			return { cancelled: cancelRequested === true };
		},
		cancelTts: noop,
		setVerifierCallback: () => ({ close: () => {} }),
		encodeReferenceSupported: () => false,
		vadSupported: () => hasVad,
		vadOpen: () => 2n,
		vadProcess: ({ pcm }) => {
			if (pcm.length !== 512) throw new Error("fake VAD expected 512 samples");
			const slot = vadCalls++;
			// Past the end of the script, keep repeating the final probability.
			return probabilities[slot] ?? probabilities[probabilities.length - 1] ?? 0;
		},
		vadReset: noop,
		vadClose: noop,
		speakerSupported: () => hasSpeaker,
		speakerOpen: () => 3n,
		speakerEmbed: () => {
			const source = opts.speakerEmbedding ?? new Float32Array(256);
			// Return a copy so callers cannot mutate the configured embedding.
			return source.slice();
		},
		speakerClose: noop,
		diarizSupported: () => hasDiariz,
		diarizOpen: () => 4n,
		diarizSegment: () => {
			const source = opts.diarizLabels ?? new Int8Array(293);
			return source.slice();
		},
		diarizClose: noop,
		asrStreamSupported: () => canStreamAsr,
		asrStreamOpen: () => 1n,
		asrStreamFeed: noop,
		asrStreamPartial: asrResult,
		asrStreamFinish: asrResult,
		asrStreamClose: noop,
		close: noop,
	};
}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic-ish speech-like audio generator for VAD/wake-word smoke
|
|
3
|
+
* tests. Pure synthesis (glottal pulse train through a three-formant
|
|
4
|
+
* resonator bank with a syllable-rate amplitude envelope and mild f0
|
|
5
|
+
* jitter) — close enough to real speech in the time/frequency domain that
|
|
6
|
+
* the Silero VAD reads it as speech, without shipping a recorded WAV.
|
|
7
|
+
*
|
|
8
|
+
* `silence + speech + silence` is the canonical smoke fixture: the VAD
|
|
9
|
+
* should detect exactly one speech segment whose boundaries land inside
|
|
10
|
+
* the voiced region, and `VadDetector` should drop the leading/trailing
|
|
11
|
+
* silence windows from its speech-state timeline.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/**
 * Options accepted by `makeSpeechWithSilenceFixture`. Every field is
 * optional; the defaults noted below are the ones that function applies.
 */
export interface SpeechFixtureOptions {
  /** Sample rate of the generated buffer, in samples per second (default 16000). */
  sampleRate?: number;
  /** Length of the silent lead-in, in seconds (default 0.5). */
  leadSilenceSec?: number;
  /** Length of the synthesized speech region, in seconds (default 1.2). */
  speechSec?: number;
  /** Length of the silent tail, in seconds (default 0.5). */
  tailSilenceSec?: number;
  /** Seed for the PRNG that drives the f0 jitter (default 0xe11a). */
  seed?: number;
}
|
|
25
|
+
|
|
26
|
+
/** A generated `silence + speech + silence` buffer plus its speech boundaries. */
export interface SpeechFixture {
  /** Mono PCM samples covering lead silence, speech and tail silence. */
  pcm: Float32Array;
  /** Sample rate the buffer was generated at, in samples per second. */
  sampleRate: number;
  /** Index of the first sample of the synthesized speech region (inclusive). */
  speechStartSample: number;
  /** Index one past the last sample of the synthesized speech region (exclusive). */
  speechEndSample: number;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Mulberry32 pseudo-random number generator.
 *
 * @param seed Initial state; coerced to an unsigned 32-bit integer.
 * @returns A function that yields the next value in `[0, 1)` on each call.
 *   The sequence depends only on `seed`.
 */
function mulberry32(seed: number): () => number {
  let state = seed >>> 0;
  const next = (): number => {
    // Advance the state by the fixed increment, wrapping to 32 bits.
    state = (state + 0x6d2b79f5) | 0;
    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
    const scrambled =
      (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61)) ^ mixed;
    // Reinterpret as unsigned and scale by 2^32 to land in [0, 1).
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
  };
  return next;
}
|
|
43
|
+
|
|
44
|
+
/**
 * A bank of parallel two-pole resonators, one per `[centre, bandwidth]`
 * formant pair. Each resonator keeps its own two-sample history; the bank's
 * output is the weighted sum of all resonator outputs, where resonator `k`
 * is weighted by `1 - 0.25 * k`.
 */
class FormantBank {
  /** Feedback coefficients, delay-line state and mix weight per resonator. */
  private readonly sections: Array<{
    a1: number;
    a2: number;
    z1: number;
    z2: number;
    weight: number;
  }>;

  /**
   * @param sampleRate Sample rate the resonators run at.
   * @param formants `[centre frequency, bandwidth]` pairs, in the same unit
   *   as `sampleRate`.
   */
  constructor(
    sampleRate: number,
    formants: ReadonlyArray<readonly [number, number]>,
  ) {
    this.sections = formants.map(([fc, bw], index) => {
      // Pole radius follows from the bandwidth, pole angle from the centre.
      const radius = Math.exp((-Math.PI * bw) / sampleRate);
      const angle = (2 * Math.PI * fc) / sampleRate;
      return {
        a1: -2 * radius * Math.cos(angle),
        a2: radius * radius,
        z1: 0,
        z2: 0,
        weight: 1 - index * 0.25,
      };
    });
  }

  /**
   * Feed one excitation sample through every resonator and advance their
   * state.
   *
   * @param excitation Input sample applied to all resonators.
   * @returns The weighted sum of the resonator outputs for this sample.
   */
  step(excitation: number): number {
    let mixed = 0;
    for (const section of this.sections) {
      const out =
        excitation - section.a1 * section.z1 - section.a2 * section.z2;
      section.z2 = section.z1;
      section.z1 = out;
      mixed += out * section.weight;
    }
    return mixed;
  }
}
|
|
81
|
+
|
|
82
|
+
/**
 * Formant table handed to `FormantBank` by the speech fixture. Each entry is
 * a `[centre frequency, bandwidth]` pair, listed from lowest to highest
 * centre frequency.
 */
const DEFAULT_FORMANTS: ReadonlyArray<readonly [number, number]> = [
  [700, 80], // first formant
  [1220, 90], // second formant
  [2600, 120], // third formant
];
|
|
87
|
+
|
|
88
|
+
/**
 * Build a `silence + synthesized speech + silence` PCM buffer.
 *
 * The speech region is an impulse train whose pitch wobbles around 110 Hz
 * (±30 Hz at 5 Hz, plus up to ±2 Hz of seeded random jitter), shaped by a
 * 4 Hz raised-sine amplitude envelope and filtered through a `FormantBank`
 * built from `DEFAULT_FORMANTS`. Samples outside the speech region stay at
 * zero. For a given set of options the output is fully reproducible.
 *
 * @param opts Optional overrides; see `SpeechFixtureOptions` for defaults.
 * @returns The PCM buffer, its sample rate, and the half-open
 *   `[speechStartSample, speechEndSample)` range that holds the speech.
 * @throws RangeError If `sampleRate` is not a finite positive number, or if
 *   any duration is negative or not finite. Without this check such input
 *   would yield speech boundaries outside the returned buffer.
 */
export function makeSpeechWithSilenceFixture(
  opts: SpeechFixtureOptions = {},
): SpeechFixture {
  const sampleRate = opts.sampleRate ?? 16_000;
  const leadSec = opts.leadSilenceSec ?? 0.5;
  const speechSec = opts.speechSec ?? 1.2;
  const tailSec = opts.tailSilenceSec ?? 0.5;

  if (!Number.isFinite(sampleRate) || sampleRate <= 0) {
    throw new RangeError(
      `sampleRate must be a finite positive number, got ${sampleRate}`,
    );
  }
  const requireDuration = (label: string, seconds: number): void => {
    if (!Number.isFinite(seconds) || seconds < 0) {
      throw new RangeError(
        `${label} must be a finite non-negative number, got ${seconds}`,
      );
    }
  };
  requireDuration("leadSilenceSec", leadSec);
  requireDuration("speechSec", speechSec);
  requireDuration("tailSilenceSec", tailSec);

  const totalSec = leadSec + speechSec + tailSec;
  const n = Math.floor(totalSec * sampleRate);
  const pcm = new Float32Array(n);
  const speechStartSample = Math.floor(leadSec * sampleRate);
  const speechEndSample = Math.floor((leadSec + speechSec) * sampleRate);

  const rng = mulberry32(opts.seed ?? 0xe11a);
  const bank = new FormantBank(sampleRate, DEFAULT_FORMANTS);
  // Fractional position within the current pitch period; wraps at 1.
  let phase = 0;
  for (let i = speechStartSample; i < speechEndSample; i++) {
    const tInSpeech = (i - speechStartSample) / sampleRate;
    // Instantaneous pitch: slow sinusoidal wobble plus seeded jitter.
    const f0 =
      110 + 30 * Math.sin(2 * Math.PI * 5 * tInSpeech) + (rng() - 0.5) * 4;
    phase += f0 / sampleRate;
    // Emit a unit impulse each time a pitch period completes.
    let excitation = 0;
    if (phase >= 1) {
      phase -= 1;
      excitation = 1;
    }
    // Syllable-rate amplitude envelope (~4 Hz); the -pi/2 offset makes it
    // start from zero at the speech onset.
    const amp = Math.max(
      0,
      0.6 * (1 + Math.sin(2 * Math.PI * 4 * tInSpeech - Math.PI / 2)),
    );
    excitation *= amp;
    // The bank is stepped on every sample so its state keeps ringing
    // between impulses; 0.15 scales the result down.
    pcm[i] = bank.step(excitation) * 0.15;
  }
  return { pcm, sampleRate, speechStartSample, speechEndSample };
}
|