@elizaos/plugin-local-inference 2.0.0-beta.1 → 2.0.3-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +83 -0
- package/package.json +82 -15
- package/src/actions/generate-media.d.ts +59 -0
- package/src/actions/generate-media.d.ts.map +1 -0
- package/src/actions/generate-media.ts +647 -0
- package/src/actions/identify-speaker.d.ts +23 -0
- package/src/actions/identify-speaker.d.ts.map +1 -0
- package/src/actions/identify-speaker.ts +171 -0
- package/src/actions/transcription-control.d.ts +29 -0
- package/src/actions/transcription-control.d.ts.map +1 -0
- package/src/actions/transcription-control.test.ts +100 -0
- package/src/actions/transcription-control.ts +127 -0
- package/src/adapters/capacitor-llama/__tests__/compat-behavior.test.ts +218 -0
- package/src/adapters/capacitor-llama/__tests__/index.test.ts +68 -0
- package/src/adapters/capacitor-llama/__tests__/structured-output.test.ts +215 -0
- package/src/adapters/capacitor-llama/__tests__/text-streaming.test.ts +174 -0
- package/src/adapters/capacitor-llama/environment.ts +71 -0
- package/src/adapters/capacitor-llama/index.browser.ts +83 -0
- package/src/adapters/capacitor-llama/index.ts +807 -0
- package/src/adapters/capacitor-llama/loader.ts +109 -0
- package/src/adapters/capacitor-llama/structured-output.ts +165 -0
- package/src/adapters/capacitor-llama/text-streaming.ts +227 -0
- package/src/adapters/capacitor-llama/types.ts +374 -0
- package/src/backends/apple-foundation.ts +127 -0
- package/src/index.d.ts +8 -0
- package/src/index.d.ts.map +1 -0
- package/src/index.ts +62 -0
- package/src/local-inference-routes.d.ts +38 -0
- package/src/local-inference-routes.d.ts.map +1 -0
- package/src/local-inference-routes.test.ts +344 -0
- package/src/local-inference-routes.ts +1543 -0
- package/src/provider.d.ts +21 -0
- package/src/provider.d.ts.map +1 -0
- package/src/provider.ts +1082 -0
- package/src/routes/compat-helpers.d.ts +18 -0
- package/src/routes/compat-helpers.d.ts.map +1 -0
- package/src/routes/compat-helpers.ts +274 -0
- package/src/routes/family-member-route.d.ts +62 -0
- package/src/routes/family-member-route.d.ts.map +1 -0
- package/src/routes/family-member-route.ts +353 -0
- package/src/routes/index.d.ts +19 -0
- package/src/routes/index.d.ts.map +1 -0
- package/src/routes/index.ts +60 -0
- package/src/routes/live-diarization-route.d.ts +26 -0
- package/src/routes/live-diarization-route.d.ts.map +1 -0
- package/src/routes/live-diarization-route.test.ts +213 -0
- package/src/routes/live-diarization-route.ts +122 -0
- package/src/routes/local-inference-asr-route.d.ts +4 -0
- package/src/routes/local-inference-asr-route.d.ts.map +1 -0
- package/src/routes/local-inference-asr-route.test.ts +205 -0
- package/src/routes/local-inference-asr-route.ts +163 -0
- package/src/routes/local-inference-asr-transcribe.d.ts +20 -0
- package/src/routes/local-inference-asr-transcribe.d.ts.map +1 -0
- package/src/routes/local-inference-asr-transcribe.test.ts +118 -0
- package/src/routes/local-inference-asr-transcribe.ts +97 -0
- package/src/routes/local-inference-compat-routes.d.ts +16 -0
- package/src/routes/local-inference-compat-routes.d.ts.map +1 -0
- package/src/routes/local-inference-compat-routes.test.ts +485 -0
- package/src/routes/local-inference-compat-routes.ts +808 -0
- package/src/routes/local-inference-tts-route.d.ts +7 -0
- package/src/routes/local-inference-tts-route.d.ts.map +1 -0
- package/src/routes/local-inference-tts-route.test.ts +179 -0
- package/src/routes/local-inference-tts-route.ts +230 -0
- package/src/routes/transcript-audio-store.d.ts +15 -0
- package/src/routes/transcript-audio-store.d.ts.map +1 -0
- package/src/routes/transcript-audio-store.ts +27 -0
- package/src/routes/transcripts-routes.d.ts +36 -0
- package/src/routes/transcripts-routes.d.ts.map +1 -0
- package/src/routes/transcripts-routes.test.ts +144 -0
- package/src/routes/transcripts-routes.ts +159 -0
- package/src/routes/voice-first-run-routes.d.ts +62 -0
- package/src/routes/voice-first-run-routes.d.ts.map +1 -0
- package/src/routes/voice-first-run-routes.ts +524 -0
- package/src/routes/voice-models-routes.d.ts +62 -0
- package/src/routes/voice-models-routes.d.ts.map +1 -0
- package/src/routes/voice-models-routes.ts +554 -0
- package/src/routes/voice-profile-plugin-routes.d.ts +19 -0
- package/src/routes/voice-profile-plugin-routes.d.ts.map +1 -0
- package/src/routes/voice-profile-plugin-routes.ts +138 -0
- package/src/routes/voice-profiles-management-routes.d.ts +52 -0
- package/src/routes/voice-profiles-management-routes.d.ts.map +1 -0
- package/src/routes/voice-profiles-management-routes.ts +476 -0
- package/src/routes/voice-speaker-profile-routes.d.ts +57 -0
- package/src/routes/voice-speaker-profile-routes.d.ts.map +1 -0
- package/src/routes/voice-speaker-profile-routes.ts +199 -0
- package/src/runtime/aosp-llama-loader-selection.test.ts +80 -0
- package/src/runtime/capacitor-llama.d.ts +25 -0
- package/src/runtime/embedding-manager-support.d.ts +77 -0
- package/src/runtime/embedding-manager-support.d.ts.map +1 -0
- package/src/runtime/embedding-manager-support.ts +497 -0
- package/src/runtime/embedding-presets.d.ts +16 -0
- package/src/runtime/embedding-presets.d.ts.map +1 -0
- package/src/runtime/embedding-presets.ts +81 -0
- package/src/runtime/embedding-warmup-policy.d.ts +14 -0
- package/src/runtime/embedding-warmup-policy.d.ts.map +1 -0
- package/src/runtime/embedding-warmup-policy.test.ts +53 -0
- package/src/runtime/embedding-warmup-policy.ts +48 -0
- package/src/runtime/ensure-local-inference-handler.d.ts +62 -0
- package/src/runtime/ensure-local-inference-handler.d.ts.map +1 -0
- package/src/runtime/ensure-local-inference-handler.test.ts +528 -0
- package/src/runtime/ensure-local-inference-handler.ts +1448 -0
- package/src/runtime/index.d.ts +15 -0
- package/src/runtime/index.d.ts.map +1 -0
- package/src/runtime/index.ts +33 -0
- package/src/runtime/mobile-local-inference-gate.d.ts +31 -0
- package/src/runtime/mobile-local-inference-gate.d.ts.map +1 -0
- package/src/runtime/mobile-local-inference-gate.test.ts +69 -0
- package/src/runtime/mobile-local-inference-gate.ts +44 -0
- package/src/runtime/voice-entity-binding.d.ts +103 -0
- package/src/runtime/voice-entity-binding.d.ts.map +1 -0
- package/src/runtime/voice-entity-binding.transcript.test.ts +69 -0
- package/src/runtime/voice-entity-binding.ts +328 -0
- package/src/services/README.md +71 -0
- package/src/services/__tests__/backend-selector.test.ts +101 -0
- package/src/services/__tests__/checkpoint-manager.test.ts +376 -0
- package/src/services/__tests__/gpu-autotune.test.ts +400 -0
- package/src/services/__tests__/llm-streaming-binding.test.ts +85 -0
- package/src/services/__tests__/planner-grammar.test.ts +372 -0
- package/src/services/__tests__/runtime-target.test.ts +176 -0
- package/src/services/active-model-switch-rollback.test.ts +183 -0
- package/src/services/active-model.d.ts +282 -0
- package/src/services/active-model.d.ts.map +1 -0
- package/src/services/active-model.ts +1213 -0
- package/src/services/assignments.d.ts +71 -0
- package/src/services/assignments.d.ts.map +1 -0
- package/src/services/assignments.test.ts +80 -0
- package/src/services/assignments.ts +230 -0
- package/src/services/backend-selector.ts +95 -0
- package/src/services/backend.d.ts +346 -0
- package/src/services/backend.d.ts.map +1 -0
- package/src/services/backend.ts +612 -0
- package/src/services/bionic-host-loader.d.ts +46 -0
- package/src/services/bionic-host-loader.d.ts.map +1 -0
- package/src/services/bionic-host-loader.test.ts +133 -0
- package/src/services/bionic-host-loader.ts +180 -0
- package/src/services/bundled-models.d.ts +34 -0
- package/src/services/bundled-models.d.ts.map +1 -0
- package/src/services/bundled-models.ts +129 -0
- package/src/services/cache-bridge.d.ts +206 -0
- package/src/services/cache-bridge.d.ts.map +1 -0
- package/src/services/cache-bridge.test.ts +516 -0
- package/src/services/cache-bridge.ts +423 -0
- package/src/services/catalog.d.ts +10 -0
- package/src/services/catalog.d.ts.map +1 -0
- package/src/services/catalog.test.ts +238 -0
- package/src/services/catalog.ts +27 -0
- package/src/services/checkpoint-client.d.ts +109 -0
- package/src/services/checkpoint-client.d.ts.map +1 -0
- package/src/services/checkpoint-client.ts +258 -0
- package/src/services/checkpoint-manager.ts +474 -0
- package/src/services/cloud-fallback.d.ts +102 -0
- package/src/services/cloud-fallback.d.ts.map +1 -0
- package/src/services/cloud-fallback.ts +230 -0
- package/src/services/conversation-registry.d.ts +142 -0
- package/src/services/conversation-registry.d.ts.map +1 -0
- package/src/services/conversation-registry.test.ts +235 -0
- package/src/services/conversation-registry.ts +264 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts +95 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts.map +1 -0
- package/src/services/desktop-fused-ffi-backend-runtime.ts +339 -0
- package/src/services/device-bridge.d.ts +188 -0
- package/src/services/device-bridge.d.ts.map +1 -0
- package/src/services/device-bridge.ts +1237 -0
- package/src/services/device-resource-metrics.d.ts +149 -0
- package/src/services/device-resource-metrics.d.ts.map +1 -0
- package/src/services/device-resource-metrics.test.ts +98 -0
- package/src/services/device-resource-metrics.ts +346 -0
- package/src/services/device-tier.d.ts +115 -0
- package/src/services/device-tier.d.ts.map +1 -0
- package/src/services/device-tier.test.ts +371 -0
- package/src/services/device-tier.ts +410 -0
- package/src/services/downloader.d.ts +82 -0
- package/src/services/downloader.d.ts.map +1 -0
- package/src/services/downloader.test.ts +747 -0
- package/src/services/downloader.ts +925 -0
- package/src/services/engine-direct-bundle.test.ts +58 -0
- package/src/services/engine-streaming.test.ts +80 -0
- package/src/services/engine.d.ts +540 -0
- package/src/services/engine.d.ts.map +1 -0
- package/src/services/engine.ts +1909 -0
- package/src/services/ensure-local-artifacts.integration.test.ts +273 -0
- package/src/services/ensure-local-artifacts.test.ts +368 -0
- package/src/services/ensure-local-artifacts.ts +351 -0
- package/src/services/external-scanner.d.ts +17 -0
- package/src/services/external-scanner.d.ts.map +1 -0
- package/src/services/external-scanner.ts +312 -0
- package/src/services/ffi-llm-mock.ts +354 -0
- package/src/services/ffi-llm-streaming-abi.ts +442 -0
- package/src/services/ffi-streaming-backend.d.ts +180 -0
- package/src/services/ffi-streaming-backend.d.ts.map +1 -0
- package/src/services/ffi-streaming-backend.ts +382 -0
- package/src/services/ffi-streaming-runner.d.ts +122 -0
- package/src/services/ffi-streaming-runner.d.ts.map +1 -0
- package/src/services/ffi-streaming-runner.test.ts +60 -0
- package/src/services/ffi-streaming-runner.ts +354 -0
- package/src/services/ffi-unload-ordering.test.ts +162 -0
- package/src/services/gpu-autotune.ts +534 -0
- package/src/services/gpu-detect.d.ts +56 -0
- package/src/services/gpu-detect.d.ts.map +1 -0
- package/src/services/gpu-detect.ts +139 -0
- package/src/services/handler-registry.d.ts +72 -0
- package/src/services/handler-registry.d.ts.map +1 -0
- package/src/services/handler-registry.ts +240 -0
- package/src/services/hardware.d.ts +63 -0
- package/src/services/hardware.d.ts.map +1 -0
- package/src/services/hardware.test.ts +231 -0
- package/src/services/hardware.ts +410 -0
- package/src/services/hf-search.d.ts +26 -0
- package/src/services/hf-search.d.ts.map +1 -0
- package/src/services/hf-search.test.ts +69 -0
- package/src/services/hf-search.ts +420 -0
- package/src/services/image-description-runtime.d.ts +14 -0
- package/src/services/image-description-runtime.d.ts.map +1 -0
- package/src/services/image-description-runtime.test.ts +61 -0
- package/src/services/image-description-runtime.ts +118 -0
- package/src/services/imagegen/aosp-unavailable.d.ts +134 -0
- package/src/services/imagegen/aosp-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/aosp-unavailable.ts +229 -0
- package/src/services/imagegen/backend-selector.d.ts +118 -0
- package/src/services/imagegen/backend-selector.d.ts.map +1 -0
- package/src/services/imagegen/backend-selector.ts +277 -0
- package/src/services/imagegen/coreml-unavailable.d.ts +105 -0
- package/src/services/imagegen/coreml-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/coreml-unavailable.ts +237 -0
- package/src/services/imagegen/errors.d.ts +16 -0
- package/src/services/imagegen/errors.d.ts.map +1 -0
- package/src/services/imagegen/errors.ts +40 -0
- package/src/services/imagegen/index.d.ts +58 -0
- package/src/services/imagegen/index.d.ts.map +1 -0
- package/src/services/imagegen/index.ts +144 -0
- package/src/services/imagegen/mflux.d.ts +74 -0
- package/src/services/imagegen/mflux.d.ts.map +1 -0
- package/src/services/imagegen/mflux.ts +313 -0
- package/src/services/imagegen/sd-cpp.d.ts +180 -0
- package/src/services/imagegen/sd-cpp.d.ts.map +1 -0
- package/src/services/imagegen/sd-cpp.ts +718 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts +83 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/tensorrt-unavailable.ts +295 -0
- package/src/services/imagegen/types.d.ts +181 -0
- package/src/services/imagegen/types.d.ts.map +1 -0
- package/src/services/imagegen/types.ts +193 -0
- package/src/services/index.d.ts +29 -0
- package/src/services/index.d.ts.map +1 -0
- package/src/services/index.ts +211 -0
- package/src/services/inference-capabilities.d.ts +132 -0
- package/src/services/inference-capabilities.d.ts.map +1 -0
- package/src/services/inference-capabilities.test.ts +75 -0
- package/src/services/inference-capabilities.ts +204 -0
- package/src/services/inference-telemetry.d.ts +59 -0
- package/src/services/inference-telemetry.d.ts.map +1 -0
- package/src/services/inference-telemetry.ts +143 -0
- package/src/services/ios-llama-streaming.ts +248 -0
- package/src/services/kv-spill.d.ts +189 -0
- package/src/services/kv-spill.d.ts.map +1 -0
- package/src/services/kv-spill.test.ts +222 -0
- package/src/services/kv-spill.ts +356 -0
- package/src/services/latency-trace.d.ts +346 -0
- package/src/services/latency-trace.d.ts.map +1 -0
- package/src/services/latency-trace.test.ts +266 -0
- package/src/services/latency-trace.ts +844 -0
- package/src/services/llama-server-metrics.ts +304 -0
- package/src/services/llm-streaming-binding.d.ts +96 -0
- package/src/services/llm-streaming-binding.d.ts.map +1 -0
- package/src/services/llm-streaming-binding.ts +136 -0
- package/src/services/load-args.d.ts +82 -0
- package/src/services/load-args.d.ts.map +1 -0
- package/src/services/load-args.ts +81 -0
- package/src/services/manifest/eliza-1.manifest.v1.json +708 -0
- package/src/services/manifest/index.d.ts +4 -0
- package/src/services/manifest/index.d.ts.map +1 -0
- package/src/services/manifest/index.ts +66 -0
- package/src/services/manifest/manifest.test.ts +689 -0
- package/src/services/manifest/schema.d.ts +713 -0
- package/src/services/manifest/schema.d.ts.map +1 -0
- package/src/services/manifest/schema.ts +653 -0
- package/src/services/manifest/types.d.ts +30 -0
- package/src/services/manifest/types.d.ts.map +1 -0
- package/src/services/manifest/types.ts +55 -0
- package/src/services/manifest/validator.d.ts +66 -0
- package/src/services/manifest/validator.d.ts.map +1 -0
- package/src/services/manifest/validator.ts +567 -0
- package/src/services/memory-arbiter.d.ts +318 -0
- package/src/services/memory-arbiter.d.ts.map +1 -0
- package/src/services/memory-arbiter.test.ts +419 -0
- package/src/services/memory-arbiter.ts +925 -0
- package/src/services/memory-monitor.d.ts +122 -0
- package/src/services/memory-monitor.d.ts.map +1 -0
- package/src/services/memory-monitor.test.ts +208 -0
- package/src/services/memory-monitor.ts +297 -0
- package/src/services/memory-pressure.d.ts +130 -0
- package/src/services/memory-pressure.d.ts.map +1 -0
- package/src/services/memory-pressure.ts +414 -0
- package/src/services/mtp-doctor.d.ts +13 -0
- package/src/services/mtp-doctor.d.ts.map +1 -0
- package/src/services/mtp-doctor.ts +78 -0
- package/src/services/network-policy.d.ts +127 -0
- package/src/services/network-policy.d.ts.map +1 -0
- package/src/services/network-policy.ts +346 -0
- package/src/services/paths.d.ts +6 -0
- package/src/services/paths.d.ts.map +1 -0
- package/src/services/paths.ts +25 -0
- package/src/services/planner-skeleton.d.ts +124 -0
- package/src/services/planner-skeleton.d.ts.map +1 -0
- package/src/services/planner-skeleton.ts +175 -0
- package/src/services/providers.d.ts +38 -0
- package/src/services/providers.d.ts.map +1 -0
- package/src/services/providers.ts +507 -0
- package/src/services/ram-budget-cache.test.ts +163 -0
- package/src/services/ram-budget.d.ts +110 -0
- package/src/services/ram-budget.d.ts.map +1 -0
- package/src/services/ram-budget.ts +0 -0
- package/src/services/readiness.d.ts +9 -0
- package/src/services/readiness.d.ts.map +1 -0
- package/src/services/readiness.test.ts +87 -0
- package/src/services/readiness.ts +238 -0
- package/src/services/recommendation.d.ts +111 -0
- package/src/services/recommendation.d.ts.map +1 -0
- package/src/services/recommendation.ts +671 -0
- package/src/services/registry.d.ts +35 -0
- package/src/services/registry.d.ts.map +1 -0
- package/src/services/registry.ts +151 -0
- package/src/services/router-handler.d.ts +92 -0
- package/src/services/router-handler.d.ts.map +1 -0
- package/src/services/router-handler.test.ts +45 -0
- package/src/services/router-handler.ts +407 -0
- package/src/services/routing-policy.d.ts +69 -0
- package/src/services/routing-policy.d.ts.map +1 -0
- package/src/services/routing-policy.test.ts +164 -0
- package/src/services/routing-policy.ts +297 -0
- package/src/services/routing-preferences.d.ts +8 -0
- package/src/services/routing-preferences.d.ts.map +1 -0
- package/src/services/routing-preferences.ts +17 -0
- package/src/services/runtime-target.d.ts +98 -0
- package/src/services/runtime-target.d.ts.map +1 -0
- package/src/services/runtime-target.ts +154 -0
- package/src/services/service.d.ts +128 -0
- package/src/services/service.d.ts.map +1 -0
- package/src/services/service.test.ts +223 -0
- package/src/services/service.ts +735 -0
- package/src/services/session-pool.d.ts +72 -0
- package/src/services/session-pool.d.ts.map +1 -0
- package/src/services/session-pool.ts +153 -0
- package/src/services/structured-output/deterministic-repair.d.ts +23 -0
- package/src/services/structured-output/deterministic-repair.d.ts.map +1 -0
- package/src/services/structured-output/deterministic-repair.test.ts +169 -0
- package/src/services/structured-output/deterministic-repair.ts +443 -0
- package/src/services/structured-output/index.ts +4 -0
- package/src/services/structured-output.d.ts +311 -0
- package/src/services/structured-output.d.ts.map +1 -0
- package/src/services/structured-output.test.ts +483 -0
- package/src/services/structured-output.ts +712 -0
- package/src/services/system-memory.d.ts +33 -0
- package/src/services/system-memory.d.ts.map +1 -0
- package/src/services/system-memory.test.ts +47 -0
- package/src/services/system-memory.ts +67 -0
- package/src/services/transcription-priority.test.ts +211 -0
- package/src/services/types.d.ts +19 -0
- package/src/services/types.d.ts.map +1 -0
- package/src/services/types.ts +55 -0
- package/src/services/verify-on-device.d.ts +34 -0
- package/src/services/verify-on-device.d.ts.map +1 -0
- package/src/services/verify-on-device.test.ts +87 -0
- package/src/services/verify-on-device.ts +127 -0
- package/src/services/verify.d.ts +8 -0
- package/src/services/verify.d.ts.map +1 -0
- package/src/services/verify.ts +13 -0
- package/src/services/vision/aosp-unavailable.d.ts +115 -0
- package/src/services/vision/aosp-unavailable.d.ts.map +1 -0
- package/src/services/vision/aosp-unavailable.ts +163 -0
- package/src/services/vision/capacitor-llama.d.ts +99 -0
- package/src/services/vision/capacitor-llama.d.ts.map +1 -0
- package/src/services/vision/capacitor-llama.ts +255 -0
- package/src/services/vision/cloud-fallback.d.ts +47 -0
- package/src/services/vision/cloud-fallback.d.ts.map +1 -0
- package/src/services/vision/cloud-fallback.test.ts +243 -0
- package/src/services/vision/cloud-fallback.ts +268 -0
- package/src/services/vision/fallback-chain.test.ts +86 -0
- package/src/services/vision/hash.d.ts +71 -0
- package/src/services/vision/hash.d.ts.map +1 -0
- package/src/services/vision/hash.ts +157 -0
- package/src/services/vision/index.d.ts +95 -0
- package/src/services/vision/index.d.ts.map +1 -0
- package/src/services/vision/index.ts +251 -0
- package/src/services/vision/llama-server.d.ts +73 -0
- package/src/services/vision/llama-server.d.ts.map +1 -0
- package/src/services/vision/llama-server.ts +177 -0
- package/src/services/vision/types.d.ts +153 -0
- package/src/services/vision/types.d.ts.map +1 -0
- package/src/services/vision/types.ts +154 -0
- package/src/services/vision/vast-fallback.d.ts +18 -0
- package/src/services/vision/vast-fallback.d.ts.map +1 -0
- package/src/services/vision/vast-fallback.ts +127 -0
- package/src/services/vision-embedding-cache.d.ts +98 -0
- package/src/services/vision-embedding-cache.d.ts.map +1 -0
- package/src/services/vision-embedding-cache.ts +189 -0
- package/src/services/voice/VOICE_WORKBENCH.md +88 -0
- package/src/services/voice/__test-helpers__/fake-ffi.ts +94 -0
- package/src/services/voice/__test-helpers__/synthetic-speech.ts +124 -0
- package/src/services/voice/__tests__/checkpoint-manager.test.ts +241 -0
- package/src/services/voice/__tests__/checkpoint-policy.test.ts +270 -0
- package/src/services/voice/__tests__/eager-context-builder.test.ts +257 -0
- package/src/services/voice/__tests__/eliza1-eot-scorer.test.ts +288 -0
- package/src/services/voice/__tests__/eot-classifier.test.ts +431 -0
- package/src/services/voice/__tests__/optimistic-rollback.test.ts +312 -0
- package/src/services/voice/__tests__/prefill-client.test.ts +266 -0
- package/src/services/voice/__tests__/prefix-preserving-queue.test.ts +208 -0
- package/src/services/voice/__tests__/streaming-asr.test.ts +450 -0
- package/src/services/voice/__tests__/streaming-transcriber.test.ts +339 -0
- package/src/services/voice/__tests__/turn-detector-resolver.test.ts +195 -0
- package/src/services/voice/__tests__/voice-state-machine-prefill.test.ts +275 -0
- package/src/services/voice/__tests__/voice-state-machine.test.ts +354 -0
- package/src/services/voice/asr-timed.real.test.ts +141 -0
- package/src/services/voice/audio-frame-consumer.d.ts +212 -0
- package/src/services/voice/audio-frame-consumer.d.ts.map +1 -0
- package/src/services/voice/audio-frame-consumer.test.ts +343 -0
- package/src/services/voice/audio-frame-consumer.ts +491 -0
- package/src/services/voice/barge-in.d.ts +112 -0
- package/src/services/voice/barge-in.d.ts.map +1 -0
- package/src/services/voice/barge-in.test.ts +244 -0
- package/src/services/voice/barge-in.ts +336 -0
- package/src/services/voice/cancellation-coordinator.d.ts +127 -0
- package/src/services/voice/cancellation-coordinator.d.ts.map +1 -0
- package/src/services/voice/cancellation-coordinator.test.ts +196 -0
- package/src/services/voice/cancellation-coordinator.ts +269 -0
- package/src/services/voice/checkpoint-manager.d.ts +199 -0
- package/src/services/voice/checkpoint-manager.d.ts.map +1 -0
- package/src/services/voice/checkpoint-manager.ts +401 -0
- package/src/services/voice/checkpoint-policy.ts +336 -0
- package/src/services/voice/composite-eot-classifier.test.ts +59 -0
- package/src/services/voice/e2e-harness.test.ts +182 -0
- package/src/services/voice/e2e-harness.ts +743 -0
- package/src/services/voice/eager-context-builder.d.ts +170 -0
- package/src/services/voice/eager-context-builder.d.ts.map +1 -0
- package/src/services/voice/eager-context-builder.ts +262 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts +124 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/eliza1-eot-scorer.ts +242 -0
- package/src/services/voice/embedding-server.ts +200 -0
- package/src/services/voice/embedding.d.ts +133 -0
- package/src/services/voice/embedding.d.ts.map +1 -0
- package/src/services/voice/embedding.test.ts +131 -0
- package/src/services/voice/embedding.ts +243 -0
- package/src/services/voice/emotion-attribution.d.ts +68 -0
- package/src/services/voice/emotion-attribution.d.ts.map +1 -0
- package/src/services/voice/emotion-attribution.test.ts +129 -0
- package/src/services/voice/emotion-attribution.ts +361 -0
- package/src/services/voice/engine-bridge-cancellation.test.ts +422 -0
- package/src/services/voice/engine-bridge.d.ts +759 -0
- package/src/services/voice/engine-bridge.d.ts.map +1 -0
- package/src/services/voice/engine-bridge.test.ts +384 -0
- package/src/services/voice/engine-bridge.ts +2302 -0
- package/src/services/voice/eot-classifier-ggml.d.ts +179 -0
- package/src/services/voice/eot-classifier-ggml.d.ts.map +1 -0
- package/src/services/voice/eot-classifier-ggml.ts +566 -0
- package/src/services/voice/eot-classifier.d.ts +214 -0
- package/src/services/voice/eot-classifier.d.ts.map +1 -0
- package/src/services/voice/eot-classifier.ts +533 -0
- package/src/services/voice/errors.d.ts +20 -0
- package/src/services/voice/errors.d.ts.map +1 -0
- package/src/services/voice/errors.ts +32 -0
- package/src/services/voice/expressive-tags.d.ts +158 -0
- package/src/services/voice/expressive-tags.d.ts.map +1 -0
- package/src/services/voice/expressive-tags.ts +405 -0
- package/src/services/voice/ffi-bindings.d.ts +674 -0
- package/src/services/voice/ffi-bindings.d.ts.map +1 -0
- package/src/services/voice/ffi-bindings.test.ts +728 -0
- package/src/services/voice/ffi-bindings.ts +3225 -0
- package/src/services/voice/first-line-cache.d.ts +181 -0
- package/src/services/voice/first-line-cache.d.ts.map +1 -0
- package/src/services/voice/first-line-cache.ts +725 -0
- package/src/services/voice/fused-eot-scorer.d.ts +51 -0
- package/src/services/voice/fused-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/fused-eot-scorer.ts +135 -0
- package/src/services/voice/index.d.ts +91 -0
- package/src/services/voice/index.d.ts.map +1 -0
- package/src/services/voice/index.ts +481 -0
- package/src/services/voice/kokoro/__tests__/kokoro-backend.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.real.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.test.ts +60 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-discovery.test.ts +277 -0
- package/src/services/voice/kokoro/__tests__/kokoro-ffi-runtime.test.ts +235 -0
- package/src/services/voice/kokoro/__tests__/kokoro-runtime.test.ts +95 -0
- package/src/services/voice/kokoro/__tests__/phonemizer.test.ts +53 -0
- package/src/services/voice/kokoro/__tests__/runtime-selection.test.ts +231 -0
- package/src/services/voice/kokoro/__tests__/voices.test.ts +57 -0
- package/src/services/voice/kokoro/index.ts +79 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts +72 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-backend.ts +207 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts +58 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.ts +177 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts +75 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.ts +233 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts +100 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-runtime.ts +170 -0
- package/src/services/voice/kokoro/phoneme-stream.ts +123 -0
- package/src/services/voice/kokoro/phonemizer.d.ts +50 -0
- package/src/services/voice/kokoro/phonemizer.d.ts.map +1 -0
- package/src/services/voice/kokoro/phonemizer.ts +344 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts +61 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/pick-runtime.test.ts +91 -0
- package/src/services/voice/kokoro/pick-runtime.ts +130 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts +92 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts.map +1 -0
- package/src/services/voice/kokoro/runtime-selection.ts +237 -0
- package/src/services/voice/kokoro/types.d.ts +82 -0
- package/src/services/voice/kokoro/types.d.ts.map +1 -0
- package/src/services/voice/kokoro/types.ts +95 -0
- package/src/services/voice/kokoro/voice-presets.d.ts +23 -0
- package/src/services/voice/kokoro/voice-presets.d.ts.map +1 -0
- package/src/services/voice/kokoro/voice-presets.ts +129 -0
- package/src/services/voice/kokoro/voices.d.ts +30 -0
- package/src/services/voice/kokoro/voices.d.ts.map +1 -0
- package/src/services/voice/kokoro/voices.ts +64 -0
- package/src/services/voice/lifecycle.d.ts +135 -0
- package/src/services/voice/lifecycle.d.ts.map +1 -0
- package/src/services/voice/lifecycle.test.ts +315 -0
- package/src/services/voice/lifecycle.ts +301 -0
- package/src/services/voice/live-diarization-session.d.ts +96 -0
- package/src/services/voice/live-diarization-session.d.ts.map +1 -0
- package/src/services/voice/live-diarization-session.ts +289 -0
- package/src/services/voice/mic-source.d.ts +136 -0
- package/src/services/voice/mic-source.d.ts.map +1 -0
- package/src/services/voice/mic-source.test.ts +210 -0
- package/src/services/voice/mic-source.ts +503 -0
- package/src/services/voice/optimistic-policy.d.ts +109 -0
- package/src/services/voice/optimistic-policy.d.ts.map +1 -0
- package/src/services/voice/optimistic-policy.test.ts +101 -0
- package/src/services/voice/optimistic-policy.ts +192 -0
- package/src/services/voice/optimistic-rollback.ts +343 -0
- package/src/services/voice/partial-stabilizer.d.ts +73 -0
- package/src/services/voice/partial-stabilizer.d.ts.map +1 -0
- package/src/services/voice/partial-stabilizer.test.ts +68 -0
- package/src/services/voice/partial-stabilizer.ts +140 -0
- package/src/services/voice/phoneme-tokenizer.d.ts +49 -0
- package/src/services/voice/phoneme-tokenizer.d.ts.map +1 -0
- package/src/services/voice/phoneme-tokenizer.ts +158 -0
- package/src/services/voice/phrase-cache.d.ts +76 -0
- package/src/services/voice/phrase-cache.d.ts.map +1 -0
- package/src/services/voice/phrase-cache.test.ts +242 -0
- package/src/services/voice/phrase-cache.ts +186 -0
- package/src/services/voice/phrase-chunker.d.ts +62 -0
- package/src/services/voice/phrase-chunker.d.ts.map +1 -0
- package/src/services/voice/phrase-chunker.test.ts +239 -0
- package/src/services/voice/phrase-chunker.ts +281 -0
- package/src/services/voice/pipeline-impls.d.ts +151 -0
- package/src/services/voice/pipeline-impls.d.ts.map +1 -0
- package/src/services/voice/pipeline-impls.l6.test.ts +110 -0
- package/src/services/voice/pipeline-impls.test.ts +292 -0
- package/src/services/voice/pipeline-impls.ts +315 -0
- package/src/services/voice/pipeline.d.ts +216 -0
- package/src/services/voice/pipeline.d.ts.map +1 -0
- package/src/services/voice/pipeline.ts +505 -0
- package/src/services/voice/prefill-client.d.ts +123 -0
- package/src/services/voice/prefill-client.d.ts.map +1 -0
- package/src/services/voice/prefill-client.ts +316 -0
- package/src/services/voice/prefix-preserving-queue.d.ts +113 -0
- package/src/services/voice/prefix-preserving-queue.d.ts.map +1 -0
- package/src/services/voice/prefix-preserving-queue.ts +162 -0
- package/src/services/voice/profile-store.d.ts +248 -0
- package/src/services/voice/profile-store.d.ts.map +1 -0
- package/src/services/voice/profile-store.ts +887 -0
- package/src/services/voice/real-audio-decode.test.ts +148 -0
- package/src/services/voice/ring-buffer.d.ts +40 -0
- package/src/services/voice/ring-buffer.d.ts.map +1 -0
- package/src/services/voice/ring-buffer.test.ts +129 -0
- package/src/services/voice/ring-buffer.ts +123 -0
- package/src/services/voice/rollback-queue.d.ts +24 -0
- package/src/services/voice/rollback-queue.d.ts.map +1 -0
- package/src/services/voice/rollback-queue.ts +74 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts +67 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-placeholder.test.ts +97 -0
- package/src/services/voice/samantha-preset-placeholder.ts +148 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts +87 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-regenerator.ts +393 -0
- package/src/services/voice/scheduler.d.ts +146 -0
- package/src/services/voice/scheduler.d.ts.map +1 -0
- package/src/services/voice/scheduler.t2.test.ts +141 -0
- package/src/services/voice/scheduler.ts +927 -0
- package/src/services/voice/shared-resources.d.ts +190 -0
- package/src/services/voice/shared-resources.d.ts.map +1 -0
- package/src/services/voice/shared-resources.ts +320 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts +74 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts.map +1 -0
- package/src/services/voice/speaker/attribution-pipeline.ts +386 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts +59 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer-fused.real.test.ts +100 -0
- package/src/services/voice/speaker/diarizer-fused.ts +154 -0
- package/src/services/voice/speaker/diarizer.d.ts +75 -0
- package/src/services/voice/speaker/diarizer.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer.ts +218 -0
- package/src/services/voice/speaker/encoder-fused.d.ts +60 -0
- package/src/services/voice/speaker/encoder-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-fused.real.test.ts +113 -0
- package/src/services/voice/speaker/encoder-fused.ts +138 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts +33 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-ggml.ts +79 -0
- package/src/services/voice/speaker/encoder.d.ts +37 -0
- package/src/services/voice/speaker/encoder.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder.ts +105 -0
- package/src/services/voice/speaker-imprint.d.ts +83 -0
- package/src/services/voice/speaker-imprint.d.ts.map +1 -0
- package/src/services/voice/speaker-imprint.test.ts +185 -0
- package/src/services/voice/speaker-imprint.ts +312 -0
- package/src/services/voice/speaker-preset-cache.d.ts +77 -0
- package/src/services/voice/speaker-preset-cache.d.ts.map +1 -0
- package/src/services/voice/speaker-preset-cache.test.ts +154 -0
- package/src/services/voice/speaker-preset-cache.ts +195 -0
- package/src/services/voice/streaming-asr/streaming-pipeline-adapter.ts +292 -0
- package/src/services/voice/system-audio-sink.d.ts +73 -0
- package/src/services/voice/system-audio-sink.d.ts.map +1 -0
- package/src/services/voice/system-audio-sink.test.ts +29 -0
- package/src/services/voice/system-audio-sink.ts +366 -0
- package/src/services/voice/transcriber.d.ts +244 -0
- package/src/services/voice/transcriber.d.ts.map +1 -0
- package/src/services/voice/transcriber.test.ts +392 -0
- package/src/services/voice/transcriber.ts +704 -0
- package/src/services/voice/transcript-knowledge.d.ts +37 -0
- package/src/services/voice/transcript-knowledge.d.ts.map +1 -0
- package/src/services/voice/transcript-knowledge.test.ts +68 -0
- package/src/services/voice/transcript-knowledge.ts +75 -0
- package/src/services/voice/transcript-service.d.ts +41 -0
- package/src/services/voice/transcript-service.d.ts.map +1 -0
- package/src/services/voice/transcript-service.test.ts +137 -0
- package/src/services/voice/transcript-service.ts +141 -0
- package/src/services/voice/transcript-store.d.ts +53 -0
- package/src/services/voice/transcript-store.d.ts.map +1 -0
- package/src/services/voice/transcript-store.test.ts +153 -0
- package/src/services/voice/transcript-store.ts +132 -0
- package/src/services/voice/turn-controller.d.ts +183 -0
- package/src/services/voice/turn-controller.d.ts.map +1 -0
- package/src/services/voice/turn-controller.test.ts +575 -0
- package/src/services/voice/turn-controller.ts +596 -0
- package/src/services/voice/types.d.ts +643 -0
- package/src/services/voice/types.d.ts.map +1 -0
- package/src/services/voice/types.ts +699 -0
- package/src/services/voice/vad.d.ts +282 -0
- package/src/services/voice/vad.d.ts.map +1 -0
- package/src/services/voice/vad.test.ts +480 -0
- package/src/services/voice/vad.ts +827 -0
- package/src/services/voice/vad.v1-v4.test.ts +222 -0
- package/src/services/voice/voice-budget.d.ts +241 -0
- package/src/services/voice/voice-budget.d.ts.map +1 -0
- package/src/services/voice/voice-budget.test.ts +418 -0
- package/src/services/voice/voice-budget.ts +635 -0
- package/src/services/voice/voice-duet.test.ts +375 -0
- package/src/services/voice/voice-emotion-classifier.d.ts +95 -0
- package/src/services/voice/voice-emotion-classifier.d.ts.map +1 -0
- package/src/services/voice/voice-emotion-classifier.test.ts +210 -0
- package/src/services/voice/voice-emotion-classifier.ts +273 -0
- package/src/services/voice/voice-preset-format.d.ts +158 -0
- package/src/services/voice/voice-preset-format.d.ts.map +1 -0
- package/src/services/voice/voice-preset-format.ts +700 -0
- package/src/services/voice/voice-preset-generator.test.ts +89 -0
- package/src/services/voice/voice-profile-artifact.d.ts +116 -0
- package/src/services/voice/voice-profile-artifact.d.ts.map +1 -0
- package/src/services/voice/voice-profile-artifact.test.ts +138 -0
- package/src/services/voice/voice-profile-artifact.ts +518 -0
- package/src/services/voice/voice-profile-routes.d.ts +83 -0
- package/src/services/voice/voice-profile-routes.d.ts.map +1 -0
- package/src/services/voice/voice-profile-routes.test.ts +429 -0
- package/src/services/voice/voice-profile-routes.ts +425 -0
- package/src/services/voice/voice-scenario.ts +154 -0
- package/src/services/voice/voice-settings.d.ts +82 -0
- package/src/services/voice/voice-settings.d.ts.map +1 -0
- package/src/services/voice/voice-settings.ts +172 -0
- package/src/services/voice/voice-state-machine.d.ts +364 -0
- package/src/services/voice/voice-state-machine.d.ts.map +1 -0
- package/src/services/voice/voice-state-machine.ts +727 -0
- package/src/services/voice/voice-workbench-report.test.ts +168 -0
- package/src/services/voice/voice-workbench-report.ts +326 -0
- package/src/services/voice/voice-workbench.test.ts +158 -0
- package/src/services/voice/voice.test.ts +1070 -0
- package/src/services/voice/wake-word-ggml.d.ts +101 -0
- package/src/services/voice/wake-word-ggml.d.ts.map +1 -0
- package/src/services/voice/wake-word-ggml.ts +320 -0
- package/src/services/voice/wake-word.d.ts +255 -0
- package/src/services/voice/wake-word.d.ts.map +1 -0
- package/src/services/voice/wake-word.test.ts +298 -0
- package/src/services/voice/wake-word.ts +554 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts +70 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts.map +1 -0
- package/src/services/voice/wrap-with-first-line-cache.ts +267 -0
- package/src/services/voice-model-updater.d.ts +240 -0
- package/src/services/voice-model-updater.d.ts.map +1 -0
- package/src/services/voice-model-updater.ts +724 -0
- package/src/services/voice-prewarm.d.ts +3 -0
- package/src/services/voice-prewarm.d.ts.map +1 -0
- package/src/services/voice-prewarm.ts +51 -0
- package/dist/index.d.ts +0 -37
- package/dist/index.js +0 -1098
|
@@ -0,0 +1,700 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Binary format for `cache/voice-preset-*.bin`.
|
|
3
|
+
*
|
|
4
|
+
* Two versions are supported:
|
|
5
|
+
*
|
|
6
|
+
* v1 (`magic='ELZ1', version=1`) — legacy two-section layout used by the
|
|
7
|
+
* initial Kokoro-style placeholder. Carries a Float32 speaker embedding +
|
|
8
|
+
* a phrase-cache seed list. Still read for back-compat (older bundles only
|
|
9
|
+
* contain v1).
|
|
10
|
+
*
|
|
11
|
+
* v2 (`magic='ELZ1', version=2`) — superset adopted for the OmniVoice
|
|
12
|
+
* freeze. Adds three OmniVoice-specific sections that the v1 layout had
|
|
13
|
+
* no room for: pre-encoded `ref_audio_tokens` (int32, shape
|
|
14
|
+
* `[K, ref_T]`), a UTF-8 `ref_text` transcript of the reference clip, and
|
|
15
|
+
* a closed-vocabulary `instruct` string (the resolved VoiceDesign
|
|
16
|
+
* attributes). v2 readers handle v1 files transparently (the new sections
|
|
17
|
+
* default to empty). A v1 reader applied to a v2 file fails fast on
|
|
18
|
+
* `truncated-header` because the v2 header is larger.
|
|
19
|
+
*
|
|
20
|
+
* Layout (little-endian throughout):
|
|
21
|
+
*
|
|
22
|
+
* v1 header (24 bytes):
|
|
23
|
+
* +0 4 bytes magic 'ELZ1' (0x315A4C45)
|
|
24
|
+
* +4 4 bytes format version (uint32) — 1
|
|
25
|
+
* +8 4 bytes speaker embedding offset (uint32)
|
|
26
|
+
* +12 4 bytes speaker embedding byte length (uint32)
|
|
27
|
+
* +16 4 bytes phrase cache seed offset (uint32)
|
|
28
|
+
* +20 4 bytes phrase cache seed byte length (uint32)
|
|
29
|
+
*
|
|
30
|
+
* v2 header (64 bytes — additive, all section descriptors are
|
|
31
|
+
* `(offset:uint32, length:uint32)` pairs):
|
|
32
|
+
* +0 4 bytes magic 'ELZ1' (0x315A4C45)
|
|
33
|
+
* +4 4 bytes format version (uint32) — 2
|
|
34
|
+
* +8 4 bytes speaker embedding offset
|
|
35
|
+
* +12 4 bytes speaker embedding byte length
|
|
36
|
+
* +16 4 bytes phrase cache seed offset
|
|
37
|
+
* +20 4 bytes phrase cache seed byte length
|
|
38
|
+
* +24 4 bytes ref_audio_tokens offset
|
|
39
|
+
* +28 4 bytes ref_audio_tokens byte length
|
|
40
|
+
* +32 4 bytes ref_text offset
|
|
41
|
+
* +36 4 bytes ref_text byte length
|
|
42
|
+
* +40 4 bytes instruct offset
|
|
43
|
+
* +44 4 bytes instruct byte length
|
|
44
|
+
* +48 4 bytes metadata offset
|
|
45
|
+
* +52 4 bytes metadata byte length
|
|
46
|
+
* +56 4 bytes reserved (must be 0)
|
|
47
|
+
* +60 4 bytes reserved (must be 0)
|
|
48
|
+
*
|
|
49
|
+
* `ref_audio_tokens` payload (v2):
|
|
50
|
+
* +0 4 bytes K — codebook count (uint32, OmniVoice = 8)
|
|
51
|
+
* +4 4 bytes ref_T — frames per codebook (uint32)
|
|
52
|
+
* +8 ... int32 LE codebook samples, row-major shape `[K, ref_T]`
|
|
53
|
+
*
|
|
54
|
+
* `ref_text` payload (v2): raw UTF-8 bytes (no NUL terminator).
|
|
55
|
+
* `instruct` payload (v2): raw UTF-8 bytes (closed VoiceDesign vocabulary).
|
|
56
|
+
* `metadata` payload (v2): raw UTF-8 JSON bytes (codec sha256, corpus
|
|
57
|
+
* hash, etc.); the runtime never relies on
|
|
58
|
+
* metadata for correctness.
|
|
59
|
+
*
|
|
60
|
+
* Phrase cache seed payload (v1 + v2, identical):
|
|
61
|
+
* uint32 LE N (phrase count)
|
|
62
|
+
* for each phrase:
|
|
63
|
+
* uint16 LE text_byte_len
|
|
64
|
+
* uint8[] canonicalized text (UTF-8)
|
|
65
|
+
* uint32 LE sample_rate
|
|
66
|
+
* uint32 LE pcm_byte_len
|
|
67
|
+
* uint8[] PCM (Float32 LE samples)
|
|
68
|
+
*
|
|
69
|
+
* Per-section invariants:
|
|
70
|
+
* - Section bounds may not overlap the header.
|
|
71
|
+
* - Section bounds must fit within the file length.
|
|
72
|
+
* - A `length=0` section is allowed (means "absent"); the corresponding
|
|
73
|
+
* output field is an empty `Float32Array` / `Int32Array` / empty string.
|
|
74
|
+
* - `embedding.length % 4 == 0` (Float32).
|
|
75
|
+
* - `ref_audio_tokens.length` ≥ 8 (the two header words K, ref_T) and the
|
|
76
|
+
* payload is `8 + K*ref_T*4` bytes.
|
|
77
|
+
*/
|
|
78
|
+
|
|
79
|
+
/** File magic: the ASCII bytes 'E','L','Z','1' read as a little-endian uint32. */
export const VOICE_PRESET_MAGIC = 0x315a4c45;

/** Fixed header size, in bytes, for each format version. */
export const VOICE_PRESET_HEADER_BYTES_V1 = 24;
export const VOICE_PRESET_HEADER_BYTES_V2 = 64;

/** Format versions this module understands; v2 is the canonical write path. */
export const VOICE_PRESET_VERSION_V1 = 1;
export const VOICE_PRESET_VERSION_V2 = 2;
export const VOICE_PRESET_VERSION_CURRENT = VOICE_PRESET_VERSION_V2;
|
|
89
|
+
|
|
90
|
+
/** One pre-synthesized phrase stored in the phrase-cache seed section. */
export interface VoicePresetSeedPhrase {
  /** Canonicalized text (lowercase, single-spaced, trimmed). */
  text: string;
  /** Sample rate of `pcm`; stored on disk as a little-endian uint32. */
  sampleRate: number;
  /** Audio samples; stored on disk as raw Float32 bytes. */
  pcm: Float32Array;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Reference-audio token payload for OmniVoice presets.
 *
 * `tokens` is laid out row-major with shape `[K, refT]`, i.e. the token for
 * codebook `k` at frame `t` lives at `tokens[k * refT + t]`. `K` is the
 * codebook count (8 for OmniVoice / HiggsAudioV2) and `refT` the number of
 * frames per codebook.
 *
 * The all-empty payload (`K = 0`, `refT = 0`, zero-length `tokens`) is valid
 * and means the preset has no reference audio bound to it (instruct-only
 * voice).
 */
export interface RefAudioTokens {
  /** Codebook count. */
  K: number;
  /** Frames per codebook. */
  refT: number;
  /** Row-major `[K, refT]` token matrix. */
  tokens: Int32Array;
}
|
|
109
|
+
|
|
110
|
+
/** In-memory form of a parsed voice-preset file (either format version). */
export interface VoicePresetFile {
  /** Format version taken from the header (1 or 2). */
  version: number;
  /** Speaker embedding; zero-length when the section is absent. */
  embedding: Float32Array;
  /** Phrase-cache seed records; empty when the section is absent. */
  phrases: ReadonlyArray<VoicePresetSeedPhrase>;
  /** v2 only: for v1 files this is the empty payload. */
  refAudioTokens: RefAudioTokens;
  /** v2 only: `""` for v1 files. */
  refText: string;
  /** v2 only: `""` for v1 files. */
  instruct: string;
  /** v2 only: parsed JSON object; `{}` for v1 files. */
  metadata: Record<string, unknown>;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Error raised by the voice-preset reader and writers when a file (or a value
 * about to be serialized) violates the format. `code` is a stable,
 * machine-readable discriminator; `message` carries the human-readable detail.
 */
export class VoicePresetFormatError extends Error {
  /** Machine-readable failure category. */
  readonly code:
    | "bad-magic"
    | "bad-version"
    | "truncated-header"
    | "truncated-section"
    | "bad-section-bounds"
    | "bad-phrase-record"
    | "bad-embedding-length"
    | "bad-ref-tokens"
    | "bad-metadata";

  constructor(message: string, code: VoicePresetFormatError["code"]) {
    super(message);
    this.code = code;
    this.name = "VoicePresetFormatError";
  }
}
|
|
142
|
+
|
|
143
|
+
/**
 * One `(offset, length)` section descriptor from the file header. Both values
 * are byte counts; `offset` is measured from the start of the file. A
 * descriptor with `length === 0` denotes an absent section.
 */
interface SectionView {
  offset: number;
  length: number;
}
|
|
147
|
+
|
|
148
|
+
/** Decoded, bounds-checked header as produced by `readHeader`. */
interface ParsedHeader {
  /** Format version found in the file (1 or 2). */
  version: number;
  /** Header size that applies to `version` (24 for v1, 64 for v2). */
  headerBytes: number;
  embedding: SectionView;
  phrases: SectionView;
  /** The v2-only descriptors below are `EMPTY_SECTION` for v1 files. */
  refAudioTokens: SectionView;
  refText: SectionView;
  instruct: SectionView;
  metadata: SectionView;
}

/** Shared, frozen descriptor that stands for "section not present". */
const EMPTY_SECTION: SectionView = Object.freeze({ offset: 0, length: 0 });
|
|
160
|
+
|
|
161
|
+
/**
 * Validate one section descriptor against the header size and file length.
 *
 * Absent sections (`length === 0`) are always accepted, whatever their offset.
 *
 * @param sec - Descriptor to validate.
 * @param fileLen - Total file length in bytes.
 * @param headerBytes - Size of the header for the file's format version.
 * @throws VoicePresetFormatError (`bad-section-bounds`) when the section
 *   starts inside the header or ends beyond the end of the file.
 */
function checkSectionBounds(
  sec: SectionView,
  fileLen: number,
  headerBytes: number,
): void {
  const isPresent = sec.length !== 0;
  if (!isPresent) {
    return;
  }

  const startsInsideHeader = sec.offset < headerBytes;
  if (startsInsideHeader) {
    throw new VoicePresetFormatError(
      `voice preset section overlaps header (offset=${sec.offset} < header=${headerBytes})`,
      "bad-section-bounds",
    );
  }

  const sectionEnd = sec.offset + sec.length;
  if (sectionEnd > fileLen) {
    throw new VoicePresetFormatError(
      `voice preset section bounds exceed file length`,
      "bad-section-bounds",
    );
  }
}
|
|
180
|
+
|
|
181
|
+
/**
 * Decode and validate the fixed header of a voice-preset file.
 *
 * Checks run in this order: minimum (v1) header size, magic, version, header
 * size for the detected version, v2 reserved words, and finally the bounds of
 * every section descriptor. For v1 files the four v2-only descriptors are
 * `EMPTY_SECTION`.
 *
 * @param view - View over the whole file.
 * @returns The parsed header with all section descriptors.
 * @throws VoicePresetFormatError with code `truncated-header`, `bad-magic`,
 *   `bad-version` or `bad-section-bounds`.
 */
function readHeader(view: DataView): ParsedHeader {
  const fileLen = view.byteLength;

  if (fileLen < VOICE_PRESET_HEADER_BYTES_V1) {
    throw new VoicePresetFormatError(
      `voice preset file truncated: header needs ${VOICE_PRESET_HEADER_BYTES_V1} bytes, got ${view.byteLength}`,
      "truncated-header",
    );
  }

  const magic = view.getUint32(0, true);
  if (magic !== VOICE_PRESET_MAGIC) {
    throw new VoicePresetFormatError(
      `voice preset bad magic: expected 0x${VOICE_PRESET_MAGIC.toString(16)}, got 0x${magic.toString(16)}`,
      "bad-magic",
    );
  }

  const version = view.getUint32(4, true);
  const isV2 = version === VOICE_PRESET_VERSION_V2;
  if (!isV2 && version !== VOICE_PRESET_VERSION_V1) {
    throw new VoicePresetFormatError(
      `voice preset unsupported version: ${version} (this build supports 1 and 2)`,
      "bad-version",
    );
  }

  const headerBytes = isV2
    ? VOICE_PRESET_HEADER_BYTES_V2
    : VOICE_PRESET_HEADER_BYTES_V1;
  if (fileLen < headerBytes) {
    throw new VoicePresetFormatError(
      `voice preset file truncated: v${version} header needs ${headerBytes} bytes, got ${view.byteLength}`,
      "truncated-header",
    );
  }

  // Every descriptor is an (offset:uint32, length:uint32) pair, little-endian.
  const descriptorAt = (at: number): SectionView => ({
    offset: view.getUint32(at, true),
    length: view.getUint32(at + 4, true),
  });

  const embedding = descriptorAt(8);
  const phrases = descriptorAt(16);
  const refAudioTokens = isV2 ? descriptorAt(24) : EMPTY_SECTION;
  const refText = isV2 ? descriptorAt(32) : EMPTY_SECTION;
  const instruct = isV2 ? descriptorAt(40) : EMPTY_SECTION;
  const metadata = isV2 ? descriptorAt(48) : EMPTY_SECTION;

  if (isV2) {
    // Reserved words must be zero — fail closed on accidental reuse.
    const r0 = view.getUint32(56, true);
    const r1 = view.getUint32(60, true);
    if (r0 !== 0 || r1 !== 0) {
      throw new VoicePresetFormatError(
        `voice preset v2 reserved header words must be 0 (got ${r0}, ${r1})`,
        "bad-section-bounds",
      );
    }
  }

  const inHeaderOrder = [
    embedding,
    phrases,
    refAudioTokens,
    refText,
    instruct,
    metadata,
  ];
  for (const sec of inHeaderOrder) {
    checkSectionBounds(sec, fileLen, headerBytes);
  }

  return {
    version,
    headerBytes,
    embedding,
    phrases,
    refAudioTokens,
    refText,
    instruct,
    metadata,
  };
}
|
|
276
|
+
|
|
277
|
+
/**
 * Copy a byte range out of `bytes` and expose it as Float32 samples.
 *
 * The range may start at any byte position in the underlying buffer, which a
 * `Float32Array` view cannot do directly, so the bytes are first copied into
 * a freshly allocated buffer.
 *
 * @param bytes - Source bytes.
 * @param relativeOffset - Start of the range, relative to `bytes` (not to
 *   `bytes.buffer`).
 * @param byteLength - Number of bytes to copy.
 * @returns A `Float32Array` over the copy, holding `byteLength / 4` elements.
 */
function copyFloat32(
  bytes: Uint8Array,
  relativeOffset: number,
  byteLength: number,
): Float32Array {
  const scratch = new ArrayBuffer(byteLength);
  const source = bytes.subarray(relativeOffset, relativeOffset + byteLength);
  new Uint8Array(scratch).set(source);
  return new Float32Array(scratch, 0, byteLength / 4);
}
|
|
289
|
+
|
|
290
|
+
/**
 * Copy a byte range out of `bytes` and expose it as Int32 values.
 *
 * As with the Float32 variant, the bytes are copied into a fresh buffer so
 * the source range may start at any byte position.
 *
 * @param bytes - Source bytes.
 * @param relativeOffset - Start of the range, relative to `bytes`.
 * @param byteLength - Number of bytes to copy.
 * @returns An `Int32Array` over the copy, holding `byteLength / 4` elements.
 */
function copyInt32(
  bytes: Uint8Array,
  relativeOffset: number,
  byteLength: number,
): Int32Array {
  const scratch = new ArrayBuffer(byteLength);
  const source = bytes.subarray(relativeOffset, relativeOffset + byteLength);
  new Uint8Array(scratch).set(source);
  return new Int32Array(scratch, 0, byteLength / 4);
}
|
|
299
|
+
|
|
300
|
+
/**
 * Extract the speaker embedding section.
 *
 * @param bytes - Whole file.
 * @param sec - Embedding section descriptor.
 * @returns A copy of the embedding, or a zero-length array when the section
 *   is absent.
 * @throws VoicePresetFormatError (`bad-embedding-length`) when the section
 *   length is not a whole number of Float32 values.
 */
function readEmbedding(bytes: Uint8Array, sec: SectionView): Float32Array {
  const { offset, length } = sec;
  if (length !== 0 && length % 4 !== 0) {
    throw new VoicePresetFormatError(
      `voice preset embedding length ${sec.length} is not a multiple of 4`,
      "bad-embedding-length",
    );
  }
  return length === 0
    ? new Float32Array(0)
    : copyFloat32(bytes, offset, length);
}
|
|
310
|
+
|
|
311
|
+
/**
 * Decode the `ref_audio_tokens` section.
 *
 * The payload starts with two little-endian uint32 words, `K` and `ref_T`,
 * followed by exactly `K * ref_T` int32 tokens.
 *
 * @param bytes - Whole file.
 * @param sec - Section descriptor.
 * @returns The decoded payload, or the all-empty payload when the section is
 *   absent.
 * @throws VoicePresetFormatError (`bad-ref-tokens`) when the section is
 *   shorter than its two header words, the token bytes are not a multiple of
 *   4, or the byte count disagrees with `K * ref_T * 4`.
 */
function readRefAudioTokens(
  bytes: Uint8Array,
  sec: SectionView,
): RefAudioTokens {
  const { offset, length } = sec;
  if (length === 0) {
    return { K: 0, refT: 0, tokens: new Int32Array(0) };
  }

  const reject = (detail: string): never => {
    throw new VoicePresetFormatError(detail, "bad-ref-tokens");
  };

  if (length < 8) {
    reject(
      `voice preset ref_audio_tokens section truncated (need ≥ 8 bytes, got ${sec.length})`,
    );
  }

  const words = new DataView(bytes.buffer, bytes.byteOffset + offset, length);
  const K = words.getUint32(0, true);
  const refT = words.getUint32(4, true);

  const tokenBytes = length - 8;
  if (tokenBytes % 4 !== 0) {
    reject(
      `voice preset ref_audio_tokens payload bytes ${tokenBytes} is not a multiple of 4`,
    );
  }

  const expected = K * refT * 4;
  if (tokenBytes !== expected) {
    reject(
      `voice preset ref_audio_tokens shape mismatch: K=${K}, ref_T=${refT}, expected ${expected} bytes, got ${tokenBytes}`,
    );
  }

  if (tokenBytes === 0) {
    return { K, refT, tokens: new Int32Array(0) };
  }
  return { K, refT, tokens: copyInt32(bytes, offset + 8, tokenBytes) };
}
|
|
351
|
+
|
|
352
|
+
/**
 * Decode a section as UTF-8 text.
 *
 * Decoding is strict: the decoder is created with `fatal: true`, so invalid
 * byte sequences make `decode` throw instead of yielding replacement
 * characters.
 *
 * @param bytes - Whole file.
 * @param sec - Section descriptor.
 * @returns The decoded text, or `""` when the section is absent.
 */
function readUtf8(bytes: Uint8Array, sec: SectionView): string {
  if (sec.length === 0) {
    return "";
  }
  const end = sec.offset + sec.length;
  const strictDecoder = new TextDecoder("utf-8", { fatal: true });
  return strictDecoder.decode(bytes.subarray(sec.offset, end));
}
|
|
357
|
+
|
|
358
|
+
/**
 * Decode the `metadata` section: UTF-8 JSON that must parse to a plain object.
 *
 * All failure modes are reported as `VoicePresetFormatError` with code
 * `bad-metadata`. That includes invalid UTF-8: the strict decoder used by
 * `readUtf8` throws on malformed bytes, so the decode step is guarded here
 * rather than letting a raw decoder error escape the parser.
 *
 * @param bytes - Whole file.
 * @param sec - Metadata section descriptor.
 * @returns The parsed object, or `{}` when the section is absent.
 * @throws VoicePresetFormatError (`bad-metadata`) when the bytes are not
 *   valid UTF-8, not valid JSON, or the JSON value is not an object
 *   (`null`, arrays and primitives are rejected).
 */
function readMetadata(
  bytes: Uint8Array,
  sec: SectionView,
): Record<string, unknown> {
  if (sec.length === 0) return {};

  let text: string;
  try {
    text = readUtf8(bytes, sec);
  } catch (err) {
    throw new VoicePresetFormatError(
      `voice preset metadata is not valid UTF-8: ${(err as Error).message}`,
      "bad-metadata",
    );
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch (err) {
    throw new VoicePresetFormatError(
      `voice preset metadata is not valid JSON: ${(err as Error).message}`,
      "bad-metadata",
    );
  }

  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw new VoicePresetFormatError(
      `voice preset metadata must be a JSON object`,
      "bad-metadata",
    );
  }
  return parsed as Record<string, unknown>;
}
|
|
381
|
+
|
|
382
|
+
/**
 * Decode the phrase-cache seed section.
 *
 * Layout: a little-endian uint32 record count, then per record a uint16 text
 * byte length, the UTF-8 text, a uint32 sample rate, a uint32 PCM byte length
 * and the PCM bytes (Float32 samples). Every read is bounds-checked against
 * the section length before it happens.
 *
 * Phrase text is decoded strictly (`fatal: true`). A malformed byte sequence
 * is reported as a `bad-phrase-record` format error rather than escaping as
 * the decoder's own exception.
 *
 * @param bytes - Whole file.
 * @param sec - Phrase section descriptor.
 * @returns The decoded records (PCM is copied), or `[]` when the section is
 *   absent.
 * @throws VoicePresetFormatError with code `truncated-section` when the
 *   section cannot hold the record count, or `bad-phrase-record` when a
 *   record overruns the section, has a PCM length that is not a multiple of
 *   4, or contains text that is not valid UTF-8.
 */
function readPhrases(
  bytes: Uint8Array,
  sec: SectionView,
): VoicePresetSeedPhrase[] {
  if (sec.length === 0) return [];
  if (sec.length < 4) {
    throw new VoicePresetFormatError(
      "voice preset phrase section truncated before count",
      "truncated-section",
    );
  }
  const view = new DataView(
    bytes.buffer,
    bytes.byteOffset + sec.offset,
    sec.length,
  );
  const decoder = new TextDecoder("utf-8", { fatal: true });
  // `pos` is relative to the start of the section, not the file.
  let pos = 0;
  const count = view.getUint32(pos, true);
  pos += 4;
  const out: VoicePresetSeedPhrase[] = [];
  for (let i = 0; i < count; i++) {
    if (pos + 2 > sec.length) {
      throw new VoicePresetFormatError(
        `voice preset phrase #${i}: truncated before text length`,
        "bad-phrase-record",
      );
    }
    const textLen = view.getUint16(pos, true);
    pos += 2;
    if (pos + textLen > sec.length) {
      throw new VoicePresetFormatError(
        `voice preset phrase #${i}: text overruns section`,
        "bad-phrase-record",
      );
    }
    const textBytes = new Uint8Array(
      bytes.buffer,
      bytes.byteOffset + sec.offset + pos,
      textLen,
    );
    let text: string;
    try {
      text = decoder.decode(textBytes);
    } catch (err) {
      throw new VoicePresetFormatError(
        `voice preset phrase #${i}: text is not valid UTF-8: ${(err as Error).message}`,
        "bad-phrase-record",
      );
    }
    pos += textLen;
    if (pos + 8 > sec.length) {
      throw new VoicePresetFormatError(
        `voice preset phrase #${i}: truncated before sample_rate/pcm_len`,
        "bad-phrase-record",
      );
    }
    const sampleRate = view.getUint32(pos, true);
    pos += 4;
    const pcmByteLen = view.getUint32(pos, true);
    pos += 4;
    if (pcmByteLen % 4 !== 0) {
      throw new VoicePresetFormatError(
        `voice preset phrase #${i}: pcm byte length ${pcmByteLen} is not a multiple of 4`,
        "bad-phrase-record",
      );
    }
    if (pos + pcmByteLen > sec.length) {
      throw new VoicePresetFormatError(
        `voice preset phrase #${i}: pcm overruns section`,
        "bad-phrase-record",
      );
    }
    const pcm = copyFloat32(bytes, sec.offset + pos, pcmByteLen);
    pos += pcmByteLen;
    out.push({ text, sampleRate, pcm });
  }
  return out;
}
|
|
453
|
+
|
|
454
|
+
/**
 * Parse a voice-preset binary blob (v1 or v2) into its in-memory form.
 *
 * The header is validated first, then each section is decoded in header
 * order. For v1 input the v2-only fields come back as their empty
 * equivalents. Structural problems are reported as `VoicePresetFormatError`.
 * The `ref_text` and `instruct` sections are decoded with a strict UTF-8
 * decoder, so malformed bytes there surface as the decoder's own exception.
 *
 * @param bytes - Complete file contents; may be a view into a larger buffer.
 * @returns The decoded preset. Numeric arrays are copies, not views into
 *   `bytes`.
 */
export function readVoicePresetFile(bytes: Uint8Array): VoicePresetFile {
  const { buffer, byteOffset, byteLength } = bytes;
  const header = readHeader(new DataView(buffer, byteOffset, byteLength));

  const embedding = readEmbedding(bytes, header.embedding);
  const phrases = readPhrases(bytes, header.phrases);
  const refAudioTokens = readRefAudioTokens(bytes, header.refAudioTokens);
  const refText = readUtf8(bytes, header.refText);
  const instruct = readUtf8(bytes, header.instruct);
  const metadata = readMetadata(bytes, header.metadata);

  return {
    version: header.version,
    embedding,
    phrases,
    refAudioTokens,
    refText,
    instruct,
    metadata,
  };
}
|
|
473
|
+
|
|
474
|
+
/**
 * Serialize a preset in the legacy v1 layout: the 24-byte header, then the
 * embedding bytes, then the phrase-cache seed section.
 *
 * Intended only for callers that deliberately need the v1 shape (e.g. the
 * existing Kokoro-style placeholder builder); new code should use
 * `writeVoicePresetFileV2`.
 *
 * @param file - Embedding and seed phrases to persist.
 * @returns A newly allocated `Uint8Array` holding the complete file.
 * @throws VoicePresetFormatError (`bad-phrase-record`) when a phrase's UTF-8
 *   text exceeds 65535 bytes, the most a uint16 length prefix can express.
 */
export function writeVoicePresetFile(file: {
  embedding: Float32Array;
  phrases: ReadonlyArray<VoicePresetSeedPhrase>;
}): Uint8Array {
  const utf8 = new TextEncoder();
  const encodedTexts = file.phrases.map((p) => utf8.encode(p.text));

  // Phrase section size: uint32 count + per-record fixed fields and payloads.
  let phrBytes = 4;
  for (const [i, t] of encodedTexts.entries()) {
    if (t.byteLength > 0xffff) {
      throw new VoicePresetFormatError(
        `phrase #${i} text too long (${t.byteLength} bytes, max 65535)`,
        "bad-phrase-record",
      );
    }
    phrBytes += 2 + t.byteLength + 4 + 4 + file.phrases[i].pcm.byteLength;
  }

  const embBytes = file.embedding.byteLength;
  const embOff = VOICE_PRESET_HEADER_BYTES_V1;
  const phrOff = embOff + embBytes;

  const out = new Uint8Array(phrOff + phrBytes);
  const view = new DataView(out.buffer);

  // Header: six consecutive little-endian uint32 words.
  const headerWords = [
    VOICE_PRESET_MAGIC,
    VOICE_PRESET_VERSION_V1,
    embOff,
    embBytes,
    phrOff,
    phrBytes,
  ];
  headerWords.forEach((word, index) => {
    view.setUint32(index * 4, word, true);
  });

  // Embedding: raw bytes of the Float32Array, honouring its own byteOffset.
  const { buffer, byteOffset, byteLength } = file.embedding;
  out.set(new Uint8Array(buffer, byteOffset, byteLength), embOff);

  writePhraseSection(out, view, phrOff, file.phrases, encodedTexts);

  return out;
}
|
|
530
|
+
|
|
531
|
+
/**
 * Write the phrase-cache seed section into `out`, starting at `startOff`.
 *
 * Emits the uint32 record count and then, per phrase: uint16 text length,
 * text bytes, uint32 sample rate, uint32 PCM byte length, PCM bytes. All
 * multi-byte integers are little-endian. The caller must have sized `out`
 * for the whole section.
 *
 * @param out - Destination file buffer.
 * @param view - `DataView` used for the integer writes; offsets passed to it
 *   are the same as those used to index `out`.
 * @param startOff - Byte offset at which the section begins.
 * @param phrases - Records to serialize.
 * @param encodedTexts - UTF-8 bytes of each phrase's text, index-aligned with
 *   `phrases`.
 */
function writePhraseSection(
  out: Uint8Array,
  view: DataView,
  startOff: number,
  phrases: ReadonlyArray<VoicePresetSeedPhrase>,
  encodedTexts: Uint8Array[],
): void {
  let cursor = startOff;

  const putU32 = (value: number): void => {
    view.setUint32(cursor, value, true);
    cursor += 4;
  };
  const putBytes = (chunk: Uint8Array): void => {
    out.set(chunk, cursor);
    cursor += chunk.byteLength;
  };

  putU32(phrases.length);

  for (const [index, phrase] of phrases.entries()) {
    const textBytes = encodedTexts[index];

    view.setUint16(cursor, textBytes.byteLength, true);
    cursor += 2;
    putBytes(textBytes);

    putU32(phrase.sampleRate);

    const { pcm } = phrase;
    putU32(pcm.byteLength);
    putBytes(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
  }
}
|
|
563
|
+
|
|
564
|
+
/**
 * Serialize a voice preset into the v2 (additive) binary layout.
 *
 * Intended for the OmniVoice freeze pipeline (`freeze-voice.mjs`) and for any
 * other producer that has to store `refAudioTokens` / `refText` / `instruct`
 * next to the v1 embedding and phrase-seed sections.
 *
 * Every field is optional. A field that is omitted (or supplied empty) takes
 * no space in the payload and is recorded in the header as
 * (offset=0, length=0), so it reads back as its empty equivalent.
 *
 * Sections are laid out after the header in this fixed order: embedding,
 * phrases, ref audio tokens, ref text, instruct, metadata. Metadata is stored
 * as UTF-8 JSON; an object with no keys counts as empty.
 *
 * @param file - The preset parts to persist.
 * @returns A freshly allocated buffer holding the header plus all sections.
 * @throws VoicePresetFormatError with code "bad-ref-tokens" when
 *   `K * refT` does not equal `tokens.length`.
 * @throws VoicePresetFormatError with code "bad-phrase-record" when a phrase's
 *   UTF-8 encoded text is longer than 65535 bytes (its length is stored as a
 *   u16).
 */
export function writeVoicePresetFileV2(file: {
	embedding?: Float32Array;
	phrases?: ReadonlyArray<VoicePresetSeedPhrase>;
	refAudioTokens?: RefAudioTokens;
	refText?: string;
	instruct?: string;
	metadata?: Record<string, unknown>;
}): Uint8Array {
	// Substitute the empty equivalent for every part the caller left out.
	const embedding = file.embedding ?? new Float32Array(0);
	const seedPhrases = file.phrases ?? [];
	const refTokens = file.refAudioTokens ?? {
		K: 0,
		refT: 0,
		tokens: new Int32Array(0),
	};
	const refText = file.refText ?? "";
	const instruct = file.instruct ?? "";
	const metadata = file.metadata ?? {};

	// The token matrix must be exactly K x refT entries.
	const { K, refT, tokens } = refTokens;
	if (K * refT !== tokens.length) {
		throw new VoicePresetFormatError(
			`ref_audio_tokens shape mismatch: K=${K}, ref_T=${refT}, but tokens.length=${tokens.length}`,
			"bad-ref-tokens",
		);
	}

	// Encode every textual payload to UTF-8 once, before sizing anything.
	const utf8 = new TextEncoder();
	const phraseTextBytes = seedPhrases.map((phrase) =>
		utf8.encode(phrase.text),
	);
	const refTextBytes = utf8.encode(refText);
	const instructBytes = utf8.encode(instruct);
	const hasMetadata = Object.keys(metadata).length !== 0;
	const metadataBytes = hasMetadata
		? utf8.encode(JSON.stringify(metadata))
		: new Uint8Array(0);

	// --- Section sizes -----------------------------------------------------
	const embLen = embedding.byteLength;

	let phraseLen = 0;
	if (seedPhrases.length !== 0) {
		// 4-byte section prefix, then one record per phrase:
		// u16 text length + text + u32 sample rate + u32 pcm length + pcm.
		phraseLen = 4;
		for (const [index, textBytes] of phraseTextBytes.entries()) {
			if (textBytes.byteLength > 0xffff) {
				throw new VoicePresetFormatError(
					`phrase #${index} text too long (${textBytes.byteLength} bytes, max 65535)`,
					"bad-phrase-record",
				);
			}
			phraseLen +=
				2 + textBytes.byteLength + 4 + 4 + seedPhrases[index].pcm.byteLength;
		}
	}

	// Two u32 fields (K, refT) precede the raw token data.
	const refTokensLen =
		tokens.length === 0 && K === 0 ? 0 : 8 + tokens.byteLength;

	// --- Section offsets ---------------------------------------------------
	// Sections are packed back to back after the header, in declared order.
	// An empty section occupies nothing and reports offset 0.
	let cursor = VOICE_PRESET_HEADER_BYTES_V2;
	const place = (length: number): number => {
		const offset = length > 0 ? cursor : 0;
		cursor += length;
		return offset;
	};
	const embOff = place(embLen);
	const phraseOff = place(phraseLen);
	const refTokensOff = place(refTokensLen);
	const refTextOff = place(refTextBytes.byteLength);
	const instructOff = place(instructBytes.byteLength);
	const metadataOff = place(metadataBytes.byteLength);

	const out = new Uint8Array(cursor);
	const view = new DataView(out.buffer);

	// --- Header ------------------------------------------------------------
	// Magic and version, then consecutive little-endian u32 (offset, length)
	// pairs starting at byte 8. The last two slots (bytes 56 and 60) are
	// written as zero.
	view.setUint32(0, VOICE_PRESET_MAGIC, true);
	view.setUint32(4, VOICE_PRESET_VERSION_V2, true);
	const headerSlots = [
		embOff,
		embLen,
		phraseOff,
		phraseLen,
		refTokensOff,
		refTokensLen,
		refTextOff,
		refTextBytes.byteLength,
		instructOff,
		instructBytes.byteLength,
		metadataOff,
		metadataBytes.byteLength,
		0,
		0,
	];
	headerSlots.forEach((value, slot) => {
		view.setUint32(8 + slot * 4, value, true);
	});

	// --- Payload -----------------------------------------------------------
	// Reinterpret a typed array's backing storage as raw bytes.
	const rawBytes = (typed: Float32Array | Int32Array): Uint8Array =>
		new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength);

	if (embLen > 0) {
		out.set(rawBytes(embedding), embOff);
	}
	if (phraseLen > 0) {
		writePhraseSection(out, view, phraseOff, seedPhrases, phraseTextBytes);
	}
	if (refTokensLen > 0) {
		view.setUint32(refTokensOff, K, true);
		view.setUint32(refTokensOff + 4, refT, true);
		if (tokens.byteLength > 0) {
			out.set(rawBytes(tokens), refTokensOff + 8);
		}
	}
	if (refTextBytes.byteLength > 0) {
		out.set(refTextBytes, refTextOff);
	}
	if (instructBytes.byteLength > 0) {
		out.set(instructBytes, instructOff);
	}
	if (metadataBytes.byteLength > 0) {
		out.set(metadataBytes, metadataOff);
	}

	return out;
}
|