@elizaos/plugin-local-inference 2.0.0-beta.1 → 2.0.3-beta.2
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +83 -0
- package/package.json +82 -15
- package/src/actions/generate-media.d.ts +59 -0
- package/src/actions/generate-media.d.ts.map +1 -0
- package/src/actions/generate-media.ts +647 -0
- package/src/actions/identify-speaker.d.ts +23 -0
- package/src/actions/identify-speaker.d.ts.map +1 -0
- package/src/actions/identify-speaker.ts +171 -0
- package/src/actions/transcription-control.d.ts +29 -0
- package/src/actions/transcription-control.d.ts.map +1 -0
- package/src/actions/transcription-control.test.ts +100 -0
- package/src/actions/transcription-control.ts +127 -0
- package/src/adapters/capacitor-llama/__tests__/compat-behavior.test.ts +218 -0
- package/src/adapters/capacitor-llama/__tests__/index.test.ts +68 -0
- package/src/adapters/capacitor-llama/__tests__/structured-output.test.ts +215 -0
- package/src/adapters/capacitor-llama/__tests__/text-streaming.test.ts +174 -0
- package/src/adapters/capacitor-llama/environment.ts +71 -0
- package/src/adapters/capacitor-llama/index.browser.ts +83 -0
- package/src/adapters/capacitor-llama/index.ts +807 -0
- package/src/adapters/capacitor-llama/loader.ts +109 -0
- package/src/adapters/capacitor-llama/structured-output.ts +165 -0
- package/src/adapters/capacitor-llama/text-streaming.ts +227 -0
- package/src/adapters/capacitor-llama/types.ts +374 -0
- package/src/backends/apple-foundation.ts +127 -0
- package/src/index.d.ts +8 -0
- package/src/index.d.ts.map +1 -0
- package/src/index.ts +62 -0
- package/src/local-inference-routes.d.ts +38 -0
- package/src/local-inference-routes.d.ts.map +1 -0
- package/src/local-inference-routes.test.ts +344 -0
- package/src/local-inference-routes.ts +1543 -0
- package/src/provider.d.ts +21 -0
- package/src/provider.d.ts.map +1 -0
- package/src/provider.ts +1082 -0
- package/src/routes/compat-helpers.d.ts +18 -0
- package/src/routes/compat-helpers.d.ts.map +1 -0
- package/src/routes/compat-helpers.ts +274 -0
- package/src/routes/family-member-route.d.ts +62 -0
- package/src/routes/family-member-route.d.ts.map +1 -0
- package/src/routes/family-member-route.ts +353 -0
- package/src/routes/index.d.ts +19 -0
- package/src/routes/index.d.ts.map +1 -0
- package/src/routes/index.ts +60 -0
- package/src/routes/live-diarization-route.d.ts +26 -0
- package/src/routes/live-diarization-route.d.ts.map +1 -0
- package/src/routes/live-diarization-route.test.ts +213 -0
- package/src/routes/live-diarization-route.ts +122 -0
- package/src/routes/local-inference-asr-route.d.ts +4 -0
- package/src/routes/local-inference-asr-route.d.ts.map +1 -0
- package/src/routes/local-inference-asr-route.test.ts +205 -0
- package/src/routes/local-inference-asr-route.ts +163 -0
- package/src/routes/local-inference-asr-transcribe.d.ts +20 -0
- package/src/routes/local-inference-asr-transcribe.d.ts.map +1 -0
- package/src/routes/local-inference-asr-transcribe.test.ts +118 -0
- package/src/routes/local-inference-asr-transcribe.ts +97 -0
- package/src/routes/local-inference-compat-routes.d.ts +16 -0
- package/src/routes/local-inference-compat-routes.d.ts.map +1 -0
- package/src/routes/local-inference-compat-routes.test.ts +485 -0
- package/src/routes/local-inference-compat-routes.ts +808 -0
- package/src/routes/local-inference-tts-route.d.ts +7 -0
- package/src/routes/local-inference-tts-route.d.ts.map +1 -0
- package/src/routes/local-inference-tts-route.test.ts +179 -0
- package/src/routes/local-inference-tts-route.ts +230 -0
- package/src/routes/transcript-audio-store.d.ts +15 -0
- package/src/routes/transcript-audio-store.d.ts.map +1 -0
- package/src/routes/transcript-audio-store.ts +27 -0
- package/src/routes/transcripts-routes.d.ts +36 -0
- package/src/routes/transcripts-routes.d.ts.map +1 -0
- package/src/routes/transcripts-routes.test.ts +144 -0
- package/src/routes/transcripts-routes.ts +159 -0
- package/src/routes/voice-first-run-routes.d.ts +62 -0
- package/src/routes/voice-first-run-routes.d.ts.map +1 -0
- package/src/routes/voice-first-run-routes.ts +524 -0
- package/src/routes/voice-models-routes.d.ts +62 -0
- package/src/routes/voice-models-routes.d.ts.map +1 -0
- package/src/routes/voice-models-routes.ts +554 -0
- package/src/routes/voice-profile-plugin-routes.d.ts +19 -0
- package/src/routes/voice-profile-plugin-routes.d.ts.map +1 -0
- package/src/routes/voice-profile-plugin-routes.ts +138 -0
- package/src/routes/voice-profiles-management-routes.d.ts +52 -0
- package/src/routes/voice-profiles-management-routes.d.ts.map +1 -0
- package/src/routes/voice-profiles-management-routes.ts +476 -0
- package/src/routes/voice-speaker-profile-routes.d.ts +57 -0
- package/src/routes/voice-speaker-profile-routes.d.ts.map +1 -0
- package/src/routes/voice-speaker-profile-routes.ts +199 -0
- package/src/runtime/aosp-llama-loader-selection.test.ts +80 -0
- package/src/runtime/capacitor-llama.d.ts +25 -0
- package/src/runtime/embedding-manager-support.d.ts +77 -0
- package/src/runtime/embedding-manager-support.d.ts.map +1 -0
- package/src/runtime/embedding-manager-support.ts +497 -0
- package/src/runtime/embedding-presets.d.ts +16 -0
- package/src/runtime/embedding-presets.d.ts.map +1 -0
- package/src/runtime/embedding-presets.ts +81 -0
- package/src/runtime/embedding-warmup-policy.d.ts +14 -0
- package/src/runtime/embedding-warmup-policy.d.ts.map +1 -0
- package/src/runtime/embedding-warmup-policy.test.ts +53 -0
- package/src/runtime/embedding-warmup-policy.ts +48 -0
- package/src/runtime/ensure-local-inference-handler.d.ts +62 -0
- package/src/runtime/ensure-local-inference-handler.d.ts.map +1 -0
- package/src/runtime/ensure-local-inference-handler.test.ts +528 -0
- package/src/runtime/ensure-local-inference-handler.ts +1448 -0
- package/src/runtime/index.d.ts +15 -0
- package/src/runtime/index.d.ts.map +1 -0
- package/src/runtime/index.ts +33 -0
- package/src/runtime/mobile-local-inference-gate.d.ts +31 -0
- package/src/runtime/mobile-local-inference-gate.d.ts.map +1 -0
- package/src/runtime/mobile-local-inference-gate.test.ts +69 -0
- package/src/runtime/mobile-local-inference-gate.ts +44 -0
- package/src/runtime/voice-entity-binding.d.ts +103 -0
- package/src/runtime/voice-entity-binding.d.ts.map +1 -0
- package/src/runtime/voice-entity-binding.transcript.test.ts +69 -0
- package/src/runtime/voice-entity-binding.ts +328 -0
- package/src/services/README.md +71 -0
- package/src/services/__tests__/backend-selector.test.ts +101 -0
- package/src/services/__tests__/checkpoint-manager.test.ts +376 -0
- package/src/services/__tests__/gpu-autotune.test.ts +400 -0
- package/src/services/__tests__/llm-streaming-binding.test.ts +85 -0
- package/src/services/__tests__/planner-grammar.test.ts +372 -0
- package/src/services/__tests__/runtime-target.test.ts +176 -0
- package/src/services/active-model-switch-rollback.test.ts +183 -0
- package/src/services/active-model.d.ts +282 -0
- package/src/services/active-model.d.ts.map +1 -0
- package/src/services/active-model.ts +1213 -0
- package/src/services/assignments.d.ts +71 -0
- package/src/services/assignments.d.ts.map +1 -0
- package/src/services/assignments.test.ts +80 -0
- package/src/services/assignments.ts +230 -0
- package/src/services/backend-selector.ts +95 -0
- package/src/services/backend.d.ts +346 -0
- package/src/services/backend.d.ts.map +1 -0
- package/src/services/backend.ts +612 -0
- package/src/services/bionic-host-loader.d.ts +46 -0
- package/src/services/bionic-host-loader.d.ts.map +1 -0
- package/src/services/bionic-host-loader.test.ts +133 -0
- package/src/services/bionic-host-loader.ts +180 -0
- package/src/services/bundled-models.d.ts +34 -0
- package/src/services/bundled-models.d.ts.map +1 -0
- package/src/services/bundled-models.ts +129 -0
- package/src/services/cache-bridge.d.ts +206 -0
- package/src/services/cache-bridge.d.ts.map +1 -0
- package/src/services/cache-bridge.test.ts +516 -0
- package/src/services/cache-bridge.ts +423 -0
- package/src/services/catalog.d.ts +10 -0
- package/src/services/catalog.d.ts.map +1 -0
- package/src/services/catalog.test.ts +238 -0
- package/src/services/catalog.ts +27 -0
- package/src/services/checkpoint-client.d.ts +109 -0
- package/src/services/checkpoint-client.d.ts.map +1 -0
- package/src/services/checkpoint-client.ts +258 -0
- package/src/services/checkpoint-manager.ts +474 -0
- package/src/services/cloud-fallback.d.ts +102 -0
- package/src/services/cloud-fallback.d.ts.map +1 -0
- package/src/services/cloud-fallback.ts +230 -0
- package/src/services/conversation-registry.d.ts +142 -0
- package/src/services/conversation-registry.d.ts.map +1 -0
- package/src/services/conversation-registry.test.ts +235 -0
- package/src/services/conversation-registry.ts +264 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts +95 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts.map +1 -0
- package/src/services/desktop-fused-ffi-backend-runtime.ts +339 -0
- package/src/services/device-bridge.d.ts +188 -0
- package/src/services/device-bridge.d.ts.map +1 -0
- package/src/services/device-bridge.ts +1237 -0
- package/src/services/device-resource-metrics.d.ts +149 -0
- package/src/services/device-resource-metrics.d.ts.map +1 -0
- package/src/services/device-resource-metrics.test.ts +98 -0
- package/src/services/device-resource-metrics.ts +346 -0
- package/src/services/device-tier.d.ts +115 -0
- package/src/services/device-tier.d.ts.map +1 -0
- package/src/services/device-tier.test.ts +371 -0
- package/src/services/device-tier.ts +410 -0
- package/src/services/downloader.d.ts +82 -0
- package/src/services/downloader.d.ts.map +1 -0
- package/src/services/downloader.test.ts +747 -0
- package/src/services/downloader.ts +925 -0
- package/src/services/engine-direct-bundle.test.ts +58 -0
- package/src/services/engine-streaming.test.ts +80 -0
- package/src/services/engine.d.ts +540 -0
- package/src/services/engine.d.ts.map +1 -0
- package/src/services/engine.ts +1909 -0
- package/src/services/ensure-local-artifacts.integration.test.ts +273 -0
- package/src/services/ensure-local-artifacts.test.ts +368 -0
- package/src/services/ensure-local-artifacts.ts +351 -0
- package/src/services/external-scanner.d.ts +17 -0
- package/src/services/external-scanner.d.ts.map +1 -0
- package/src/services/external-scanner.ts +312 -0
- package/src/services/ffi-llm-mock.ts +354 -0
- package/src/services/ffi-llm-streaming-abi.ts +442 -0
- package/src/services/ffi-streaming-backend.d.ts +180 -0
- package/src/services/ffi-streaming-backend.d.ts.map +1 -0
- package/src/services/ffi-streaming-backend.ts +382 -0
- package/src/services/ffi-streaming-runner.d.ts +122 -0
- package/src/services/ffi-streaming-runner.d.ts.map +1 -0
- package/src/services/ffi-streaming-runner.test.ts +60 -0
- package/src/services/ffi-streaming-runner.ts +354 -0
- package/src/services/ffi-unload-ordering.test.ts +162 -0
- package/src/services/gpu-autotune.ts +534 -0
- package/src/services/gpu-detect.d.ts +56 -0
- package/src/services/gpu-detect.d.ts.map +1 -0
- package/src/services/gpu-detect.ts +139 -0
- package/src/services/handler-registry.d.ts +72 -0
- package/src/services/handler-registry.d.ts.map +1 -0
- package/src/services/handler-registry.ts +240 -0
- package/src/services/hardware.d.ts +63 -0
- package/src/services/hardware.d.ts.map +1 -0
- package/src/services/hardware.test.ts +231 -0
- package/src/services/hardware.ts +410 -0
- package/src/services/hf-search.d.ts +26 -0
- package/src/services/hf-search.d.ts.map +1 -0
- package/src/services/hf-search.test.ts +69 -0
- package/src/services/hf-search.ts +420 -0
- package/src/services/image-description-runtime.d.ts +14 -0
- package/src/services/image-description-runtime.d.ts.map +1 -0
- package/src/services/image-description-runtime.test.ts +61 -0
- package/src/services/image-description-runtime.ts +118 -0
- package/src/services/imagegen/aosp-unavailable.d.ts +134 -0
- package/src/services/imagegen/aosp-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/aosp-unavailable.ts +229 -0
- package/src/services/imagegen/backend-selector.d.ts +118 -0
- package/src/services/imagegen/backend-selector.d.ts.map +1 -0
- package/src/services/imagegen/backend-selector.ts +277 -0
- package/src/services/imagegen/coreml-unavailable.d.ts +105 -0
- package/src/services/imagegen/coreml-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/coreml-unavailable.ts +237 -0
- package/src/services/imagegen/errors.d.ts +16 -0
- package/src/services/imagegen/errors.d.ts.map +1 -0
- package/src/services/imagegen/errors.ts +40 -0
- package/src/services/imagegen/index.d.ts +58 -0
- package/src/services/imagegen/index.d.ts.map +1 -0
- package/src/services/imagegen/index.ts +144 -0
- package/src/services/imagegen/mflux.d.ts +74 -0
- package/src/services/imagegen/mflux.d.ts.map +1 -0
- package/src/services/imagegen/mflux.ts +313 -0
- package/src/services/imagegen/sd-cpp.d.ts +180 -0
- package/src/services/imagegen/sd-cpp.d.ts.map +1 -0
- package/src/services/imagegen/sd-cpp.ts +718 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts +83 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/tensorrt-unavailable.ts +295 -0
- package/src/services/imagegen/types.d.ts +181 -0
- package/src/services/imagegen/types.d.ts.map +1 -0
- package/src/services/imagegen/types.ts +193 -0
- package/src/services/index.d.ts +29 -0
- package/src/services/index.d.ts.map +1 -0
- package/src/services/index.ts +211 -0
- package/src/services/inference-capabilities.d.ts +132 -0
- package/src/services/inference-capabilities.d.ts.map +1 -0
- package/src/services/inference-capabilities.test.ts +75 -0
- package/src/services/inference-capabilities.ts +204 -0
- package/src/services/inference-telemetry.d.ts +59 -0
- package/src/services/inference-telemetry.d.ts.map +1 -0
- package/src/services/inference-telemetry.ts +143 -0
- package/src/services/ios-llama-streaming.ts +248 -0
- package/src/services/kv-spill.d.ts +189 -0
- package/src/services/kv-spill.d.ts.map +1 -0
- package/src/services/kv-spill.test.ts +222 -0
- package/src/services/kv-spill.ts +356 -0
- package/src/services/latency-trace.d.ts +346 -0
- package/src/services/latency-trace.d.ts.map +1 -0
- package/src/services/latency-trace.test.ts +266 -0
- package/src/services/latency-trace.ts +844 -0
- package/src/services/llama-server-metrics.ts +304 -0
- package/src/services/llm-streaming-binding.d.ts +96 -0
- package/src/services/llm-streaming-binding.d.ts.map +1 -0
- package/src/services/llm-streaming-binding.ts +136 -0
- package/src/services/load-args.d.ts +82 -0
- package/src/services/load-args.d.ts.map +1 -0
- package/src/services/load-args.ts +81 -0
- package/src/services/manifest/eliza-1.manifest.v1.json +708 -0
- package/src/services/manifest/index.d.ts +4 -0
- package/src/services/manifest/index.d.ts.map +1 -0
- package/src/services/manifest/index.ts +66 -0
- package/src/services/manifest/manifest.test.ts +689 -0
- package/src/services/manifest/schema.d.ts +713 -0
- package/src/services/manifest/schema.d.ts.map +1 -0
- package/src/services/manifest/schema.ts +653 -0
- package/src/services/manifest/types.d.ts +30 -0
- package/src/services/manifest/types.d.ts.map +1 -0
- package/src/services/manifest/types.ts +55 -0
- package/src/services/manifest/validator.d.ts +66 -0
- package/src/services/manifest/validator.d.ts.map +1 -0
- package/src/services/manifest/validator.ts +567 -0
- package/src/services/memory-arbiter.d.ts +318 -0
- package/src/services/memory-arbiter.d.ts.map +1 -0
- package/src/services/memory-arbiter.test.ts +419 -0
- package/src/services/memory-arbiter.ts +925 -0
- package/src/services/memory-monitor.d.ts +122 -0
- package/src/services/memory-monitor.d.ts.map +1 -0
- package/src/services/memory-monitor.test.ts +208 -0
- package/src/services/memory-monitor.ts +297 -0
- package/src/services/memory-pressure.d.ts +130 -0
- package/src/services/memory-pressure.d.ts.map +1 -0
- package/src/services/memory-pressure.ts +414 -0
- package/src/services/mtp-doctor.d.ts +13 -0
- package/src/services/mtp-doctor.d.ts.map +1 -0
- package/src/services/mtp-doctor.ts +78 -0
- package/src/services/network-policy.d.ts +127 -0
- package/src/services/network-policy.d.ts.map +1 -0
- package/src/services/network-policy.ts +346 -0
- package/src/services/paths.d.ts +6 -0
- package/src/services/paths.d.ts.map +1 -0
- package/src/services/paths.ts +25 -0
- package/src/services/planner-skeleton.d.ts +124 -0
- package/src/services/planner-skeleton.d.ts.map +1 -0
- package/src/services/planner-skeleton.ts +175 -0
- package/src/services/providers.d.ts +38 -0
- package/src/services/providers.d.ts.map +1 -0
- package/src/services/providers.ts +507 -0
- package/src/services/ram-budget-cache.test.ts +163 -0
- package/src/services/ram-budget.d.ts +110 -0
- package/src/services/ram-budget.d.ts.map +1 -0
- package/src/services/ram-budget.ts +0 -0
- package/src/services/readiness.d.ts +9 -0
- package/src/services/readiness.d.ts.map +1 -0
- package/src/services/readiness.test.ts +87 -0
- package/src/services/readiness.ts +238 -0
- package/src/services/recommendation.d.ts +111 -0
- package/src/services/recommendation.d.ts.map +1 -0
- package/src/services/recommendation.ts +671 -0
- package/src/services/registry.d.ts +35 -0
- package/src/services/registry.d.ts.map +1 -0
- package/src/services/registry.ts +151 -0
- package/src/services/router-handler.d.ts +92 -0
- package/src/services/router-handler.d.ts.map +1 -0
- package/src/services/router-handler.test.ts +45 -0
- package/src/services/router-handler.ts +407 -0
- package/src/services/routing-policy.d.ts +69 -0
- package/src/services/routing-policy.d.ts.map +1 -0
- package/src/services/routing-policy.test.ts +164 -0
- package/src/services/routing-policy.ts +297 -0
- package/src/services/routing-preferences.d.ts +8 -0
- package/src/services/routing-preferences.d.ts.map +1 -0
- package/src/services/routing-preferences.ts +17 -0
- package/src/services/runtime-target.d.ts +98 -0
- package/src/services/runtime-target.d.ts.map +1 -0
- package/src/services/runtime-target.ts +154 -0
- package/src/services/service.d.ts +128 -0
- package/src/services/service.d.ts.map +1 -0
- package/src/services/service.test.ts +223 -0
- package/src/services/service.ts +735 -0
- package/src/services/session-pool.d.ts +72 -0
- package/src/services/session-pool.d.ts.map +1 -0
- package/src/services/session-pool.ts +153 -0
- package/src/services/structured-output/deterministic-repair.d.ts +23 -0
- package/src/services/structured-output/deterministic-repair.d.ts.map +1 -0
- package/src/services/structured-output/deterministic-repair.test.ts +169 -0
- package/src/services/structured-output/deterministic-repair.ts +443 -0
- package/src/services/structured-output/index.ts +4 -0
- package/src/services/structured-output.d.ts +311 -0
- package/src/services/structured-output.d.ts.map +1 -0
- package/src/services/structured-output.test.ts +483 -0
- package/src/services/structured-output.ts +712 -0
- package/src/services/system-memory.d.ts +33 -0
- package/src/services/system-memory.d.ts.map +1 -0
- package/src/services/system-memory.test.ts +47 -0
- package/src/services/system-memory.ts +67 -0
- package/src/services/transcription-priority.test.ts +211 -0
- package/src/services/types.d.ts +19 -0
- package/src/services/types.d.ts.map +1 -0
- package/src/services/types.ts +55 -0
- package/src/services/verify-on-device.d.ts +34 -0
- package/src/services/verify-on-device.d.ts.map +1 -0
- package/src/services/verify-on-device.test.ts +87 -0
- package/src/services/verify-on-device.ts +127 -0
- package/src/services/verify.d.ts +8 -0
- package/src/services/verify.d.ts.map +1 -0
- package/src/services/verify.ts +13 -0
- package/src/services/vision/aosp-unavailable.d.ts +115 -0
- package/src/services/vision/aosp-unavailable.d.ts.map +1 -0
- package/src/services/vision/aosp-unavailable.ts +163 -0
- package/src/services/vision/capacitor-llama.d.ts +99 -0
- package/src/services/vision/capacitor-llama.d.ts.map +1 -0
- package/src/services/vision/capacitor-llama.ts +255 -0
- package/src/services/vision/cloud-fallback.d.ts +47 -0
- package/src/services/vision/cloud-fallback.d.ts.map +1 -0
- package/src/services/vision/cloud-fallback.test.ts +243 -0
- package/src/services/vision/cloud-fallback.ts +268 -0
- package/src/services/vision/fallback-chain.test.ts +86 -0
- package/src/services/vision/hash.d.ts +71 -0
- package/src/services/vision/hash.d.ts.map +1 -0
- package/src/services/vision/hash.ts +157 -0
- package/src/services/vision/index.d.ts +95 -0
- package/src/services/vision/index.d.ts.map +1 -0
- package/src/services/vision/index.ts +251 -0
- package/src/services/vision/llama-server.d.ts +73 -0
- package/src/services/vision/llama-server.d.ts.map +1 -0
- package/src/services/vision/llama-server.ts +177 -0
- package/src/services/vision/types.d.ts +153 -0
- package/src/services/vision/types.d.ts.map +1 -0
- package/src/services/vision/types.ts +154 -0
- package/src/services/vision/vast-fallback.d.ts +18 -0
- package/src/services/vision/vast-fallback.d.ts.map +1 -0
- package/src/services/vision/vast-fallback.ts +127 -0
- package/src/services/vision-embedding-cache.d.ts +98 -0
- package/src/services/vision-embedding-cache.d.ts.map +1 -0
- package/src/services/vision-embedding-cache.ts +189 -0
- package/src/services/voice/VOICE_WORKBENCH.md +88 -0
- package/src/services/voice/__test-helpers__/fake-ffi.ts +94 -0
- package/src/services/voice/__test-helpers__/synthetic-speech.ts +124 -0
- package/src/services/voice/__tests__/checkpoint-manager.test.ts +241 -0
- package/src/services/voice/__tests__/checkpoint-policy.test.ts +270 -0
- package/src/services/voice/__tests__/eager-context-builder.test.ts +257 -0
- package/src/services/voice/__tests__/eliza1-eot-scorer.test.ts +288 -0
- package/src/services/voice/__tests__/eot-classifier.test.ts +431 -0
- package/src/services/voice/__tests__/optimistic-rollback.test.ts +312 -0
- package/src/services/voice/__tests__/prefill-client.test.ts +266 -0
- package/src/services/voice/__tests__/prefix-preserving-queue.test.ts +208 -0
- package/src/services/voice/__tests__/streaming-asr.test.ts +450 -0
- package/src/services/voice/__tests__/streaming-transcriber.test.ts +339 -0
- package/src/services/voice/__tests__/turn-detector-resolver.test.ts +195 -0
- package/src/services/voice/__tests__/voice-state-machine-prefill.test.ts +275 -0
- package/src/services/voice/__tests__/voice-state-machine.test.ts +354 -0
- package/src/services/voice/asr-timed.real.test.ts +141 -0
- package/src/services/voice/audio-frame-consumer.d.ts +212 -0
- package/src/services/voice/audio-frame-consumer.d.ts.map +1 -0
- package/src/services/voice/audio-frame-consumer.test.ts +343 -0
- package/src/services/voice/audio-frame-consumer.ts +491 -0
- package/src/services/voice/barge-in.d.ts +112 -0
- package/src/services/voice/barge-in.d.ts.map +1 -0
- package/src/services/voice/barge-in.test.ts +244 -0
- package/src/services/voice/barge-in.ts +336 -0
- package/src/services/voice/cancellation-coordinator.d.ts +127 -0
- package/src/services/voice/cancellation-coordinator.d.ts.map +1 -0
- package/src/services/voice/cancellation-coordinator.test.ts +196 -0
- package/src/services/voice/cancellation-coordinator.ts +269 -0
- package/src/services/voice/checkpoint-manager.d.ts +199 -0
- package/src/services/voice/checkpoint-manager.d.ts.map +1 -0
- package/src/services/voice/checkpoint-manager.ts +401 -0
- package/src/services/voice/checkpoint-policy.ts +336 -0
- package/src/services/voice/composite-eot-classifier.test.ts +59 -0
- package/src/services/voice/e2e-harness.test.ts +182 -0
- package/src/services/voice/e2e-harness.ts +743 -0
- package/src/services/voice/eager-context-builder.d.ts +170 -0
- package/src/services/voice/eager-context-builder.d.ts.map +1 -0
- package/src/services/voice/eager-context-builder.ts +262 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts +124 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/eliza1-eot-scorer.ts +242 -0
- package/src/services/voice/embedding-server.ts +200 -0
- package/src/services/voice/embedding.d.ts +133 -0
- package/src/services/voice/embedding.d.ts.map +1 -0
- package/src/services/voice/embedding.test.ts +131 -0
- package/src/services/voice/embedding.ts +243 -0
- package/src/services/voice/emotion-attribution.d.ts +68 -0
- package/src/services/voice/emotion-attribution.d.ts.map +1 -0
- package/src/services/voice/emotion-attribution.test.ts +129 -0
- package/src/services/voice/emotion-attribution.ts +361 -0
- package/src/services/voice/engine-bridge-cancellation.test.ts +422 -0
- package/src/services/voice/engine-bridge.d.ts +759 -0
- package/src/services/voice/engine-bridge.d.ts.map +1 -0
- package/src/services/voice/engine-bridge.test.ts +384 -0
- package/src/services/voice/engine-bridge.ts +2302 -0
- package/src/services/voice/eot-classifier-ggml.d.ts +179 -0
- package/src/services/voice/eot-classifier-ggml.d.ts.map +1 -0
- package/src/services/voice/eot-classifier-ggml.ts +566 -0
- package/src/services/voice/eot-classifier.d.ts +214 -0
- package/src/services/voice/eot-classifier.d.ts.map +1 -0
- package/src/services/voice/eot-classifier.ts +533 -0
- package/src/services/voice/errors.d.ts +20 -0
- package/src/services/voice/errors.d.ts.map +1 -0
- package/src/services/voice/errors.ts +32 -0
- package/src/services/voice/expressive-tags.d.ts +158 -0
- package/src/services/voice/expressive-tags.d.ts.map +1 -0
- package/src/services/voice/expressive-tags.ts +405 -0
- package/src/services/voice/ffi-bindings.d.ts +674 -0
- package/src/services/voice/ffi-bindings.d.ts.map +1 -0
- package/src/services/voice/ffi-bindings.test.ts +728 -0
- package/src/services/voice/ffi-bindings.ts +3225 -0
- package/src/services/voice/first-line-cache.d.ts +181 -0
- package/src/services/voice/first-line-cache.d.ts.map +1 -0
- package/src/services/voice/first-line-cache.ts +725 -0
- package/src/services/voice/fused-eot-scorer.d.ts +51 -0
- package/src/services/voice/fused-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/fused-eot-scorer.ts +135 -0
- package/src/services/voice/index.d.ts +91 -0
- package/src/services/voice/index.d.ts.map +1 -0
- package/src/services/voice/index.ts +481 -0
- package/src/services/voice/kokoro/__tests__/kokoro-backend.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.real.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.test.ts +60 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-discovery.test.ts +277 -0
- package/src/services/voice/kokoro/__tests__/kokoro-ffi-runtime.test.ts +235 -0
- package/src/services/voice/kokoro/__tests__/kokoro-runtime.test.ts +95 -0
- package/src/services/voice/kokoro/__tests__/phonemizer.test.ts +53 -0
- package/src/services/voice/kokoro/__tests__/runtime-selection.test.ts +231 -0
- package/src/services/voice/kokoro/__tests__/voices.test.ts +57 -0
- package/src/services/voice/kokoro/index.ts +79 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts +72 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-backend.ts +207 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts +58 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.ts +177 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts +75 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.ts +233 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts +100 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-runtime.ts +170 -0
- package/src/services/voice/kokoro/phoneme-stream.ts +123 -0
- package/src/services/voice/kokoro/phonemizer.d.ts +50 -0
- package/src/services/voice/kokoro/phonemizer.d.ts.map +1 -0
- package/src/services/voice/kokoro/phonemizer.ts +344 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts +61 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/pick-runtime.test.ts +91 -0
- package/src/services/voice/kokoro/pick-runtime.ts +130 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts +92 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts.map +1 -0
- package/src/services/voice/kokoro/runtime-selection.ts +237 -0
- package/src/services/voice/kokoro/types.d.ts +82 -0
- package/src/services/voice/kokoro/types.d.ts.map +1 -0
- package/src/services/voice/kokoro/types.ts +95 -0
- package/src/services/voice/kokoro/voice-presets.d.ts +23 -0
- package/src/services/voice/kokoro/voice-presets.d.ts.map +1 -0
- package/src/services/voice/kokoro/voice-presets.ts +129 -0
- package/src/services/voice/kokoro/voices.d.ts +30 -0
- package/src/services/voice/kokoro/voices.d.ts.map +1 -0
- package/src/services/voice/kokoro/voices.ts +64 -0
- package/src/services/voice/lifecycle.d.ts +135 -0
- package/src/services/voice/lifecycle.d.ts.map +1 -0
- package/src/services/voice/lifecycle.test.ts +315 -0
- package/src/services/voice/lifecycle.ts +301 -0
- package/src/services/voice/live-diarization-session.d.ts +96 -0
- package/src/services/voice/live-diarization-session.d.ts.map +1 -0
- package/src/services/voice/live-diarization-session.ts +289 -0
- package/src/services/voice/mic-source.d.ts +136 -0
- package/src/services/voice/mic-source.d.ts.map +1 -0
- package/src/services/voice/mic-source.test.ts +210 -0
- package/src/services/voice/mic-source.ts +503 -0
- package/src/services/voice/optimistic-policy.d.ts +109 -0
- package/src/services/voice/optimistic-policy.d.ts.map +1 -0
- package/src/services/voice/optimistic-policy.test.ts +101 -0
- package/src/services/voice/optimistic-policy.ts +192 -0
- package/src/services/voice/optimistic-rollback.ts +343 -0
- package/src/services/voice/partial-stabilizer.d.ts +73 -0
- package/src/services/voice/partial-stabilizer.d.ts.map +1 -0
- package/src/services/voice/partial-stabilizer.test.ts +68 -0
- package/src/services/voice/partial-stabilizer.ts +140 -0
- package/src/services/voice/phoneme-tokenizer.d.ts +49 -0
- package/src/services/voice/phoneme-tokenizer.d.ts.map +1 -0
- package/src/services/voice/phoneme-tokenizer.ts +158 -0
- package/src/services/voice/phrase-cache.d.ts +76 -0
- package/src/services/voice/phrase-cache.d.ts.map +1 -0
- package/src/services/voice/phrase-cache.test.ts +242 -0
- package/src/services/voice/phrase-cache.ts +186 -0
- package/src/services/voice/phrase-chunker.d.ts +62 -0
- package/src/services/voice/phrase-chunker.d.ts.map +1 -0
- package/src/services/voice/phrase-chunker.test.ts +239 -0
- package/src/services/voice/phrase-chunker.ts +281 -0
- package/src/services/voice/pipeline-impls.d.ts +151 -0
- package/src/services/voice/pipeline-impls.d.ts.map +1 -0
- package/src/services/voice/pipeline-impls.l6.test.ts +110 -0
- package/src/services/voice/pipeline-impls.test.ts +292 -0
- package/src/services/voice/pipeline-impls.ts +315 -0
- package/src/services/voice/pipeline.d.ts +216 -0
- package/src/services/voice/pipeline.d.ts.map +1 -0
- package/src/services/voice/pipeline.ts +505 -0
- package/src/services/voice/prefill-client.d.ts +123 -0
- package/src/services/voice/prefill-client.d.ts.map +1 -0
- package/src/services/voice/prefill-client.ts +316 -0
- package/src/services/voice/prefix-preserving-queue.d.ts +113 -0
- package/src/services/voice/prefix-preserving-queue.d.ts.map +1 -0
- package/src/services/voice/prefix-preserving-queue.ts +162 -0
- package/src/services/voice/profile-store.d.ts +248 -0
- package/src/services/voice/profile-store.d.ts.map +1 -0
- package/src/services/voice/profile-store.ts +887 -0
- package/src/services/voice/real-audio-decode.test.ts +148 -0
- package/src/services/voice/ring-buffer.d.ts +40 -0
- package/src/services/voice/ring-buffer.d.ts.map +1 -0
- package/src/services/voice/ring-buffer.test.ts +129 -0
- package/src/services/voice/ring-buffer.ts +123 -0
- package/src/services/voice/rollback-queue.d.ts +24 -0
- package/src/services/voice/rollback-queue.d.ts.map +1 -0
- package/src/services/voice/rollback-queue.ts +74 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts +67 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-placeholder.test.ts +97 -0
- package/src/services/voice/samantha-preset-placeholder.ts +148 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts +87 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-regenerator.ts +393 -0
- package/src/services/voice/scheduler.d.ts +146 -0
- package/src/services/voice/scheduler.d.ts.map +1 -0
- package/src/services/voice/scheduler.t2.test.ts +141 -0
- package/src/services/voice/scheduler.ts +927 -0
- package/src/services/voice/shared-resources.d.ts +190 -0
- package/src/services/voice/shared-resources.d.ts.map +1 -0
- package/src/services/voice/shared-resources.ts +320 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts +74 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts.map +1 -0
- package/src/services/voice/speaker/attribution-pipeline.ts +386 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts +59 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer-fused.real.test.ts +100 -0
- package/src/services/voice/speaker/diarizer-fused.ts +154 -0
- package/src/services/voice/speaker/diarizer.d.ts +75 -0
- package/src/services/voice/speaker/diarizer.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer.ts +218 -0
- package/src/services/voice/speaker/encoder-fused.d.ts +60 -0
- package/src/services/voice/speaker/encoder-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-fused.real.test.ts +113 -0
- package/src/services/voice/speaker/encoder-fused.ts +138 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts +33 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-ggml.ts +79 -0
- package/src/services/voice/speaker/encoder.d.ts +37 -0
- package/src/services/voice/speaker/encoder.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder.ts +105 -0
- package/src/services/voice/speaker-imprint.d.ts +83 -0
- package/src/services/voice/speaker-imprint.d.ts.map +1 -0
- package/src/services/voice/speaker-imprint.test.ts +185 -0
- package/src/services/voice/speaker-imprint.ts +312 -0
- package/src/services/voice/speaker-preset-cache.d.ts +77 -0
- package/src/services/voice/speaker-preset-cache.d.ts.map +1 -0
- package/src/services/voice/speaker-preset-cache.test.ts +154 -0
- package/src/services/voice/speaker-preset-cache.ts +195 -0
- package/src/services/voice/streaming-asr/streaming-pipeline-adapter.ts +292 -0
- package/src/services/voice/system-audio-sink.d.ts +73 -0
- package/src/services/voice/system-audio-sink.d.ts.map +1 -0
- package/src/services/voice/system-audio-sink.test.ts +29 -0
- package/src/services/voice/system-audio-sink.ts +366 -0
- package/src/services/voice/transcriber.d.ts +244 -0
- package/src/services/voice/transcriber.d.ts.map +1 -0
- package/src/services/voice/transcriber.test.ts +392 -0
- package/src/services/voice/transcriber.ts +704 -0
- package/src/services/voice/transcript-knowledge.d.ts +37 -0
- package/src/services/voice/transcript-knowledge.d.ts.map +1 -0
- package/src/services/voice/transcript-knowledge.test.ts +68 -0
- package/src/services/voice/transcript-knowledge.ts +75 -0
- package/src/services/voice/transcript-service.d.ts +41 -0
- package/src/services/voice/transcript-service.d.ts.map +1 -0
- package/src/services/voice/transcript-service.test.ts +137 -0
- package/src/services/voice/transcript-service.ts +141 -0
- package/src/services/voice/transcript-store.d.ts +53 -0
- package/src/services/voice/transcript-store.d.ts.map +1 -0
- package/src/services/voice/transcript-store.test.ts +153 -0
- package/src/services/voice/transcript-store.ts +132 -0
- package/src/services/voice/turn-controller.d.ts +183 -0
- package/src/services/voice/turn-controller.d.ts.map +1 -0
- package/src/services/voice/turn-controller.test.ts +575 -0
- package/src/services/voice/turn-controller.ts +596 -0
- package/src/services/voice/types.d.ts +643 -0
- package/src/services/voice/types.d.ts.map +1 -0
- package/src/services/voice/types.ts +699 -0
- package/src/services/voice/vad.d.ts +282 -0
- package/src/services/voice/vad.d.ts.map +1 -0
- package/src/services/voice/vad.test.ts +480 -0
- package/src/services/voice/vad.ts +827 -0
- package/src/services/voice/vad.v1-v4.test.ts +222 -0
- package/src/services/voice/voice-budget.d.ts +241 -0
- package/src/services/voice/voice-budget.d.ts.map +1 -0
- package/src/services/voice/voice-budget.test.ts +418 -0
- package/src/services/voice/voice-budget.ts +635 -0
- package/src/services/voice/voice-duet.test.ts +375 -0
- package/src/services/voice/voice-emotion-classifier.d.ts +95 -0
- package/src/services/voice/voice-emotion-classifier.d.ts.map +1 -0
- package/src/services/voice/voice-emotion-classifier.test.ts +210 -0
- package/src/services/voice/voice-emotion-classifier.ts +273 -0
- package/src/services/voice/voice-preset-format.d.ts +158 -0
- package/src/services/voice/voice-preset-format.d.ts.map +1 -0
- package/src/services/voice/voice-preset-format.ts +700 -0
- package/src/services/voice/voice-preset-generator.test.ts +89 -0
- package/src/services/voice/voice-profile-artifact.d.ts +116 -0
- package/src/services/voice/voice-profile-artifact.d.ts.map +1 -0
- package/src/services/voice/voice-profile-artifact.test.ts +138 -0
- package/src/services/voice/voice-profile-artifact.ts +518 -0
- package/src/services/voice/voice-profile-routes.d.ts +83 -0
- package/src/services/voice/voice-profile-routes.d.ts.map +1 -0
- package/src/services/voice/voice-profile-routes.test.ts +429 -0
- package/src/services/voice/voice-profile-routes.ts +425 -0
- package/src/services/voice/voice-scenario.ts +154 -0
- package/src/services/voice/voice-settings.d.ts +82 -0
- package/src/services/voice/voice-settings.d.ts.map +1 -0
- package/src/services/voice/voice-settings.ts +172 -0
- package/src/services/voice/voice-state-machine.d.ts +364 -0
- package/src/services/voice/voice-state-machine.d.ts.map +1 -0
- package/src/services/voice/voice-state-machine.ts +727 -0
- package/src/services/voice/voice-workbench-report.test.ts +168 -0
- package/src/services/voice/voice-workbench-report.ts +326 -0
- package/src/services/voice/voice-workbench.test.ts +158 -0
- package/src/services/voice/voice.test.ts +1070 -0
- package/src/services/voice/wake-word-ggml.d.ts +101 -0
- package/src/services/voice/wake-word-ggml.d.ts.map +1 -0
- package/src/services/voice/wake-word-ggml.ts +320 -0
- package/src/services/voice/wake-word.d.ts +255 -0
- package/src/services/voice/wake-word.d.ts.map +1 -0
- package/src/services/voice/wake-word.test.ts +298 -0
- package/src/services/voice/wake-word.ts +554 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts +70 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts.map +1 -0
- package/src/services/voice/wrap-with-first-line-cache.ts +267 -0
- package/src/services/voice-model-updater.d.ts +240 -0
- package/src/services/voice-model-updater.d.ts.map +1 -0
- package/src/services/voice-model-updater.ts +724 -0
- package/src/services/voice-prewarm.d.ts +3 -0
- package/src/services/voice-prewarm.d.ts.map +1 -0
- package/src/services/voice-prewarm.ts +51 -0
- package/dist/index.d.ts +0 -37
- package/dist/index.js +0 -1098
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Expressive (emotion / singing) inline-tag handling for the voice path.
|
|
3
|
+
*
|
|
4
|
+
* The canonical schema is the **omnivoice-singing inline-tag vocabulary,
|
|
5
|
+
* verbatim** (no SSML, no new format — it is exactly what the fine-tuned
|
|
6
|
+
* `omnivoice-singing` GGUF understands when the tags appear inline in the
|
|
7
|
+
* text passed to `eliza_inference_tts_synthesize` / `/v1/audio/speech`):
|
|
8
|
+
*
|
|
9
|
+
* emotion tags [happy] [sad] [angry] [nervous] [calm] [excited] [whisper]
|
|
10
|
+
* singing tag [singing]
|
|
11
|
+
* preserved [laughter] [sigh] (non-verbals — passed through, not
|
|
12
|
+
* consumed as a scope-setting tag)
|
|
13
|
+
*
|
|
14
|
+
* Tags are *inline* and *scoped*: a tag applies from where it appears to the
|
|
15
|
+
* next tag (or end of text). Mid-sentence shifts are allowed
|
|
16
|
+
* (`"that's [excited] amazing"`). `parseExpressiveTags` segments the text by
|
|
17
|
+
* tag boundaries so the chunker / TTS backend can carry the in-scope emotion
|
|
18
|
+
* with each phrase.
|
|
19
|
+
*
|
|
20
|
+
* Two consumers:
|
|
21
|
+
* - the *singing* TTS GGUF: it parses the tags itself, so the bridge passes
|
|
22
|
+
* the tag-bearing text through (`segment.text` keeps `[happy]` etc. — see
|
|
23
|
+
* `makeTextToSpeechHandler`, which deliberately does NOT strip them).
|
|
24
|
+
* - a *base* TTS GGUF (no `emotion-tags` / `singing` capability): the tags
|
|
25
|
+
* would otherwise be spoken literally, so `stripExpressiveTags` removes
|
|
26
|
+
* them before synthesis. (The model shouldn't have emitted them when the
|
|
27
|
+
* bundle lacks the capability — but defense in depth.)
|
|
28
|
+
*
|
|
29
|
+
* Coordinates with:
|
|
30
|
+
* - `manifest/schema.ts` → `ELIZA_1_VOICE_CAPABILITIES` (`emotion-tags` /
|
|
31
|
+
* `singing` gate whether the prompt instructs the model to emit tags).
|
|
32
|
+
* - WS-4's Stage-1 envelope `emotion` enum field (one-line field-evaluator
|
|
33
|
+
* registration there) — `EXPRESSIVE_EMOTION_ENUM` is the shared value set.
|
|
34
|
+
* - WS-3's `voice_emotion` fine-tune corpus task — both forms (inline tags +
|
|
35
|
+
* the `emotion` field) are populated on voice-mode rows.
|
|
36
|
+
*/
|
|
37
|
+
/** Emotion tags that set the affect scope for the text that follows. */
|
|
38
|
+
export declare const EXPRESSIVE_EMOTION_TAGS: readonly ["happy", "sad", "angry", "nervous", "calm", "excited", "whisper"];
|
|
39
|
+
export type ExpressiveEmotion = (typeof EXPRESSIVE_EMOTION_TAGS)[number];
|
|
40
|
+
/**
|
|
41
|
+
* The Stage-1 envelope `emotion` enum value set (decision #2 in
|
|
42
|
+
* `.swarm/IMPLEMENTATION_PLAN.md` §1 — the optional field-evaluator that WS-4 wires
|
|
43
|
+
* registers exactly this enum). `none` is the default / "no expressive cue"
|
|
44
|
+
* value so the field is always present and structured-decode can singleton-fill
|
|
45
|
+
* it. `whisper` is included because it travels the same inline-tag channel even
|
|
46
|
+
* though it is a delivery style rather than an affect.
|
|
47
|
+
*/
|
|
48
|
+
export declare const EXPRESSIVE_EMOTION_ENUM: readonly ["none", "happy", "sad", "angry", "nervous", "calm", "excited", "whisper"];
|
|
49
|
+
export type ExpressiveEmotionEnum = (typeof EXPRESSIVE_EMOTION_ENUM)[number];
|
|
50
|
+
/** The singing tag — a style flag, not an affect; orthogonal to emotion. */
|
|
51
|
+
export declare const EXPRESSIVE_SINGING_TAG: "singing";
|
|
52
|
+
/**
|
|
53
|
+
* Preserved non-verbals — these are *rendered* as sound effects by the TTS,
|
|
54
|
+
* not consumed as scope-setting tags. They pass straight through the bridge.
|
|
55
|
+
*/
|
|
56
|
+
export declare const EXPRESSIVE_NONVERBAL_TAGS: readonly ["laughter", "sigh"];
|
|
57
|
+
export type ExpressiveNonverbal = (typeof EXPRESSIVE_NONVERBAL_TAGS)[number];
|
|
58
|
+
/**
|
|
59
|
+
* The full inline-tag vocabulary (emotion + singing + preserved non-verbals),
|
|
60
|
+
* verbatim — the union the `omnivoice-singing` GGUF understands. Use this for
|
|
61
|
+
* the prompt clause and the `tagLeakage` check.
|
|
62
|
+
*/
|
|
63
|
+
export declare const EXPRESSIVE_TAGS: readonly ["happy", "sad", "angry", "nervous", "calm", "excited", "whisper", "singing", "laughter", "sigh"];
|
|
64
|
+
export type ExpressiveTag = (typeof EXPRESSIVE_TAGS)[number];
|
|
65
|
+
/** `true` iff `tag` (without brackets, case-insensitive) is a legal expressive tag. */
|
|
66
|
+
export declare function isExpressiveTag(tag: string): tag is ExpressiveTag;
|
|
67
|
+
/** `true` iff `value` is a legal `emotion` enum value (incl. `"none"`). */
|
|
68
|
+
export declare function isExpressiveEmotionEnum(value: string): value is ExpressiveEmotionEnum;
|
|
69
|
+
/** One scoped segment of an expressive `replyText`. */
|
|
70
|
+
export interface ExpressiveSegment {
|
|
71
|
+
/**
|
|
72
|
+
* The segment text. For the singing-GGUF path this *keeps* the leading
|
|
73
|
+
* scope-setting emotion/singing tag and any inline non-verbals (the GGUF
|
|
74
|
+
* parses them); for a base-TTS path use `cleanText` / `stripExpressiveTags`.
|
|
75
|
+
*/
|
|
76
|
+
text: string;
|
|
77
|
+
/** The segment text with every recognized expressive tag removed. */
|
|
78
|
+
cleanText: string;
|
|
79
|
+
/** The emotion in scope for this segment (`null` = no emotion cue). */
|
|
80
|
+
emotion: ExpressiveEmotion | null;
|
|
81
|
+
/** Whether `[singing]` is in scope for this segment. */
|
|
82
|
+
singing: boolean;
|
|
83
|
+
/** Preserved non-verbals that appeared inside this segment, in order. */
|
|
84
|
+
nonverbals: ExpressiveNonverbal[];
|
|
85
|
+
}
|
|
86
|
+
export interface ParsedExpressiveText {
|
|
87
|
+
/** The whole `replyText` with every recognized expressive tag removed. */
|
|
88
|
+
cleanText: string;
|
|
89
|
+
/** The text split at every emotion/singing scope boundary. */
|
|
90
|
+
segments: ExpressiveSegment[];
|
|
91
|
+
/** The dominant (first scope-setting) emotion across the text, or `null`. */
|
|
92
|
+
dominantEmotion: ExpressiveEmotion | null;
|
|
93
|
+
/** `true` iff any `[singing]` tag appeared. */
|
|
94
|
+
anySinging: boolean;
|
|
95
|
+
/** `true` iff any recognized expressive tag appeared. */
|
|
96
|
+
hasTags: boolean;
|
|
97
|
+
/** Literal `[token]` occurrences that look like a tag but aren't in the
|
|
98
|
+
* vocabulary (e.g. a hallucinated `[grumpy]`) — recorded, not silently
|
|
99
|
+
* dropped, so the `tagLeakage` check can flag them. */
|
|
100
|
+
unknownTags: string[];
|
|
101
|
+
}
|
|
102
|
+
/**
|
|
103
|
+
* Parse a `replyText` into expressive segments. A scope-setting tag is an
|
|
104
|
+
* emotion tag or `[singing]`; a `[laughter]` / `[sigh]` is a non-verbal that
|
|
105
|
+
* is recorded on the current segment but does not start a new one. The
|
|
106
|
+
* segment text retains the scope tag (for the singing-GGUF pass-through) and
|
|
107
|
+
* `cleanText` is the same text with all expressive tags removed.
|
|
108
|
+
*
|
|
109
|
+
* Empty / whitespace-only segments between two adjacent scope tags are
|
|
110
|
+
* dropped (a leading `[happy][whisper]hi` → one segment, scope = whisper).
|
|
111
|
+
*/
|
|
112
|
+
export declare function parseExpressiveTags(replyText: string): ParsedExpressiveText;
|
|
113
|
+
/** Strip every recognized expressive tag (emotion / singing / non-verbal)
|
|
114
|
+
* from `text`. Used on a base-TTS path so a literal `[happy]` never reaches
|
|
115
|
+
* the audio. Unknown bracket tokens (`[grumpy]`) are left as-is — they are
|
|
116
|
+
* the model's text, not a tag we recognise. */
|
|
117
|
+
export declare function stripExpressiveTags(text: string): string;
|
|
118
|
+
/**
|
|
119
|
+
* Map the dominant emotion (or `null`) to the Stage-1 envelope `emotion` enum
|
|
120
|
+
* value (`null` → `"none"`). Inverse: `EXPRESSIVE_EMOTION_ENUM` minus `"none"`.
|
|
121
|
+
*/
|
|
122
|
+
export declare function emotionToEnum(emotion: ExpressiveEmotion | null): ExpressiveEmotionEnum;
|
|
123
|
+
/** Map the Stage-1 envelope `emotion` enum value back to an emotion (or `null`). */
|
|
124
|
+
export declare function enumToEmotion(value: ExpressiveEmotionEnum | string | null | undefined): ExpressiveEmotion | null;
|
|
125
|
+
/**
|
|
126
|
+
* Candidate labels a connector or explicitly emotion-aware ASR adapter may
|
|
127
|
+
* surface as structured metadata. The local fused ASR path must not be treated
|
|
128
|
+
* as model-native emotion recognition unless that backend explicitly advertises
|
|
129
|
+
* such a field; callers should record heuristic attribution separately.
|
|
130
|
+
*/
|
|
131
|
+
export declare const QWEN3_ASR_EMOTION_LABELS: readonly ["surprise", "calm", "happiness", "sadness", "disgust", "anger", "fear"];
|
|
132
|
+
export type Qwen3AsrEmotionLabel = (typeof QWEN3_ASR_EMOTION_LABELS)[number];
|
|
133
|
+
/**
|
|
134
|
+
* Explicit mapping from an ASR-perceived emotion label to the tag vocab the
|
|
135
|
+
* generator emits. `whisper` / `singing` are delivery styles, not affects, and
|
|
136
|
+
* are excluded from the fidelity score (scored separately as "style preserved"
|
|
137
|
+
* via the `instruct` round-trip). Labels with no clean tag analogue map to
|
|
138
|
+
* `null` (counted as "no agreement" rather than forced).
|
|
139
|
+
*/
|
|
140
|
+
export declare const ASR_LABEL_TO_EMOTION_TAG: Readonly<Record<Qwen3AsrEmotionLabel, ExpressiveEmotion | null>>;
|
|
141
|
+
/** Normalize an arbitrary ASR-emitted emotion string (any casing, possibly an
|
|
142
|
+
* adjective form) to a `Qwen3AsrEmotionLabel` if it matches, else `null`. */
|
|
143
|
+
export declare function normalizeAsrEmotionLabel(raw: string | null | undefined): Qwen3AsrEmotionLabel | null;
|
|
144
|
+
/** Map an ASR-perceived emotion (raw string) straight to the tag vocab, via
|
|
145
|
+
* `normalizeAsrEmotionLabel` + `ASR_LABEL_TO_EMOTION_TAG`. `null` when it
|
|
146
|
+
* doesn't map. */
|
|
147
|
+
export declare function asrEmotionToTag(raw: string | null | undefined): ExpressiveEmotion | null;
|
|
148
|
+
/**
|
|
149
|
+
* The clause appended to the voice-mode response instruction telling the model
|
|
150
|
+
* it MAY annotate `replyText` with inline expressive tags. Only emit this when
|
|
151
|
+
* `manifest.voice.capabilities` includes `emotion-tags` (don't instruct the
|
|
152
|
+
* model to emit tags a base-TTS bundle will speak literally). `singingAllowed`
|
|
153
|
+
* controls whether `[singing]` is offered (gate on the `singing` capability).
|
|
154
|
+
*/
|
|
155
|
+
export declare function expressiveTagPromptClause(opts?: {
|
|
156
|
+
singingAllowed?: boolean;
|
|
157
|
+
}): string;
|
|
158
|
+
//# sourceMappingURL=expressive-tags.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"expressive-tags.d.ts","sourceRoot":"","sources":["expressive-tags.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AAMH,wEAAwE;AACxE,eAAO,MAAM,uBAAuB,6EAQ1B,CAAC;AAEX,MAAM,MAAM,iBAAiB,GAAG,CAAC,OAAO,uBAAuB,CAAC,CAAC,MAAM,CAAC,CAAC;AAEzE;;;;;;;GAOG;AACH,eAAO,MAAM,uBAAuB,qFAG1B,CAAC;AAEX,MAAM,MAAM,qBAAqB,GAAG,CAAC,OAAO,uBAAuB,CAAC,CAAC,MAAM,CAAC,CAAC;AAE7E,4EAA4E;AAC5E,eAAO,MAAM,sBAAsB,EAAG,SAAkB,CAAC;AAEzD;;;GAGG;AACH,eAAO,MAAM,yBAAyB,+BAAgC,CAAC;AAEvE,MAAM,MAAM,mBAAmB,GAAG,CAAC,OAAO,yBAAyB,CAAC,CAAC,MAAM,CAAC,CAAC;AAE7E;;;;GAIG;AACH,eAAO,MAAM,eAAe,4GAIlB,CAAC;AAEX,MAAM,MAAM,aAAa,GAAG,CAAC,OAAO,eAAe,CAAC,CAAC,MAAM,CAAC,CAAC;AAM7D,uFAAuF;AACvF,wBAAgB,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,IAAI,aAAa,CAEjE;AAED,2EAA2E;AAC3E,wBAAgB,uBAAuB,CACtC,KAAK,EAAE,MAAM,GACX,KAAK,IAAI,qBAAqB,CAEhC;AAkBD,uDAAuD;AACvD,MAAM,WAAW,iBAAiB;IACjC;;;;OAIG;IACH,IAAI,EAAE,MAAM,CAAC;IACb,qEAAqE;IACrE,SAAS,EAAE,MAAM,CAAC;IAClB,uEAAuE;IACvE,OAAO,EAAE,iBAAiB,GAAG,IAAI,CAAC;IAClC,wDAAwD;IACxD,OAAO,EAAE,OAAO,CAAC;IACjB,yEAAyE;IACzE,UAAU,EAAE,mBAAmB,EAAE,CAAC;CAClC;AAED,MAAM,WAAW,oBAAoB;IACpC,0EAA0E;IAC1E,SAAS,EAAE,MAAM,CAAC;IAClB,8DAA8D;IAC9D,QAAQ,EAAE,iBAAiB,EAAE,CAAC;IAC9B,6EAA6E;IAC7E,eAAe,EAAE,iBAAiB,GAAG,IAAI,CAAC;IAC1C,+CAA+C;IAC/C,UAAU,EAAE,OAAO,CAAC;IACpB,yDAAyD;IACzD,OAAO,EAAE,OAAO,CAAC;IACjB;;4DAEwD;IACxD,WAAW,EAAE,MAAM,EAAE,CAAC;CACtB;AAED;;;;;;;;;GASG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,GAAG,oBAAoB,CAiG3E;AAED;;;gDAGgD;AAChD,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAOxD;AAED;;;GAGG;AACH,wBAAgB,aAAa,CAC5B,OAAO,EAAE,iBAAiB,GAAG,IAAI,GAC/B,qBAAqB,CAEvB;AAED,oFAAoF;AACpF,wBAAgB,aAAa,CAC5B,KAAK,EAAE,qBAAqB,GAAG,MAAM,GAAG,IAAI,GAAG,SAAS,GACtD,iBAAiB,GAAG,IAAI,CAG1B;AAMD;;;;;GAKG;AACH,eAAO,MAAM,wBAAwB,mFAQ3B,CAAC;AAEX,MAAM,MAAM,oBAAoB,GAAG,CAAC,OAAO,wBAAwB,CAAC,CAAC,MAAM,CAAC,CAAC;AAE7E;;;;;;GAMG;AACH,eAAO,MAAM,wBAAwB,EAAE,QAAQ,CAC9C,MAAM,CAAC,oBAAoB,EAAE,iBAAiB,GAAG,IAAI,CAAC,CAStD,CAAC;AAEF;8EAC8E;AAC9E,wBAAgB,wBAAwB,CACvC,GAAG,EAAE,MAAM,GAAG,I
AAI,GAAG,SAAS,GAC5B,oBAAoB,GAAG,IAAI,CAmB7B;AAED;;mBAEmB;AACnB,wBAAgB,eAAe,CAC9B,GAAG,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,GAC5B,iBAAiB,GAAG,IAAI,CAG1B;AAMD;;;;;;GAMG;AACH,wBAAgB,yBAAyB,CACxC,IAAI,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAA;CAAO,GACrC,MAAM,CAeR"}
|
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Expressive (emotion / singing) inline-tag handling for the voice path.
|
|
3
|
+
*
|
|
4
|
+
* The canonical schema is the **omnivoice-singing inline-tag vocabulary,
|
|
5
|
+
* verbatim** (no SSML, no new format — it is exactly what the fine-tuned
|
|
6
|
+
* `omnivoice-singing` GGUF understands when the tags appear inline in the
|
|
7
|
+
* text passed to `eliza_inference_tts_synthesize` / `/v1/audio/speech`):
|
|
8
|
+
*
|
|
9
|
+
* emotion tags [happy] [sad] [angry] [nervous] [calm] [excited] [whisper]
|
|
10
|
+
* singing tag [singing]
|
|
11
|
+
* preserved [laughter] [sigh] (non-verbals — passed through, not
|
|
12
|
+
* consumed as a scope-setting tag)
|
|
13
|
+
*
|
|
14
|
+
* Tags are *inline* and *scoped*: a tag applies from where it appears to the
|
|
15
|
+
* next tag (or end of text). Mid-sentence shifts are allowed
|
|
16
|
+
* (`"that's [excited] amazing"`). `parseExpressiveTags` segments the text by
|
|
17
|
+
* tag boundaries so the chunker / TTS backend can carry the in-scope emotion
|
|
18
|
+
* with each phrase.
|
|
19
|
+
*
|
|
20
|
+
* Two consumers:
|
|
21
|
+
* - the *singing* TTS GGUF: it parses the tags itself, so the bridge passes
|
|
22
|
+
* the tag-bearing text through (`segment.text` keeps `[happy]` etc. — see
|
|
23
|
+
* `makeTextToSpeechHandler`, which deliberately does NOT strip them).
|
|
24
|
+
* - a *base* TTS GGUF (no `emotion-tags` / `singing` capability): the tags
|
|
25
|
+
* would otherwise be spoken literally, so `stripExpressiveTags` removes
|
|
26
|
+
* them before synthesis. (The model shouldn't have emitted them when the
|
|
27
|
+
* bundle lacks the capability — but defense in depth.)
|
|
28
|
+
*
|
|
29
|
+
* Coordinates with:
|
|
30
|
+
* - `manifest/schema.ts` → `ELIZA_1_VOICE_CAPABILITIES` (`emotion-tags` /
|
|
31
|
+
* `singing` gate whether the prompt instructs the model to emit tags).
|
|
32
|
+
* - WS-4's Stage-1 envelope `emotion` enum field (one-line field-evaluator
|
|
33
|
+
* registration there) — `EXPRESSIVE_EMOTION_ENUM` is the shared value set.
|
|
34
|
+
* - WS-3's `voice_emotion` fine-tune corpus task — both forms (inline tags +
|
|
35
|
+
* the `emotion` field) are populated on voice-mode rows.
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
// The tag vocabulary
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
/** Affect tags: each one sets the emotion scope for the text that comes after it. */
export const EXPRESSIVE_EMOTION_TAGS = [
	"happy", "sad", "angry", "nervous", "calm", "excited", "whisper",
] as const;

/** Union of the emotion-tag literals. */
export type ExpressiveEmotion = (typeof EXPRESSIVE_EMOTION_TAGS)[number];

/**
 * Value set of the Stage-1 envelope `emotion` enum (decision #2 in
 * `.swarm/IMPLEMENTATION_PLAN.md` §1 — the optional field-evaluator that WS-4
 * wires registers exactly this enum).
 *
 * - `none` is the default / "no expressive cue" value, so the field is always
 *   present and structured-decode can singleton-fill it.
 * - `whisper` is listed even though it is a delivery style rather than an
 *   affect, because it travels the same inline-tag channel.
 */
export const EXPRESSIVE_EMOTION_ENUM = ["none", ...EXPRESSIVE_EMOTION_TAGS] as const;

/** Union of the `emotion` enum literals (the emotion tags plus `"none"`). */
export type ExpressiveEmotionEnum = (typeof EXPRESSIVE_EMOTION_ENUM)[number];

/** The singing tag. It is a style flag rather than an affect, orthogonal to emotion. */
export const EXPRESSIVE_SINGING_TAG = "singing" as const;

/**
 * Preserved non-verbals. The TTS *renders* these as sound effects; they are
 * not consumed as scope-setting tags and pass straight through the bridge.
 */
export const EXPRESSIVE_NONVERBAL_TAGS = ["laughter", "sigh"] as const;

/** Union of the non-verbal tag literals. */
export type ExpressiveNonverbal = (typeof EXPRESSIVE_NONVERBAL_TAGS)[number];

/**
 * Complete inline-tag vocabulary, verbatim: emotion tags, then the singing
 * tag, then the preserved non-verbals — the union the `omnivoice-singing`
 * GGUF understands. Use this for the prompt clause and the `tagLeakage` check.
 */
export const EXPRESSIVE_TAGS = [
	...EXPRESSIVE_EMOTION_TAGS,
	EXPRESSIVE_SINGING_TAG,
	...EXPRESSIVE_NONVERBAL_TAGS,
] as const;

/** Union of every literal in the inline-tag vocabulary. */
export type ExpressiveTag = (typeof EXPRESSIVE_TAGS)[number];

// String-keyed lookup sets, so arbitrary (already lower-cased) input can be
// tested for membership without casting it to a literal type first.
const EMOTION_SET: ReadonlySet<string> = new Set<string>(EXPRESSIVE_EMOTION_TAGS);
const NONVERBAL_SET: ReadonlySet<string> = new Set<string>(EXPRESSIVE_NONVERBAL_TAGS);
const ALL_TAG_SET: ReadonlySet<string> = new Set<string>(EXPRESSIVE_TAGS);
|
|
97
|
+
|
|
98
|
+
/**
 * Report whether `tag` names a legal expressive tag.
 *
 * The check is lenient: surrounding whitespace is trimmed and the value is
 * lower-cased before it is looked up in the full tag vocabulary, so
 * `" Happy "` is accepted. Pass the bare name, without the `[` `]` brackets.
 *
 * Caveat: a `true` result narrows the *original* string, which may differ in
 * case or padding from the canonical literal. Normalise it with
 * `tag.trim().toLowerCase()` before using it as an `ExpressiveTag` key.
 *
 * @param tag - Candidate tag name. A non-string value handed in by an untyped
 *   caller is rejected instead of throwing.
 * @returns `true` when the normalised value is in the vocabulary.
 */
export function isExpressiveTag(tag: string): tag is ExpressiveTag {
	// Runtime guard in the same spirit as the `typeof replyText === "string"`
	// check in `parseExpressiveTags`: `.trim()` on `null` / `undefined` / a
	// number would otherwise throw a TypeError.
	if (typeof tag !== "string") return false;
	return ALL_TAG_SET.has(tag.trim().toLowerCase());
}
|
|
102
|
+
|
|
103
|
+
/**
 * Type guard for the `emotion` enum: accepts `"none"` or any emotion tag.
 *
 * The comparison is exact — the value is neither trimmed nor case-folded.
 *
 * @param value - Candidate enum value.
 * @returns `true` when `value` is one of the enum literals.
 */
export function isExpressiveEmotionEnum(
	value: string,
): value is ExpressiveEmotionEnum {
	if (value === "none") {
		return true;
	}
	return EMOTION_SET.has(value);
}
|
|
109
|
+
|
|
110
|
+
// Pattern source for one bracketed token: `[`, optional whitespace, a name
// (a letter followed by letters or hyphens — capture group 1, used for the
// vocabulary lookup), optional whitespace, `]`.
const TAG_RE_SOURCE = "\\[\\s*([a-zA-Z][a-zA-Z-]*)\\s*\\]";

/**
 * Build a brand-new global matcher for `[tag]` tokens.
 *
 * Why a factory rather than one shared constant: a global regex is stateful
 * (`lastIndex`). `parseExpressiveTags` runs `String.prototype.replace` with
 * this pattern from inside its own `exec` loop over the same pattern, and
 * `replace` resets a shared regex's `lastIndex` to 0 — with a single shared
 * object that loop would never terminate. Every caller therefore gets its own
 * independent instance.
 *
 * @returns A fresh `RegExp` compiled from `TAG_RE_SOURCE` with the `g` flag.
 */
function tagRegex(): RegExp {
	const matcher = new RegExp(TAG_RE_SOURCE, "g");
	return matcher;
}
|
|
121
|
+
|
|
122
|
+
// ---------------------------------------------------------------------------
|
|
123
|
+
// parseExpressiveTags
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
/** A single run of an expressive `replyText` together with the scope that applies to it. */
export interface ExpressiveSegment {
	/**
	 * Raw segment text. On the singing-GGUF path the leading scope-setting
	 * emotion/singing tag and any inline non-verbals are *kept* here, because
	 * the GGUF parses them itself. A base-TTS path should read `cleanText` (or
	 * call `stripExpressiveTags`) instead.
	 */
	text: string;
	/**
	 * The segment text with expressive tags removed.
	 *
	 * NOTE(review): the producer visible in `parseExpressiveTags` builds this by
	 * deleting every `[token]` that matches the tag pattern and trimming the
	 * result, which would also drop unrecognised tokens such as `[grumpy]` —
	 * confirm whether only recognised tags are meant to be removed.
	 */
	cleanText: string;
	/** Emotion in scope for this segment; `null` means no emotion cue. */
	emotion: null | ExpressiveEmotion;
	/** `true` while `[singing]` is in scope for this segment. */
	singing: boolean;
	/** Preserved non-verbals found inside this segment, in order of appearance. */
	nonverbals: Array<ExpressiveNonverbal>;
}
|
|
143
|
+
|
|
144
|
+
/** Result of `parseExpressiveTags`: the segmented text plus summary flags. */
export interface ParsedExpressiveText {
	/** The entire `replyText` with every recognized expressive tag removed. */
	cleanText: string;
	/** The text cut at each emotion/singing scope boundary. */
	segments: Array<ExpressiveSegment>;
	/** The dominant emotion — the first scope-setting emotion tag seen — or `null`. */
	dominantEmotion: null | ExpressiveEmotion;
	/** `true` iff at least one `[singing]` tag appeared. */
	anySinging: boolean;
	/** `true` iff at least one recognized expressive tag appeared. */
	hasTags: boolean;
	/**
	 * Literal `[token]` occurrences (brackets included, as written) that look
	 * like a tag but are not in the vocabulary, e.g. a hallucinated `[grumpy]`.
	 * They are recorded rather than silently dropped so the `tagLeakage` check
	 * can flag them.
	 */
	unknownTags: Array<string>;
}
|
|
160
|
+
|
|
161
|
+
/**
|
|
162
|
+
* Parse a `replyText` into expressive segments. A scope-setting tag is an
|
|
163
|
+
* emotion tag or `[singing]`; a `[laughter]` / `[sigh]` is a non-verbal that
|
|
164
|
+
* is recorded on the current segment but does not start a new one. The
|
|
165
|
+
* segment text retains the scope tag (for the singing-GGUF pass-through) and
|
|
166
|
+
* `cleanText` is the same text with all expressive tags removed.
|
|
167
|
+
*
|
|
168
|
+
* Empty / whitespace-only segments between two adjacent scope tags are
|
|
169
|
+
* dropped (a leading `[happy][whisper]hi` → one segment, scope = whisper).
|
|
170
|
+
*/
|
|
171
|
+
export function parseExpressiveTags(replyText: string): ParsedExpressiveText {
	// Defensive: a non-string handed over at runtime is treated as empty text.
	const text = typeof replyText === "string" ? replyText : "";
	const segments: ExpressiveSegment[] = [];
	const unknownTags: string[] = [];
	let dominantEmotion: ExpressiveEmotion | null = null;
	let anySinging = false;
	let hasTags = false;

	// Walk the matches, accumulating the text between scope boundaries.
	// `cursor` is the index in `text` just past the last consumed tag.
	let cursor = 0;
	// Scope state carried from one segment to the next. An emotion tag only
	// overwrites `curEmotion`; a singing tag only sets `curSinging` (nothing
	// in this function resets it back to false).
	let curEmotion: ExpressiveEmotion | null = null;
	let curSinging = false;
	let curRawParts: string[] = [];
	let curNonverbals: ExpressiveNonverbal[] = [];

	// Emit the segment accumulated so far, unless it carries nothing.
	const flush = (): void => {
		const raw = curRawParts.join("");
		// Fresh regex — must NOT touch `re`'s `lastIndex` (we're inside `re`'s loop).
		const clean = raw.replace(tagRegex(), "").trim();
		// A segment with no visible text and no non-verbals carries nothing.
		if (clean.length === 0 && curNonverbals.length === 0) return;
		segments.push({
			text: raw,
			cleanText: clean,
			emotion: curEmotion,
			singing: curSinging,
			// Copy, so later pushes to the accumulator can't alter this segment.
			nonverbals: [...curNonverbals],
		});
	};

	const re = tagRegex();
	let m: RegExpExecArray | null;
	// biome-ignore lint/suspicious/noAssignInExpressions: standard regex-exec loop.
	while ((m = re.exec(text)) !== null) {
		// Zero-width matches can't happen (the pattern needs `[…]`), but guard
		// anyway so a pattern change can't wedge the loop.
		if (m[0].length === 0) {
			re.lastIndex += 1;
			continue;
		}
		const before = text.slice(cursor, m.index);
		cursor = m.index + m[0].length;
		const inner = (m[1] ?? "").toLowerCase();
		if (EMOTION_SET.has(inner) || inner === EXPRESSIVE_SINGING_TAG) {
			// A scope-setting tag: the lead-in text closes the *current* segment,
			// which is flushed; the new segment's raw text then starts with this
			// tag. (Keeping the tag in the raw text means the singing GGUF still
			// sees it at the head of the next phrase.)
			curRawParts.push(before);
			flush();
			curRawParts = [m[0]];
			curNonverbals = [];
			hasTags = true;
			if (inner === EXPRESSIVE_SINGING_TAG) {
				curSinging = true;
				anySinging = true;
			} else {
				curEmotion = inner as ExpressiveEmotion;
				// The first emotion tag encountered wins as the dominant emotion.
				if (dominantEmotion === null) dominantEmotion = curEmotion;
			}
		} else if (NONVERBAL_SET.has(inner)) {
			// A non-verbal: keep it in the raw text, record it, don't start a scope.
			curRawParts.push(before, m[0]);
			curNonverbals.push(inner as ExpressiveNonverbal);
			hasTags = true;
		} else {
			// Unknown bracket token — keep it verbatim (it's the model's text) and
			// record it for the leakage check. It does not set `hasTags`.
			curRawParts.push(before, m[0]);
			unknownTags.push(m[0]);
		}
	}
	// Trailing text after the last tag (or the whole text when nothing matched).
	curRawParts.push(text.slice(cursor));
	flush();

	// If there were no tags at all, present the whole text as one neutral segment.
	// The `!hasTags` guard matters: input made up solely of recognized scope tags
	// (e.g. "[happy]") also leaves `segments` empty, and must not be turned into
	// a neutral segment whose `cleanText` is the literal tag.
	if (segments.length === 0 && !hasTags) {
		const clean = text.trim();
		if (clean.length > 0) {
			segments.push({
				text,
				cleanText: clean,
				emotion: null,
				singing: false,
				nonverbals: [],
			});
		}
	}

	return {
		// Whole-reply clean text: every bracket token matched by the tag regex is
		// removed and whitespace runs are collapsed to single spaces.
		cleanText: text.replace(tagRegex(), "").replace(/\s+/g, " ").trim(),
		segments,
		dominantEmotion,
		anySinging,
		hasTags,
		unknownTags,
	};
}
|
|
269
|
+
|
|
270
|
+
/**
 * Remove every recognised expressive tag (emotion / singing / non-verbal)
 * from `text`, for the base-TTS path where a literal `[happy]` must never be
 * voiced. Bracket tokens that are not in the known vocabulary (`[grumpy]`)
 * are the model's own text and pass through untouched. Runs of two or more
 * spaces/tabs are collapsed to a single space and the result is trimmed.
 */
export function stripExpressiveTags(text: string): string {
	const withoutKnownTags = text.replace(tagRegex(), (match, name) => {
		const isKnown = ALL_TAG_SET.has(String(name).toLowerCase());
		return isKnown ? "" : match;
	});
	const collapsed = withoutKnownTags.replace(/[ \t]{2,}/g, " ");
	return collapsed.trim();
}
|
|
282
|
+
|
|
283
|
+
/**
 * Convert the dominant emotion into the Stage-1 envelope `emotion` enum value.
 * A missing emotion (`null`) becomes `"none"`; any other value is returned
 * unchanged. Inverse: `EXPRESSIVE_EMOTION_ENUM` minus `"none"`.
 */
export function emotionToEnum(
	emotion: ExpressiveEmotion | null,
): ExpressiveEmotionEnum {
	// `== null` deliberately covers both null and undefined, like `??` does.
	if (emotion == null) {
		return "none";
	}
	return emotion;
}
|
|
292
|
+
|
|
293
|
+
/**
 * Convert a Stage-1 envelope `emotion` enum value back into an emotion.
 * Returns `null` for a missing/empty value, for `"none"`, and for any string
 * that is not a member of `EMOTION_SET`.
 */
export function enumToEmotion(
	value: ExpressiveEmotionEnum | string | null | undefined,
): ExpressiveEmotion | null {
	const isAbsent = value === null || value === undefined || value === "";
	if (isAbsent || value === "none") {
		return null;
	}
	if (!EMOTION_SET.has(value)) {
		return null;
	}
	return value as ExpressiveEmotion;
}
|
|
300
|
+
|
|
301
|
+
// ---------------------------------------------------------------------------
|
|
302
|
+
// Optional ASR emotion metadata mapping
|
|
303
|
+
// ---------------------------------------------------------------------------
|
|
304
|
+
|
|
305
|
+
/**
 * Emotion labels that a connector, or an ASR adapter that is explicitly
 * emotion-aware, may surface as structured metadata. Do not treat the local
 * fused ASR path as model-native emotion recognition unless that backend
 * explicitly advertises such a field; heuristic attribution should be
 * recorded separately by callers.
 */
export const QWEN3_ASR_EMOTION_LABELS = [
	"surprise", "calm", "happiness", "sadness", "disgust", "anger", "fear",
] as const;

/** Union of the string literals listed in `QWEN3_ASR_EMOTION_LABELS`. */
export type Qwen3AsrEmotionLabel = (typeof QWEN3_ASR_EMOTION_LABELS)[number];
|
|
322
|
+
|
|
323
|
+
/**
 * Explicit table from an ASR-perceived emotion label to the emotion tag the
 * generator emits. `whisper` / `singing` are delivery styles rather than
 * affects, so they are left out of the fidelity score (they are scored
 * separately as "style preserved" via the `instruct` round-trip). A label
 * with no clean tag analogue maps to `null`, which counts as "no agreement"
 * instead of being forced onto a tag.
 */
export const ASR_LABEL_TO_EMOTION_TAG: Readonly<Record<Qwen3AsrEmotionLabel, ExpressiveEmotion | null>> = {
	// Labels with a direct tag counterpart.
	happiness: "happy",
	sadness: "sad",
	anger: "angry",
	fear: "nervous",
	calm: "calm",
	surprise: "excited",
	// No tag analogue.
	disgust: null,
};
|
|
341
|
+
|
|
342
|
+
/**
 * Adjective spellings accepted by `normalizeAsrEmotionLabel`, keyed by the
 * lower-cased adjective. A `Map` is used instead of an object literal so that
 * inherited property names such as `constructor` or `__proto__` can never be
 * mistaken for a match, and so the table is built once rather than per call.
 */
const ASR_ADJECTIVE_TO_LABEL: ReadonlyMap<string, Qwen3AsrEmotionLabel> =
	new Map<string, Qwen3AsrEmotionLabel>([
		["happy", "happiness"],
		["sad", "sadness"],
		["angry", "anger"],
		["fearful", "fear"],
		["afraid", "fear"],
		["scared", "fear"],
		["surprised", "surprise"],
		["disgusted", "disgust"],
	]);

/**
 * Normalize an arbitrary ASR-emitted emotion string (any casing, surrounding
 * whitespace, possibly an adjective form) to a `Qwen3AsrEmotionLabel`.
 *
 * @param raw - The label as emitted upstream; may be `null`/`undefined`.
 * @returns The matching label, or `null` when `raw` is missing, empty, not a
 *   string, or matches neither a label nor a known adjective form.
 */
export function normalizeAsrEmotionLabel(
	raw: string | null | undefined,
): Qwen3AsrEmotionLabel | null {
	// The typeof guard also rejects non-string metadata that slipped past the
	// type system, instead of throwing on `.trim()`.
	if (typeof raw !== "string" || raw.length === 0) return null;
	const v = raw.trim().toLowerCase();
	// Direct hit.
	if ((QWEN3_ASR_EMOTION_LABELS as readonly string[]).includes(v)) {
		return v as Qwen3AsrEmotionLabel;
	}
	// Common adjective forms → noun labels.
	return ASR_ADJECTIVE_TO_LABEL.get(v) ?? null;
}
|
|
366
|
+
|
|
367
|
+
/**
 * Translate a raw ASR-perceived emotion string directly into the tag vocab.
 * The string is first normalized with `normalizeAsrEmotionLabel`, then looked
 * up in `ASR_LABEL_TO_EMOTION_TAG`. Yields `null` when it doesn't map.
 */
export function asrEmotionToTag(
	raw: string | null | undefined,
): ExpressiveEmotion | null {
	const normalized = normalizeAsrEmotionLabel(raw);
	if (!normalized) {
		return null;
	}
	return ASR_LABEL_TO_EMOTION_TAG[normalized];
}
|
|
376
|
+
|
|
377
|
+
// ---------------------------------------------------------------------------
|
|
378
|
+
// The voice-output prompt clause
|
|
379
|
+
// ---------------------------------------------------------------------------
|
|
380
|
+
|
|
381
|
+
/**
 * Build the clause appended to the voice-mode response instruction that tells
 * the model it MAY annotate `replyText` with inline expressive tags. Emit it
 * only when `manifest.voice.capabilities` includes `emotion-tags`; otherwise
 * the model would be told to produce tags that a base-TTS bundle speaks
 * literally. `singingAllowed` decides whether `[singing]` is part of the
 * offered vocabulary (gate it on the `singing` capability).
 */
export function expressiveTagPromptClause(
	opts: { singingAllowed?: boolean } = {},
): string {
	const bracket = (tag: string): string => `[${tag}]`;

	// Vocabulary order: emotions, then (optionally) singing, then non-verbals.
	const tokens: string[] = EXPRESSIVE_EMOTION_TAGS.map((t) => bracket(t));
	if (opts.singingAllowed === true) {
		tokens.push(bracket(EXPRESSIVE_SINGING_TAG));
	}
	for (const t of EXPRESSIVE_NONVERBAL_TAGS) {
		tokens.push(bracket(t));
	}
	const vocab = tokens.join(" ");

	const pieces = [
		"When the turn is spoken aloud, you MAY annotate replyText with inline ",
		`expressive tags from this exact set: ${vocab}. `,
		"A tag applies from where it appears to the next tag (or end of text); ",
		"mid-sentence shifts are allowed. Use them sparingly and only when the ",
		"affect is genuine. Do not use tags in text-only turns, and do not invent ",
		"tags outside this set.",
	];
	return pieces.join("");
}
|