@omote/core 0.1.3 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +463 -207
- package/dist/index.d.ts +463 -207
- package/dist/index.js +542 -186
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +534 -178
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
@@ -5982,7 +5982,7 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
     video.remove();
     return new RawVideo(frames, duration);
   }
-
var ONNX_WEB, import_meta, __defProp2, __export2, emptyObj, node_fs_default, emptyObj2, node_path_default, emptyObj3, node_url_default, VERSION, IS_PROCESS_AVAILABLE, IS_NODE_ENV, IS_FS_AVAILABLE, IS_PATH_AVAILABLE, IS_DENO_RUNTIME, IS_BUN_RUNTIME, IS_BROWSER_ENV, IS_WEBWORKER_ENV, IS_WEB_CACHE_AVAILABLE, IS_WEBGPU_AVAILABLE, IS_WEBNN_AVAILABLE, isSafari, IS_SAFARI, apis, RUNNING_LOCALLY, dirname__, DEFAULT_CACHE_DIR, DEFAULT_LOCAL_MODEL_PATH, localModelPath, env2, Callable, CONTENT_TYPE_MAP, FileResponse, FileCache, ERROR_MAPPING, MAX_EXTERNAL_DATA_CHUNKS, REPO_ID_REGEX, P2FFT, NP2FFT, FFT, uint16_to_float32, onnxruntime_node_exports, noop, emptyObj4, onnxruntime_node_default, Readable, pipeline, createWriteStream, createReadStream, DEVICE_TO_EXECUTION_PROVIDER_MAPPING, LOG_LEVELS, DEFAULT_LOG_LEVEL, supportedDevices, defaultDevices, ONNX, ORT_SYMBOL, InferenceSession2, IS_WEB_ENV, webInitChain, wasmLoadPromise, webInferenceChain, ONNX_ENV, wrap, _a, TensorOpRegistry, DataTypeMap, Tensor22, arrayToIndexTensor, PriorityQueue, CharTrie, CharTrieNode, TokenLattice, TokenLatticeNode, DictionarySplitter, LRUCache, TOKEN_TYPES, Token, ORDERED_MAPPING_TABLE, ESCAPE_CHARACTERS, Statement, Program, If, For, Break, Continue, SetStatement, Macro, Comment, Expression, MemberExpression, CallExpression, Identifier, Literal, IntegerLiteral, FloatLiteral, StringLiteral, ArrayLiteral, TupleLiteral, ObjectLiteral, BinaryExpression, FilterExpression, FilterStatement, SelectExpression, TestExpression, UnaryExpression, SliceExpression, KeywordArgumentExpression, SpreadExpression, CallStatement, Ternary, BreakControl, ContinueControl, RuntimeValue, IntegerValue, FloatValue, StringValue, BooleanValue, ObjectValue, KeywordArgumentsValue, ArrayValue, TupleValue, FunctionValue, NullValue, UndefinedValue, Environment, Interpreter, NEWLINE, OPEN_STATEMENT, CLOSE_STATEMENT, Template, WHISPER_LANGUAGES, WHISPER_LANGUAGE_MAPPING, WHISPER_TO_LANGUAGE_CODE_MAPPING, PUNCTUATION_REGEX, PUNCTUATION_ONLY_REGEX, BLOOM_SPLIT_CHARS, PROBLEMATIC_REGEX_MAP, AddedToken, TokenizerModel, WordPieceTokenizer, Unigram, BYTES_TO_UNICODE, UNICODE_TO_BYTES, BPE, LegacyTokenizerModel, Normalizer, Replace, UnicodeNormalizer, NFC, NFD, NFKC, NFKD, StripNormalizer, StripAccents, Lowercase, Prepend, NormalizerSequence, BertNormalizer, PreTokenizer, BertPreTokenizer, ByteLevelPreTokenizer, SplitPreTokenizer, PunctuationPreTokenizer, DigitsPreTokenizer, PostProcessor, BertProcessing, RobertaProcessing, TemplateProcessing, ByteLevelPostProcessor, PostProcessorSequence, Decoder, ReplaceDecoder, ByteFallback, FuseDecoder, StripDecoder, WordPieceDecoder, ByteLevelDecoder, CTCDecoder, DecoderSequence, BPEDecoder, VitsDecoder, MetaspacePreTokenizer, MetaspaceDecoder, Precompiled, PreTokenizerSequence, WhitespacePreTokenizer, WhitespaceSplit, ReplacePreTokenizer, FixedLengthPreTokenizer, SPECIAL_TOKEN_ATTRIBUTES, PreTrainedTokenizer, TokenizersBackend, BertTokenizer, AlbertTokenizer, MobileBertTokenizer, SqueezeBertTokenizer, DebertaTokenizer, DebertaV2Tokenizer, HerbertTokenizer, ConvBertTokenizer, RoFormerTokenizer, DistilBertTokenizer, CamembertTokenizer, XLMTokenizer, ElectraTokenizer, T5Tokenizer, GPT2Tokenizer, BartTokenizer, MBartTokenizer, MBart50Tokenizer, RobertaTokenizer, BloomTokenizer, SPIECE_UNDERLINE, LlamaTokenizer, CodeLlamaTokenizer, XLMRobertaTokenizer, MPNetTokenizer, FalconTokenizer, GPTNeoXTokenizer, EsmTokenizer, Qwen2Tokenizer, GemmaTokenizer, Grok1Tokenizer, NllbTokenizer, M2M100Tokenizer, WhisperTokenizer, CodeGenTokenizer, 
CLIPTokenizer, SiglipTokenizer, MarianTokenizer, Wav2Vec2CTCTokenizer, BlenderbotTokenizer, BlenderbotSmallTokenizer, SpeechT5Tokenizer, NougatTokenizer, VitsTokenizer, CohereTokenizer, MgpstrTokenizer, Ernie4_5_Tokenizer, _a2, AutoTokenizer, GITHUB_ISSUE_URL, FEATURE_EXTRACTOR_NAME, IMAGE_PROCESSOR_NAME, PROCESSOR_NAME, CHAT_TEMPLATE_NAME, _a3, Processor, processors_exports, FeatureExtractor, feature_extractors_exports, noop2, Readable2, noop3, pipeline2, HERTZ_TO_MEL_MAPPING, MEL_TO_HERTZ_MAPPING, RawAudio, ASTFeatureExtractor, EncodecFeatureExtractor, ChatterboxFeatureExtractor, ClapFeatureExtractor, DacFeatureExtractor, Gemma3nAudioFeatureExtractor, MoonshineFeatureExtractor, EPSILON, ParakeetFeatureExtractor, PyAnnoteFeatureExtractor, SeamlessM4TFeatureExtractor, SnacFeatureExtractor, SpeechT5FeatureExtractor, Wav2Vec2FeatureExtractor, WeSpeakerFeatureExtractor, WhisperFeatureExtractor, emptyObj5, sharp_default, createCanvasFunction, ImageDataClass, loadImageFunction, IS_BROWSER_OR_WEBWORKER, RESAMPLING_MAPPING, CONTENT_TYPE_MAP2, RawImage, load_image, ImageProcessor, AutoFeatureExtractor, _a4, ChatterboxProcessor, image_processors_exports, BeitFeatureExtractor, BitImageProcessor, ChineseCLIPFeatureExtractor, CLIPImageProcessor, CLIPFeatureExtractor, ConvNextImageProcessor, ConvNextFeatureExtractor, DeiTImageProcessor, DeiTFeatureExtractor, DetrImageProcessor, DetrFeatureExtractor, DINOv3ViTImageProcessor, DonutImageProcessor, DonutFeatureExtractor, DPTImageProcessor, DPTFeatureExtractor, EfficientNetImageProcessor, GLPNFeatureExtractor, GroundingDinoImageProcessor, Idefics3ImageProcessor, VLMImageProcessor, JinaCLIPImageProcessor, LlavaOnevisionImageProcessor, MaskFormerImageProcessor, MaskFormerFeatureExtractor, Mask2FormerImageProcessor, MobileNetV1ImageProcessor, MobileNetV1FeatureExtractor, MobileNetV2ImageProcessor, MobileNetV2FeatureExtractor, MobileNetV3ImageProcessor, MobileNetV3FeatureExtractor, MobileNetV4ImageProcessor, MobileNetV4FeatureExtractor, MobileViTImageProcessor, MobileViTFeatureExtractor, NougatImageProcessor, OwlViTImageProcessor, OwlViTFeatureExtractor, Owlv2ImageProcessor, IMAGE_SIZE, SLICE_AXES, ceil, floor, sqrt, Phi3VImageProcessor, PvtImageProcessor, Qwen2VLImageProcessor, RTDetrImageProcessor, SamImageProcessor, SegformerImageProcessor, SegformerFeatureExtractor, SiglipImageProcessor, Swin2SRImageProcessor, ViTImageProcessor, ViTFeatureExtractor, VitMatteImageProcessor, VitPoseImageProcessor, YolosImageProcessor, YolosFeatureExtractor, AutoImageProcessor, _a5, Florence2Processor, _a6, Gemma3nProcessor, _a7, GroundingDinoProcessor, _a8, Idefics3Processor, _a9, VLChatProcessor, _a10, JinaCLIPProcessor, _a11, LlavaProcessor, DECODE_TYPE_MAPPING, _a12, MgpstrProcessor, _a13, MoonshineProcessor, _a14, OwlViTProcessor, IMAGE_TOKEN, IMAGE_TOKEN_PATTERN, _a15, Phi3VProcessor, IMAGE_TOKEN2, _a16, PaliGemmaProcessor, _a17, PyAnnoteProcessor, _a18, Qwen2VLProcessor, _a19, SamProcessor, Sam2Processor, Sam2VideoProcessor, _a20, SpeechT5Processor, _a21, UltravoxProcessor, AUDIO_TOKEN, BEGIN_AUDIO_TOKEN, NUM_AUDIO_TOKENS, _a22, VoxtralProcessor, _a23, Wav2Vec2Processor, _a24, Wav2Vec2ProcessorWithLM, _a25, WhisperProcessor, AutoProcessor, PretrainedConfig, AutoConfig, DEVICE_TYPES, isWebGpuFp16Supported, DATA_TYPES, DEFAULT_DEVICE_DTYPE_MAPPING, DEFAULT_DTYPE_SUFFIX_MAPPING, LogitsProcessor, LogitsWarper, LogitsProcessorList, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, SuppressTokensAtBeginLogitsProcessor, WhisperTimeStampLogitsProcessor, 
NoRepeatNGramLogitsProcessor, RepetitionPenaltyLogitsProcessor, MinLengthLogitsProcessor, MinNewTokensLengthLogitsProcessor, NoBadWordsLogitsProcessor, ClassifierFreeGuidanceLogitsProcessor, TemperatureLogitsWarper, TopPLogitsWarper, TopKLogitsWarper, GenerationConfig, StoppingCriteria, StoppingCriteriaList, MaxLengthCriteria, EosTokenCriteria, InterruptableStoppingCriteria, LogitsSampler, GreedySampler, MultinomialSampler, BeamSearchSampler, WhisperGenerationConfig, MODEL_TYPES, MODEL_TYPE_MAPPING, MODEL_NAME_TO_CLASS_MAPPING, MODEL_CLASS_TO_NAME_MAPPING, PreTrainedModel, ModelOutput, BaseModelOutput, BertPreTrainedModel, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering, NeoBertPreTrainedModel, NeoBertModel, NeoBertForMaskedLM, NeoBertForSequenceClassification, NeoBertForTokenClassification, NeoBertForQuestionAnswering, ModernBertPreTrainedModel, ModernBertModel, ModernBertForMaskedLM, ModernBertForSequenceClassification, ModernBertForTokenClassification, ModernBertDecoderPreTrainedModel, ModernBertDecoderModel, ModernBertDecoderForCausalLM, NomicBertPreTrainedModel, NomicBertModel, RoFormerPreTrainedModel, RoFormerModel, RoFormerForMaskedLM, RoFormerForSequenceClassification, RoFormerForTokenClassification, RoFormerForQuestionAnswering, ConvBertPreTrainedModel, ConvBertModel, ConvBertForMaskedLM, ConvBertForSequenceClassification, ConvBertForTokenClassification, ConvBertForQuestionAnswering, ElectraPreTrainedModel, ElectraModel, ElectraForMaskedLM, ElectraForSequenceClassification, ElectraForTokenClassification, ElectraForQuestionAnswering, CamembertPreTrainedModel, CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForTokenClassification, CamembertForQuestionAnswering, DebertaPreTrainedModel, DebertaModel, DebertaForMaskedLM, DebertaForSequenceClassification, DebertaForTokenClassification, DebertaForQuestionAnswering, DebertaV2PreTrainedModel, DebertaV2Model, DebertaV2ForMaskedLM, DebertaV2ForSequenceClassification, DebertaV2ForTokenClassification, DebertaV2ForQuestionAnswering, DistilBertPreTrainedModel, DistilBertModel, DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertForQuestionAnswering, DistilBertForMaskedLM, EsmPreTrainedModel, EsmModel, EsmForMaskedLM, EsmForSequenceClassification, EsmForTokenClassification, MobileBertPreTrainedModel, MobileBertModel, MobileBertForMaskedLM, MobileBertForSequenceClassification, MobileBertForQuestionAnswering, MPNetPreTrainedModel, MPNetModel, MPNetForMaskedLM, MPNetForSequenceClassification, MPNetForTokenClassification, MPNetForQuestionAnswering, SqueezeBertPreTrainedModel, SqueezeBertModel, SqueezeBertForMaskedLM, SqueezeBertForSequenceClassification, SqueezeBertForQuestionAnswering, AlbertPreTrainedModel, AlbertModel, AlbertForSequenceClassification, AlbertForQuestionAnswering, AlbertForMaskedLM, T5PreTrainedModel, T5Model, T5ForConditionalGeneration, LongT5PreTrainedModel, LongT5Model, LongT5ForConditionalGeneration, MT5PreTrainedModel, MT5Model, MT5ForConditionalGeneration, BartPretrainedModel, BartModel, BartForConditionalGeneration, BartForSequenceClassification, MBartPreTrainedModel, MBartModel, MBartForConditionalGeneration, MBartForSequenceClassification, MBartForCausalLM, BlenderbotPreTrainedModel, BlenderbotModel, BlenderbotForConditionalGeneration, BlenderbotSmallPreTrainedModel, BlenderbotSmallModel, BlenderbotSmallForConditionalGeneration, RobertaPreTrainedModel, RobertaModel, RobertaForMaskedLM, 
RobertaForSequenceClassification, RobertaForTokenClassification, RobertaForQuestionAnswering, XLMPreTrainedModel, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForTokenClassification, XLMForQuestionAnswering, XLMRobertaPreTrainedModel, XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaForQuestionAnswering, ASTPreTrainedModel, ASTModel, ASTForAudioClassification, WhisperPreTrainedModel, WhisperModel, WhisperForConditionalGeneration, LiteWhisperForConditionalGeneration, MoonshinePreTrainedModel, MoonshineModel, MoonshineForConditionalGeneration, VisionEncoderDecoderModel, LlavaPreTrainedModel, LlavaForConditionalGeneration, LlavaOnevisionForConditionalGeneration, Moondream1ForConditionalGeneration, Florence2PreTrainedModel, Florence2ForConditionalGeneration, PaliGemmaPreTrainedModel, PaliGemmaForConditionalGeneration, LlavaQwen2ForCausalLM, Gemma3nPreTrainedModel, Gemma3nForConditionalGeneration, Idefics3PreTrainedModel, Idefics3ForConditionalGeneration, SmolVLMForConditionalGeneration, Phi3VPreTrainedModel, Phi3VForCausalLM, CLIPPreTrainedModel, CLIPModel, CLIPTextModel, CLIPTextModelWithProjection, CLIPVisionModel, CLIPVisionModelWithProjection, SiglipPreTrainedModel, SiglipModel, SiglipTextModel, SiglipVisionModel, ChineseCLIPPreTrainedModel, ChineseCLIPModel, JinaCLIPPreTrainedModel, JinaCLIPModel, JinaCLIPTextModel, JinaCLIPVisionModel, CLIPSegPreTrainedModel, CLIPSegModel, CLIPSegForImageSegmentation, GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, GptOssPreTrainedModel, GptOssModel, GptOssForCausalLM, JAISPreTrainedModel, JAISModel, JAISLMHeadModel, GPTNeoPreTrainedModel, GPTNeoModel, GPTNeoForCausalLM, GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXForCausalLM, GPTJPreTrainedModel, GPTJModel, GPTJForCausalLM, GPTBigCodePreTrainedModel, GPTBigCodeModel, GPTBigCodeForCausalLM, CodeGenPreTrainedModel, CodeGenModel, CodeGenForCausalLM, LlamaPreTrainedModel, LlamaModel, LlamaForCausalLM, Llama4PreTrainedModel, Llama4ForCausalLM, NanoChatPreTrainedModel, NanoChatModel, NanoChatForCausalLM, ApertusPreTrainedModel, ApertusModel, ApertusForCausalLM, ArceePreTrainedModel, ArceeModel, ArceeForCausalLM, Lfm2PreTrainedModel, Lfm2Model, Lfm2ForCausalLM, SmolLM3PreTrainedModel, SmolLM3Model, SmolLM3ForCausalLM, HeliumPreTrainedModel, HeliumModel, HeliumForCausalLM, GlmPreTrainedModel, GlmModel, GlmForCausalLM, ExaonePreTrainedModel, ExaoneModel, ExaoneForCausalLM, MobileLLMPreTrainedModel, MobileLLMModel, MobileLLMForCausalLM, OlmoPreTrainedModel, OlmoModel, OlmoForCausalLM, Olmo2PreTrainedModel, Olmo2Model, Olmo2ForCausalLM, Olmo3PreTrainedModel, Olmo3Model, Olmo3ForCausalLM, GranitePreTrainedModel, GraniteModel, GraniteForCausalLM, GraniteMoeHybridPreTrainedModel, GraniteMoeHybridModel, GraniteMoeHybridForCausalLM, CoherePreTrainedModel, CohereModel, CohereForCausalLM, GemmaPreTrainedModel, GemmaModel, GemmaForCausalLM, Gemma2PreTrainedModel, Gemma2Model, Gemma2ForCausalLM, VaultGemmaPreTrainedModel, VaultGemmaModel, VaultGemmaForCausalLM, Gemma3PreTrainedModel, Gemma3Model, Gemma3ForCausalLM, OpenELMPreTrainedModel, OpenELMModel, OpenELMForCausalLM, Qwen2PreTrainedModel, Qwen2Model, Qwen2ForCausalLM, Qwen3PreTrainedModel, Qwen3Model, Qwen3ForCausalLM, Qwen2VLPreTrainedModel, Qwen2VLForConditionalGeneration, PhiPreTrainedModel, PhiModel, PhiForCausalLM, Phi3PreTrainedModel, Phi3Model, Phi3ForCausalLM, BloomPreTrainedModel, BloomModel, BloomForCausalLM, MptPreTrainedModel, MptModel, MptForCausalLM, 
OPTPreTrainedModel, OPTModel, OPTForCausalLM, ViTPreTrainedModel, ViTModel, ViTForImageClassification, IJepaPreTrainedModel, IJepaModel, IJepaForImageClassification, VitPosePreTrainedModel, VitPoseForPoseEstimation, PvtPreTrainedModel, PvtModel, PvtForImageClassification, ViTMAEPreTrainedModel, ViTMAEModel, ViTMSNPreTrainedModel, ViTMSNModel, ViTMSNForImageClassification, GroupViTPreTrainedModel, GroupViTModel, FastViTPreTrainedModel, FastViTModel, FastViTForImageClassification, VitMattePreTrainedModel, VitMatteForImageMatting, MobileViTPreTrainedModel, MobileViTModel, MobileViTForImageClassification, MobileViTV2PreTrainedModel, MobileViTV2Model, MobileViTV2ForImageClassification, OwlViTPreTrainedModel, OwlViTModel, OwlViTForObjectDetection, Owlv2PreTrainedModel, Owlv2Model, Owlv2ForObjectDetection, BeitPreTrainedModel, BeitModel, BeitForImageClassification, DetrPreTrainedModel, DetrModel, DetrForObjectDetection, DetrForSegmentation, DetrObjectDetectionOutput, DetrSegmentationOutput, RTDetrPreTrainedModel, RTDetrModel, RTDetrForObjectDetection, RTDetrObjectDetectionOutput, RTDetrV2PreTrainedModel, RTDetrV2Model, RTDetrV2ForObjectDetection, RTDetrV2ObjectDetectionOutput, RFDetrPreTrainedModel, RFDetrModel, RFDetrForObjectDetection, RFDetrObjectDetectionOutput, DFinePreTrainedModel, DFineModel, DFineForObjectDetection, TableTransformerPreTrainedModel, TableTransformerModel, TableTransformerForObjectDetection, TableTransformerObjectDetectionOutput, DeiTPreTrainedModel, DeiTModel, DeiTForImageClassification, HieraPreTrainedModel, HieraModel, HieraForImageClassification, ResNetPreTrainedModel, ResNetModel, ResNetForImageClassification, SwinPreTrainedModel, SwinModel, SwinForImageClassification, SwinForSemanticSegmentation, Swin2SRPreTrainedModel, Swin2SRModel, Swin2SRForImageSuperResolution, DPTPreTrainedModel, DPTModel, DPTForDepthEstimation, DepthAnythingPreTrainedModel, DepthAnythingForDepthEstimation, SapiensPreTrainedModel, SapiensForSemanticSegmentation, SapiensForDepthEstimation, SapiensForNormalEstimation, DepthProPreTrainedModel, DepthProForDepthEstimation, Metric3DPreTrainedModel, Metric3DForDepthEstimation, Metric3Dv2PreTrainedModel, Metric3Dv2ForDepthEstimation, MaskFormerPreTrainedModel, MaskFormerModel, MaskFormerForInstanceSegmentation, GLPNPreTrainedModel, GLPNModel, GLPNForDepthEstimation, DonutSwinPreTrainedModel, DonutSwinModel, ConvNextPreTrainedModel, ConvNextModel, ConvNextForImageClassification, ConvNextV2PreTrainedModel, ConvNextV2Model, ConvNextV2ForImageClassification, Dinov2PreTrainedModel, Dinov2Model, Dinov2ForImageClassification, Dinov2WithRegistersPreTrainedModel, Dinov2WithRegistersModel, Dinov2WithRegistersForImageClassification, DINOv3ViTPreTrainedModel, DINOv3ViTModel, DINOv3ConvNextPreTrainedModel, DINOv3ConvNextModel, GroundingDinoPreTrainedModel, GroundingDinoForObjectDetection, YolosPreTrainedModel, YolosModel, YolosForObjectDetection, YolosObjectDetectionOutput, SamPreTrainedModel, SamModel, SamImageSegmentationOutput, Sam2ImageSegmentationOutput, Sam2PreTrainedModel, Sam2Model, EdgeTamModel, Sam3TrackerModel, MarianPreTrainedModel, MarianModel, MarianMTModel, M2M100PreTrainedModel, M2M100Model, M2M100ForConditionalGeneration, Wav2Vec2PreTrainedModel, Wav2Vec2Model, Wav2Vec2ForCTC, Wav2Vec2ForSequenceClassification, Wav2Vec2ForAudioFrameClassification, ParakeetPreTrainedModel, ParakeetForCTC, PyAnnotePreTrainedModel, PyAnnoteModel, PyAnnoteForAudioFrameClassification, WeSpeakerResNetPreTrainedModel, WeSpeakerResNetModel, UniSpeechPreTrainedModel, 
UniSpeechModel, UniSpeechForCTC, UniSpeechForSequenceClassification, UniSpeechSatPreTrainedModel, UniSpeechSatModel, UniSpeechSatForCTC, UniSpeechSatForSequenceClassification, UniSpeechSatForAudioFrameClassification, Wav2Vec2BertPreTrainedModel, Wav2Vec2BertModel, Wav2Vec2BertForCTC, Wav2Vec2BertForSequenceClassification, HubertPreTrainedModel, HubertModel, HubertForCTC, HubertForSequenceClassification, WavLMPreTrainedModel, WavLMModel, WavLMForCTC, WavLMForSequenceClassification, WavLMForXVector, WavLMForAudioFrameClassification, StyleTextToSpeech2PreTrainedModel, StyleTextToSpeech2Model, SpeechT5PreTrainedModel, SpeechT5Model, SpeechT5ForSpeechToText, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SupertonicPreTrainedModel, SupertonicForConditionalGeneration, TrOCRPreTrainedModel, TrOCRForCausalLM, MistralPreTrainedModel, MistralModel, MistralForCausalLM, Ernie4_5_PretrainedModel, Ernie4_5_Model, Ernie4_5_ForCausalLM, Starcoder2PreTrainedModel, Starcoder2Model, Starcoder2ForCausalLM, FalconPreTrainedModel, FalconModel, FalconForCausalLM, ClapPreTrainedModel, ClapModel, ClapTextModelWithProjection, ClapAudioModelWithProjection, VitsPreTrainedModel, VitsModel, SegformerPreTrainedModel, SegformerModel, SegformerForImageClassification, SegformerForSemanticSegmentation, StableLmPreTrainedModel, StableLmModel, StableLmForCausalLM, EfficientNetPreTrainedModel, EfficientNetModel, EfficientNetForImageClassification, MusicgenPreTrainedModel, MusicgenModel, MusicgenForCausalLM, MusicgenForConditionalGeneration, MobileNetV1PreTrainedModel, MobileNetV1Model, MobileNetV1ForImageClassification, MobileNetV1ForSemanticSegmentation, MobileNetV2PreTrainedModel, MobileNetV2Model, MobileNetV2ForImageClassification, MobileNetV2ForSemanticSegmentation, MobileNetV3PreTrainedModel, MobileNetV3Model, MobileNetV3ForImageClassification, MobileNetV3ForSemanticSegmentation, MobileNetV4PreTrainedModel, MobileNetV4Model, MobileNetV4ForImageClassification, MobileNetV4ForSemanticSegmentation, DecisionTransformerPreTrainedModel, DecisionTransformerModel, MultiModalityPreTrainedModel, MultiModalityCausalLM, MgpstrModelOutput, MgpstrPreTrainedModel, MgpstrForSceneTextRecognition, PatchTSTPreTrainedModel, PatchTSTModel, PatchTSTForPrediction, PatchTSMixerPreTrainedModel, PatchTSMixerModel, PatchTSMixerForPrediction, UltravoxPreTrainedModel, UltravoxModel, VoxtralForConditionalGeneration, MimiPreTrainedModel, MimiEncoderOutput, MimiDecoderOutput, MimiModel, MimiEncoderModel, MimiDecoderModel, DacPreTrainedModel, DacEncoderOutput, DacDecoderOutput, DacModel, DacEncoderModel, DacDecoderModel, SnacPreTrainedModel, SnacModel, SnacEncoderModel, SnacDecoderModel, ChatterboxPreTrainedModel, ChatterboxModel, _a26, PretrainedMixin, MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_MAPPING_NAMES_AUTO_ENCODER, MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_MULTIMODALITY_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, 
MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_FOR_CTC_MAPPING_NAMES, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_CLASS_TYPE_MAPPING, CUSTOM_MAPPING, CUSTOM_ARCHITECTURES, _a27, AutoModel, _a28, AutoModelForSequenceClassification, _a29, AutoModelForTokenClassification, _a30, AutoModelForSeq2SeqLM, _a31, AutoModelForSpeechSeq2Seq, _a32, AutoModelForTextToSpectrogram, _a33, AutoModelForTextToWaveform, _a34, AutoModelForCausalLM, _a35, AutoModelForMaskedLM, _a36, AutoModelForQuestionAnswering, _a37, AutoModelForVision2Seq, _a38, AutoModelForImageClassification, _a39, AutoModelForImageSegmentation, _a40, AutoModelForSemanticSegmentation, _a41, AutoModelForUniversalSegmentation, _a42, AutoModelForObjectDetection, _a43, AutoModelForZeroShotObjectDetection, _a44, AutoModelForMaskGeneration, _a45, AutoModelForCTC, _a46, AutoModelForAudioClassification, _a47, AutoModelForXVector, _a48, AutoModelForAudioFrameClassification, _a49, AutoModelForDocumentQuestionAnswering, _a50, AutoModelForImageMatting, _a51, AutoModelForImageToImage, _a52, AutoModelForDepthEstimation, _a53, AutoModelForNormalEstimation, _a54, AutoModelForPoseEstimation, _a55, AutoModelForImageFeatureExtraction, _a56, AutoModelForImageTextToText, _a57, AutoModelForAudioTextToText, Seq2SeqLMOutput, SequenceClassifierOutput, XVectorOutput, TokenClassifierOutput, MaskedLMOutput, QuestionAnsweringModelOutput, CausalLMOutput, CausalLMOutputWithPast, ImageMattingOutput, VitsModelOutput, Pipeline, TextClassificationPipeline, TokenClassificationPipeline, QuestionAnsweringPipeline, FillMaskPipeline, Text2TextGenerationPipeline, SummarizationPipeline, TranslationPipeline, TextGenerationPipeline, ZeroShotClassificationPipeline, AudioClassificationPipeline, ZeroShotAudioClassificationPipeline, AutomaticSpeechRecognitionPipeline, TextToAudioPipeline, ImageToTextPipeline, ImageClassificationPipeline, ImageSegmentationPipeline, BackgroundRemovalPipeline, ZeroShotImageClassificationPipeline, ObjectDetectionPipeline, ZeroShotObjectDetectionPipeline, DocumentQuestionAnsweringPipeline, ImageToImagePipeline, DepthEstimationPipeline, FeatureExtractionPipeline, ImageFeatureExtractionPipeline, SUPPORTED_TASKS, TASK_ALIASES, RawVideoFrame, RawVideo, BaseStreamer, stdout_write, TextStreamer, WhisperTextStreamer;
+
var ONNX_WEB, import_meta, __defProp2, __export2, emptyObj, node_fs_default, emptyObj2, node_path_default, emptyObj3, node_url_default, VERSION, IS_PROCESS_AVAILABLE, IS_NODE_ENV, IS_FS_AVAILABLE, IS_PATH_AVAILABLE, IS_DENO_RUNTIME, IS_BUN_RUNTIME, IS_BROWSER_ENV, IS_WEBWORKER_ENV, IS_WEB_CACHE_AVAILABLE, IS_WEBGPU_AVAILABLE, IS_WEBNN_AVAILABLE, isSafari2, IS_SAFARI, apis, RUNNING_LOCALLY, dirname__, DEFAULT_CACHE_DIR, DEFAULT_LOCAL_MODEL_PATH, localModelPath, env2, Callable, CONTENT_TYPE_MAP, FileResponse, FileCache, ERROR_MAPPING, MAX_EXTERNAL_DATA_CHUNKS, REPO_ID_REGEX, P2FFT, NP2FFT, FFT, uint16_to_float32, onnxruntime_node_exports, noop, emptyObj4, onnxruntime_node_default, Readable, pipeline, createWriteStream, createReadStream, DEVICE_TO_EXECUTION_PROVIDER_MAPPING, LOG_LEVELS, DEFAULT_LOG_LEVEL, supportedDevices, defaultDevices, ONNX, ORT_SYMBOL, InferenceSession2, IS_WEB_ENV, webInitChain, wasmLoadPromise, webInferenceChain, ONNX_ENV, wrap, _a, TensorOpRegistry, DataTypeMap, Tensor22, arrayToIndexTensor, PriorityQueue, CharTrie, CharTrieNode, TokenLattice, TokenLatticeNode, DictionarySplitter, LRUCache, TOKEN_TYPES, Token, ORDERED_MAPPING_TABLE, ESCAPE_CHARACTERS, Statement, Program, If, For, Break, Continue, SetStatement, Macro, Comment, Expression, MemberExpression, CallExpression, Identifier, Literal, IntegerLiteral, FloatLiteral, StringLiteral, ArrayLiteral, TupleLiteral, ObjectLiteral, BinaryExpression, FilterExpression, FilterStatement, SelectExpression, TestExpression, UnaryExpression, SliceExpression, KeywordArgumentExpression, SpreadExpression, CallStatement, Ternary, BreakControl, ContinueControl, RuntimeValue, IntegerValue, FloatValue, StringValue, BooleanValue, ObjectValue, KeywordArgumentsValue, ArrayValue, TupleValue, FunctionValue, NullValue, UndefinedValue, Environment, Interpreter, NEWLINE, OPEN_STATEMENT, CLOSE_STATEMENT, Template, WHISPER_LANGUAGES, WHISPER_LANGUAGE_MAPPING, WHISPER_TO_LANGUAGE_CODE_MAPPING, PUNCTUATION_REGEX, PUNCTUATION_ONLY_REGEX, BLOOM_SPLIT_CHARS, PROBLEMATIC_REGEX_MAP, AddedToken, TokenizerModel, WordPieceTokenizer, Unigram, BYTES_TO_UNICODE, UNICODE_TO_BYTES, BPE, LegacyTokenizerModel, Normalizer, Replace, UnicodeNormalizer, NFC, NFD, NFKC, NFKD, StripNormalizer, StripAccents, Lowercase, Prepend, NormalizerSequence, BertNormalizer, PreTokenizer, BertPreTokenizer, ByteLevelPreTokenizer, SplitPreTokenizer, PunctuationPreTokenizer, DigitsPreTokenizer, PostProcessor, BertProcessing, RobertaProcessing, TemplateProcessing, ByteLevelPostProcessor, PostProcessorSequence, Decoder, ReplaceDecoder, ByteFallback, FuseDecoder, StripDecoder, WordPieceDecoder, ByteLevelDecoder, CTCDecoder, DecoderSequence, BPEDecoder, VitsDecoder, MetaspacePreTokenizer, MetaspaceDecoder, Precompiled, PreTokenizerSequence, WhitespacePreTokenizer, WhitespaceSplit, ReplacePreTokenizer, FixedLengthPreTokenizer, SPECIAL_TOKEN_ATTRIBUTES, PreTrainedTokenizer, TokenizersBackend, BertTokenizer, AlbertTokenizer, MobileBertTokenizer, SqueezeBertTokenizer, DebertaTokenizer, DebertaV2Tokenizer, HerbertTokenizer, ConvBertTokenizer, RoFormerTokenizer, DistilBertTokenizer, CamembertTokenizer, XLMTokenizer, ElectraTokenizer, T5Tokenizer, GPT2Tokenizer, BartTokenizer, MBartTokenizer, MBart50Tokenizer, RobertaTokenizer, BloomTokenizer, SPIECE_UNDERLINE, LlamaTokenizer, CodeLlamaTokenizer, XLMRobertaTokenizer, MPNetTokenizer, FalconTokenizer, GPTNeoXTokenizer, EsmTokenizer, Qwen2Tokenizer, GemmaTokenizer, Grok1Tokenizer, NllbTokenizer, M2M100Tokenizer, WhisperTokenizer, CodeGenTokenizer, 
CLIPTokenizer, SiglipTokenizer, MarianTokenizer, Wav2Vec2CTCTokenizer, BlenderbotTokenizer, BlenderbotSmallTokenizer, SpeechT5Tokenizer, NougatTokenizer, VitsTokenizer, CohereTokenizer, MgpstrTokenizer, Ernie4_5_Tokenizer, _a2, AutoTokenizer, GITHUB_ISSUE_URL, FEATURE_EXTRACTOR_NAME, IMAGE_PROCESSOR_NAME, PROCESSOR_NAME, CHAT_TEMPLATE_NAME, _a3, Processor, processors_exports, FeatureExtractor, feature_extractors_exports, noop2, Readable2, noop3, pipeline2, HERTZ_TO_MEL_MAPPING, MEL_TO_HERTZ_MAPPING, RawAudio, ASTFeatureExtractor, EncodecFeatureExtractor, ChatterboxFeatureExtractor, ClapFeatureExtractor, DacFeatureExtractor, Gemma3nAudioFeatureExtractor, MoonshineFeatureExtractor, EPSILON, ParakeetFeatureExtractor, PyAnnoteFeatureExtractor, SeamlessM4TFeatureExtractor, SnacFeatureExtractor, SpeechT5FeatureExtractor, Wav2Vec2FeatureExtractor, WeSpeakerFeatureExtractor, WhisperFeatureExtractor, emptyObj5, sharp_default, createCanvasFunction, ImageDataClass, loadImageFunction, IS_BROWSER_OR_WEBWORKER, RESAMPLING_MAPPING, CONTENT_TYPE_MAP2, RawImage, load_image, ImageProcessor, AutoFeatureExtractor, _a4, ChatterboxProcessor, image_processors_exports, BeitFeatureExtractor, BitImageProcessor, ChineseCLIPFeatureExtractor, CLIPImageProcessor, CLIPFeatureExtractor, ConvNextImageProcessor, ConvNextFeatureExtractor, DeiTImageProcessor, DeiTFeatureExtractor, DetrImageProcessor, DetrFeatureExtractor, DINOv3ViTImageProcessor, DonutImageProcessor, DonutFeatureExtractor, DPTImageProcessor, DPTFeatureExtractor, EfficientNetImageProcessor, GLPNFeatureExtractor, GroundingDinoImageProcessor, Idefics3ImageProcessor, VLMImageProcessor, JinaCLIPImageProcessor, LlavaOnevisionImageProcessor, MaskFormerImageProcessor, MaskFormerFeatureExtractor, Mask2FormerImageProcessor, MobileNetV1ImageProcessor, MobileNetV1FeatureExtractor, MobileNetV2ImageProcessor, MobileNetV2FeatureExtractor, MobileNetV3ImageProcessor, MobileNetV3FeatureExtractor, MobileNetV4ImageProcessor, MobileNetV4FeatureExtractor, MobileViTImageProcessor, MobileViTFeatureExtractor, NougatImageProcessor, OwlViTImageProcessor, OwlViTFeatureExtractor, Owlv2ImageProcessor, IMAGE_SIZE, SLICE_AXES, ceil, floor, sqrt, Phi3VImageProcessor, PvtImageProcessor, Qwen2VLImageProcessor, RTDetrImageProcessor, SamImageProcessor, SegformerImageProcessor, SegformerFeatureExtractor, SiglipImageProcessor, Swin2SRImageProcessor, ViTImageProcessor, ViTFeatureExtractor, VitMatteImageProcessor, VitPoseImageProcessor, YolosImageProcessor, YolosFeatureExtractor, AutoImageProcessor, _a5, Florence2Processor, _a6, Gemma3nProcessor, _a7, GroundingDinoProcessor, _a8, Idefics3Processor, _a9, VLChatProcessor, _a10, JinaCLIPProcessor, _a11, LlavaProcessor, DECODE_TYPE_MAPPING, _a12, MgpstrProcessor, _a13, MoonshineProcessor, _a14, OwlViTProcessor, IMAGE_TOKEN, IMAGE_TOKEN_PATTERN, _a15, Phi3VProcessor, IMAGE_TOKEN2, _a16, PaliGemmaProcessor, _a17, PyAnnoteProcessor, _a18, Qwen2VLProcessor, _a19, SamProcessor, Sam2Processor, Sam2VideoProcessor, _a20, SpeechT5Processor, _a21, UltravoxProcessor, AUDIO_TOKEN, BEGIN_AUDIO_TOKEN, NUM_AUDIO_TOKENS, _a22, VoxtralProcessor, _a23, Wav2Vec2Processor, _a24, Wav2Vec2ProcessorWithLM, _a25, WhisperProcessor, AutoProcessor, PretrainedConfig, AutoConfig, DEVICE_TYPES, isWebGpuFp16Supported, DATA_TYPES, DEFAULT_DEVICE_DTYPE_MAPPING, DEFAULT_DTYPE_SUFFIX_MAPPING, LogitsProcessor, LogitsWarper, LogitsProcessorList, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, SuppressTokensAtBeginLogitsProcessor, WhisperTimeStampLogitsProcessor, 
NoRepeatNGramLogitsProcessor, RepetitionPenaltyLogitsProcessor, MinLengthLogitsProcessor, MinNewTokensLengthLogitsProcessor, NoBadWordsLogitsProcessor, ClassifierFreeGuidanceLogitsProcessor, TemperatureLogitsWarper, TopPLogitsWarper, TopKLogitsWarper, GenerationConfig, StoppingCriteria, StoppingCriteriaList, MaxLengthCriteria, EosTokenCriteria, InterruptableStoppingCriteria, LogitsSampler, GreedySampler, MultinomialSampler, BeamSearchSampler, WhisperGenerationConfig, MODEL_TYPES, MODEL_TYPE_MAPPING, MODEL_NAME_TO_CLASS_MAPPING, MODEL_CLASS_TO_NAME_MAPPING, PreTrainedModel, ModelOutput, BaseModelOutput, BertPreTrainedModel, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering, NeoBertPreTrainedModel, NeoBertModel, NeoBertForMaskedLM, NeoBertForSequenceClassification, NeoBertForTokenClassification, NeoBertForQuestionAnswering, ModernBertPreTrainedModel, ModernBertModel, ModernBertForMaskedLM, ModernBertForSequenceClassification, ModernBertForTokenClassification, ModernBertDecoderPreTrainedModel, ModernBertDecoderModel, ModernBertDecoderForCausalLM, NomicBertPreTrainedModel, NomicBertModel, RoFormerPreTrainedModel, RoFormerModel, RoFormerForMaskedLM, RoFormerForSequenceClassification, RoFormerForTokenClassification, RoFormerForQuestionAnswering, ConvBertPreTrainedModel, ConvBertModel, ConvBertForMaskedLM, ConvBertForSequenceClassification, ConvBertForTokenClassification, ConvBertForQuestionAnswering, ElectraPreTrainedModel, ElectraModel, ElectraForMaskedLM, ElectraForSequenceClassification, ElectraForTokenClassification, ElectraForQuestionAnswering, CamembertPreTrainedModel, CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForTokenClassification, CamembertForQuestionAnswering, DebertaPreTrainedModel, DebertaModel, DebertaForMaskedLM, DebertaForSequenceClassification, DebertaForTokenClassification, DebertaForQuestionAnswering, DebertaV2PreTrainedModel, DebertaV2Model, DebertaV2ForMaskedLM, DebertaV2ForSequenceClassification, DebertaV2ForTokenClassification, DebertaV2ForQuestionAnswering, DistilBertPreTrainedModel, DistilBertModel, DistilBertForSequenceClassification, DistilBertForTokenClassification, DistilBertForQuestionAnswering, DistilBertForMaskedLM, EsmPreTrainedModel, EsmModel, EsmForMaskedLM, EsmForSequenceClassification, EsmForTokenClassification, MobileBertPreTrainedModel, MobileBertModel, MobileBertForMaskedLM, MobileBertForSequenceClassification, MobileBertForQuestionAnswering, MPNetPreTrainedModel, MPNetModel, MPNetForMaskedLM, MPNetForSequenceClassification, MPNetForTokenClassification, MPNetForQuestionAnswering, SqueezeBertPreTrainedModel, SqueezeBertModel, SqueezeBertForMaskedLM, SqueezeBertForSequenceClassification, SqueezeBertForQuestionAnswering, AlbertPreTrainedModel, AlbertModel, AlbertForSequenceClassification, AlbertForQuestionAnswering, AlbertForMaskedLM, T5PreTrainedModel, T5Model, T5ForConditionalGeneration, LongT5PreTrainedModel, LongT5Model, LongT5ForConditionalGeneration, MT5PreTrainedModel, MT5Model, MT5ForConditionalGeneration, BartPretrainedModel, BartModel, BartForConditionalGeneration, BartForSequenceClassification, MBartPreTrainedModel, MBartModel, MBartForConditionalGeneration, MBartForSequenceClassification, MBartForCausalLM, BlenderbotPreTrainedModel, BlenderbotModel, BlenderbotForConditionalGeneration, BlenderbotSmallPreTrainedModel, BlenderbotSmallModel, BlenderbotSmallForConditionalGeneration, RobertaPreTrainedModel, RobertaModel, RobertaForMaskedLM, 
RobertaForSequenceClassification, RobertaForTokenClassification, RobertaForQuestionAnswering, XLMPreTrainedModel, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForTokenClassification, XLMForQuestionAnswering, XLMRobertaPreTrainedModel, XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification, XLMRobertaForQuestionAnswering, ASTPreTrainedModel, ASTModel, ASTForAudioClassification, WhisperPreTrainedModel, WhisperModel, WhisperForConditionalGeneration, LiteWhisperForConditionalGeneration, MoonshinePreTrainedModel, MoonshineModel, MoonshineForConditionalGeneration, VisionEncoderDecoderModel, LlavaPreTrainedModel, LlavaForConditionalGeneration, LlavaOnevisionForConditionalGeneration, Moondream1ForConditionalGeneration, Florence2PreTrainedModel, Florence2ForConditionalGeneration, PaliGemmaPreTrainedModel, PaliGemmaForConditionalGeneration, LlavaQwen2ForCausalLM, Gemma3nPreTrainedModel, Gemma3nForConditionalGeneration, Idefics3PreTrainedModel, Idefics3ForConditionalGeneration, SmolVLMForConditionalGeneration, Phi3VPreTrainedModel, Phi3VForCausalLM, CLIPPreTrainedModel, CLIPModel, CLIPTextModel, CLIPTextModelWithProjection, CLIPVisionModel, CLIPVisionModelWithProjection, SiglipPreTrainedModel, SiglipModel, SiglipTextModel, SiglipVisionModel, ChineseCLIPPreTrainedModel, ChineseCLIPModel, JinaCLIPPreTrainedModel, JinaCLIPModel, JinaCLIPTextModel, JinaCLIPVisionModel, CLIPSegPreTrainedModel, CLIPSegModel, CLIPSegForImageSegmentation, GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, GptOssPreTrainedModel, GptOssModel, GptOssForCausalLM, JAISPreTrainedModel, JAISModel, JAISLMHeadModel, GPTNeoPreTrainedModel, GPTNeoModel, GPTNeoForCausalLM, GPTNeoXPreTrainedModel, GPTNeoXModel, GPTNeoXForCausalLM, GPTJPreTrainedModel, GPTJModel, GPTJForCausalLM, GPTBigCodePreTrainedModel, GPTBigCodeModel, GPTBigCodeForCausalLM, CodeGenPreTrainedModel, CodeGenModel, CodeGenForCausalLM, LlamaPreTrainedModel, LlamaModel, LlamaForCausalLM, Llama4PreTrainedModel, Llama4ForCausalLM, NanoChatPreTrainedModel, NanoChatModel, NanoChatForCausalLM, ApertusPreTrainedModel, ApertusModel, ApertusForCausalLM, ArceePreTrainedModel, ArceeModel, ArceeForCausalLM, Lfm2PreTrainedModel, Lfm2Model, Lfm2ForCausalLM, SmolLM3PreTrainedModel, SmolLM3Model, SmolLM3ForCausalLM, HeliumPreTrainedModel, HeliumModel, HeliumForCausalLM, GlmPreTrainedModel, GlmModel, GlmForCausalLM, ExaonePreTrainedModel, ExaoneModel, ExaoneForCausalLM, MobileLLMPreTrainedModel, MobileLLMModel, MobileLLMForCausalLM, OlmoPreTrainedModel, OlmoModel, OlmoForCausalLM, Olmo2PreTrainedModel, Olmo2Model, Olmo2ForCausalLM, Olmo3PreTrainedModel, Olmo3Model, Olmo3ForCausalLM, GranitePreTrainedModel, GraniteModel, GraniteForCausalLM, GraniteMoeHybridPreTrainedModel, GraniteMoeHybridModel, GraniteMoeHybridForCausalLM, CoherePreTrainedModel, CohereModel, CohereForCausalLM, GemmaPreTrainedModel, GemmaModel, GemmaForCausalLM, Gemma2PreTrainedModel, Gemma2Model, Gemma2ForCausalLM, VaultGemmaPreTrainedModel, VaultGemmaModel, VaultGemmaForCausalLM, Gemma3PreTrainedModel, Gemma3Model, Gemma3ForCausalLM, OpenELMPreTrainedModel, OpenELMModel, OpenELMForCausalLM, Qwen2PreTrainedModel, Qwen2Model, Qwen2ForCausalLM, Qwen3PreTrainedModel, Qwen3Model, Qwen3ForCausalLM, Qwen2VLPreTrainedModel, Qwen2VLForConditionalGeneration, PhiPreTrainedModel, PhiModel, PhiForCausalLM, Phi3PreTrainedModel, Phi3Model, Phi3ForCausalLM, BloomPreTrainedModel, BloomModel, BloomForCausalLM, MptPreTrainedModel, MptModel, MptForCausalLM, 
OPTPreTrainedModel, OPTModel, OPTForCausalLM, ViTPreTrainedModel, ViTModel, ViTForImageClassification, IJepaPreTrainedModel, IJepaModel, IJepaForImageClassification, VitPosePreTrainedModel, VitPoseForPoseEstimation, PvtPreTrainedModel, PvtModel, PvtForImageClassification, ViTMAEPreTrainedModel, ViTMAEModel, ViTMSNPreTrainedModel, ViTMSNModel, ViTMSNForImageClassification, GroupViTPreTrainedModel, GroupViTModel, FastViTPreTrainedModel, FastViTModel, FastViTForImageClassification, VitMattePreTrainedModel, VitMatteForImageMatting, MobileViTPreTrainedModel, MobileViTModel, MobileViTForImageClassification, MobileViTV2PreTrainedModel, MobileViTV2Model, MobileViTV2ForImageClassification, OwlViTPreTrainedModel, OwlViTModel, OwlViTForObjectDetection, Owlv2PreTrainedModel, Owlv2Model, Owlv2ForObjectDetection, BeitPreTrainedModel, BeitModel, BeitForImageClassification, DetrPreTrainedModel, DetrModel, DetrForObjectDetection, DetrForSegmentation, DetrObjectDetectionOutput, DetrSegmentationOutput, RTDetrPreTrainedModel, RTDetrModel, RTDetrForObjectDetection, RTDetrObjectDetectionOutput, RTDetrV2PreTrainedModel, RTDetrV2Model, RTDetrV2ForObjectDetection, RTDetrV2ObjectDetectionOutput, RFDetrPreTrainedModel, RFDetrModel, RFDetrForObjectDetection, RFDetrObjectDetectionOutput, DFinePreTrainedModel, DFineModel, DFineForObjectDetection, TableTransformerPreTrainedModel, TableTransformerModel, TableTransformerForObjectDetection, TableTransformerObjectDetectionOutput, DeiTPreTrainedModel, DeiTModel, DeiTForImageClassification, HieraPreTrainedModel, HieraModel, HieraForImageClassification, ResNetPreTrainedModel, ResNetModel, ResNetForImageClassification, SwinPreTrainedModel, SwinModel, SwinForImageClassification, SwinForSemanticSegmentation, Swin2SRPreTrainedModel, Swin2SRModel, Swin2SRForImageSuperResolution, DPTPreTrainedModel, DPTModel, DPTForDepthEstimation, DepthAnythingPreTrainedModel, DepthAnythingForDepthEstimation, SapiensPreTrainedModel, SapiensForSemanticSegmentation, SapiensForDepthEstimation, SapiensForNormalEstimation, DepthProPreTrainedModel, DepthProForDepthEstimation, Metric3DPreTrainedModel, Metric3DForDepthEstimation, Metric3Dv2PreTrainedModel, Metric3Dv2ForDepthEstimation, MaskFormerPreTrainedModel, MaskFormerModel, MaskFormerForInstanceSegmentation, GLPNPreTrainedModel, GLPNModel, GLPNForDepthEstimation, DonutSwinPreTrainedModel, DonutSwinModel, ConvNextPreTrainedModel, ConvNextModel, ConvNextForImageClassification, ConvNextV2PreTrainedModel, ConvNextV2Model, ConvNextV2ForImageClassification, Dinov2PreTrainedModel, Dinov2Model, Dinov2ForImageClassification, Dinov2WithRegistersPreTrainedModel, Dinov2WithRegistersModel, Dinov2WithRegistersForImageClassification, DINOv3ViTPreTrainedModel, DINOv3ViTModel, DINOv3ConvNextPreTrainedModel, DINOv3ConvNextModel, GroundingDinoPreTrainedModel, GroundingDinoForObjectDetection, YolosPreTrainedModel, YolosModel, YolosForObjectDetection, YolosObjectDetectionOutput, SamPreTrainedModel, SamModel, SamImageSegmentationOutput, Sam2ImageSegmentationOutput, Sam2PreTrainedModel, Sam2Model, EdgeTamModel, Sam3TrackerModel, MarianPreTrainedModel, MarianModel, MarianMTModel, M2M100PreTrainedModel, M2M100Model, M2M100ForConditionalGeneration, Wav2Vec2PreTrainedModel, Wav2Vec2Model, Wav2Vec2ForCTC, Wav2Vec2ForSequenceClassification, Wav2Vec2ForAudioFrameClassification, ParakeetPreTrainedModel, ParakeetForCTC, PyAnnotePreTrainedModel, PyAnnoteModel, PyAnnoteForAudioFrameClassification, WeSpeakerResNetPreTrainedModel, WeSpeakerResNetModel, UniSpeechPreTrainedModel, 
UniSpeechModel, UniSpeechForCTC, UniSpeechForSequenceClassification, UniSpeechSatPreTrainedModel, UniSpeechSatModel, UniSpeechSatForCTC, UniSpeechSatForSequenceClassification, UniSpeechSatForAudioFrameClassification, Wav2Vec2BertPreTrainedModel, Wav2Vec2BertModel, Wav2Vec2BertForCTC, Wav2Vec2BertForSequenceClassification, HubertPreTrainedModel, HubertModel, HubertForCTC, HubertForSequenceClassification, WavLMPreTrainedModel, WavLMModel, WavLMForCTC, WavLMForSequenceClassification, WavLMForXVector, WavLMForAudioFrameClassification, StyleTextToSpeech2PreTrainedModel, StyleTextToSpeech2Model, SpeechT5PreTrainedModel, SpeechT5Model, SpeechT5ForSpeechToText, SpeechT5ForTextToSpeech, SpeechT5HifiGan, SupertonicPreTrainedModel, SupertonicForConditionalGeneration, TrOCRPreTrainedModel, TrOCRForCausalLM, MistralPreTrainedModel, MistralModel, MistralForCausalLM, Ernie4_5_PretrainedModel, Ernie4_5_Model, Ernie4_5_ForCausalLM, Starcoder2PreTrainedModel, Starcoder2Model, Starcoder2ForCausalLM, FalconPreTrainedModel, FalconModel, FalconForCausalLM, ClapPreTrainedModel, ClapModel, ClapTextModelWithProjection, ClapAudioModelWithProjection, VitsPreTrainedModel, VitsModel, SegformerPreTrainedModel, SegformerModel, SegformerForImageClassification, SegformerForSemanticSegmentation, StableLmPreTrainedModel, StableLmModel, StableLmForCausalLM, EfficientNetPreTrainedModel, EfficientNetModel, EfficientNetForImageClassification, MusicgenPreTrainedModel, MusicgenModel, MusicgenForCausalLM, MusicgenForConditionalGeneration, MobileNetV1PreTrainedModel, MobileNetV1Model, MobileNetV1ForImageClassification, MobileNetV1ForSemanticSegmentation, MobileNetV2PreTrainedModel, MobileNetV2Model, MobileNetV2ForImageClassification, MobileNetV2ForSemanticSegmentation, MobileNetV3PreTrainedModel, MobileNetV3Model, MobileNetV3ForImageClassification, MobileNetV3ForSemanticSegmentation, MobileNetV4PreTrainedModel, MobileNetV4Model, MobileNetV4ForImageClassification, MobileNetV4ForSemanticSegmentation, DecisionTransformerPreTrainedModel, DecisionTransformerModel, MultiModalityPreTrainedModel, MultiModalityCausalLM, MgpstrModelOutput, MgpstrPreTrainedModel, MgpstrForSceneTextRecognition, PatchTSTPreTrainedModel, PatchTSTModel, PatchTSTForPrediction, PatchTSMixerPreTrainedModel, PatchTSMixerModel, PatchTSMixerForPrediction, UltravoxPreTrainedModel, UltravoxModel, VoxtralForConditionalGeneration, MimiPreTrainedModel, MimiEncoderOutput, MimiDecoderOutput, MimiModel, MimiEncoderModel, MimiDecoderModel, DacPreTrainedModel, DacEncoderOutput, DacDecoderOutput, DacModel, DacEncoderModel, DacDecoderModel, SnacPreTrainedModel, SnacModel, SnacEncoderModel, SnacDecoderModel, ChatterboxPreTrainedModel, ChatterboxModel, _a26, PretrainedMixin, MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_MAPPING_NAMES_AUTO_ENCODER, MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_MULTIMODALITY_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, 
MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_FOR_CTC_MAPPING_NAMES, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_CLASS_TYPE_MAPPING, CUSTOM_MAPPING, CUSTOM_ARCHITECTURES, _a27, AutoModel, _a28, AutoModelForSequenceClassification, _a29, AutoModelForTokenClassification, _a30, AutoModelForSeq2SeqLM, _a31, AutoModelForSpeechSeq2Seq, _a32, AutoModelForTextToSpectrogram, _a33, AutoModelForTextToWaveform, _a34, AutoModelForCausalLM, _a35, AutoModelForMaskedLM, _a36, AutoModelForQuestionAnswering, _a37, AutoModelForVision2Seq, _a38, AutoModelForImageClassification, _a39, AutoModelForImageSegmentation, _a40, AutoModelForSemanticSegmentation, _a41, AutoModelForUniversalSegmentation, _a42, AutoModelForObjectDetection, _a43, AutoModelForZeroShotObjectDetection, _a44, AutoModelForMaskGeneration, _a45, AutoModelForCTC, _a46, AutoModelForAudioClassification, _a47, AutoModelForXVector, _a48, AutoModelForAudioFrameClassification, _a49, AutoModelForDocumentQuestionAnswering, _a50, AutoModelForImageMatting, _a51, AutoModelForImageToImage, _a52, AutoModelForDepthEstimation, _a53, AutoModelForNormalEstimation, _a54, AutoModelForPoseEstimation, _a55, AutoModelForImageFeatureExtraction, _a56, AutoModelForImageTextToText, _a57, AutoModelForAudioTextToText, Seq2SeqLMOutput, SequenceClassifierOutput, XVectorOutput, TokenClassifierOutput, MaskedLMOutput, QuestionAnsweringModelOutput, CausalLMOutput, CausalLMOutputWithPast, ImageMattingOutput, VitsModelOutput, Pipeline, TextClassificationPipeline, TokenClassificationPipeline, QuestionAnsweringPipeline, FillMaskPipeline, Text2TextGenerationPipeline, SummarizationPipeline, TranslationPipeline, TextGenerationPipeline, ZeroShotClassificationPipeline, AudioClassificationPipeline, ZeroShotAudioClassificationPipeline, AutomaticSpeechRecognitionPipeline, TextToAudioPipeline, ImageToTextPipeline, ImageClassificationPipeline, ImageSegmentationPipeline, BackgroundRemovalPipeline, ZeroShotImageClassificationPipeline, ObjectDetectionPipeline, ZeroShotObjectDetectionPipeline, DocumentQuestionAnsweringPipeline, ImageToImagePipeline, DepthEstimationPipeline, FeatureExtractionPipeline, ImageFeatureExtractionPipeline, SUPPORTED_TASKS, TASK_ALIASES, RawVideoFrame, RawVideo, BaseStreamer, stdout_write, TextStreamer, WhisperTextStreamer;
 var init_transformers_web = __esm({
   "node_modules/@huggingface/transformers/dist/transformers.web.js"() {
     "use strict";
@@ -6014,7 +6014,7 @@ var init_transformers_web = __esm({
       IS_WEB_CACHE_AVAILABLE = typeof self !== "undefined" && "caches" in self;
       IS_WEBGPU_AVAILABLE = IS_NODE_ENV || typeof navigator !== "undefined" && "gpu" in navigator;
       IS_WEBNN_AVAILABLE = typeof navigator !== "undefined" && "ml" in navigator;
-      isSafari = () => {
+      isSafari2 = () => {
         if (typeof navigator === "undefined") {
           return false;
         }
@@ -6024,7 +6024,7 @@ var init_transformers_web = __esm({
         const notOtherBrowser = !userAgent.match(/CriOS|FxiOS|EdgiOS|OPiOS|mercury|brave/i) && !userAgent.includes("Chrome") && !userAgent.includes("Android");
         return isAppleVendor && notOtherBrowser;
       };
-      IS_SAFARI = isSafari();
+      IS_SAFARI = isSafari2();
       apis = Object.freeze({
         /** Whether we are running in a browser environment (and not a web worker) */
         IS_BROWSER_ENV,
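
The change in this region is a bundler rename: the isSafari helper that @huggingface/transformers defines internally is emitted as isSafari2 in 0.2.1, and the IS_SAFARI assignment follows it. esbuild adds this numeric suffix when two modules in the bundle would otherwise share a top-level name; the export list added further down includes the package's own isSafari, which is the likely collision. A minimal illustration of that behaviour, with hypothetical file names:

    // safari.ts (hypothetical): the package's own helper, kept under its original name.
    export const isSafari = () => /apple/i.test(globalThis.navigator?.vendor ?? "");
    // transformers.web.js also defines an internal isSafari; when esbuild flattens both
    // into one scope it keeps one name and suffixes the other:
    //   var isSafari = ...   (one module's helper)
    //   var isSafari2 = ...  (the other, as seen in this diff)
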
@@ -26505,6 +26505,7 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
 // src/index.ts
 var index_exports = {};
 __export(index_exports, {
+  ARKIT_BLENDSHAPES: () => ARKIT_BLENDSHAPES,
   AgentCoreAdapter: () => AgentCoreAdapter,
   AnimationGraph: () => AnimationGraph,
   AudioChunkCoalescer: () => AudioChunkCoalescer,
@@ -26540,6 +26541,8 @@ __export(index_exports, {
   SileroVADWorker: () => SileroVADWorker,
   SyncedAudioPipeline: () => SyncedAudioPipeline,
   TenantManager: () => TenantManager,
+  WAV2ARKIT_BLENDSHAPES: () => WAV2ARKIT_BLENDSHAPES,
+  Wav2ArkitCpuInference: () => Wav2ArkitCpuInference,
   Wav2Vec2Inference: () => Wav2Vec2Inference,
   WhisperInference: () => WhisperInference,
   blendEmotions: () => blendEmotions,
@@ -26551,6 +26554,7 @@ __export(index_exports, {
   configureLogging: () => configureLogging,
   configureTelemetry: () => configureTelemetry,
   createEmotionVector: () => createEmotionVector,
+  createLipSync: () => createLipSync,
   createLogger: () => createLogger,
   createSessionWithFallback: () => createSessionWithFallback,
   createSileroVAD: () => createSileroVAD,
@@ -26575,6 +26579,7 @@ __export(index_exports, {
   isIOSSafari: () => isIOSSafari,
   isMobile: () => isMobile,
   isOnnxRuntimeLoaded: () => isOnnxRuntimeLoaded,
+  isSafari: () => isSafari,
   isSpeechRecognitionAvailable: () => isSpeechRecognitionAvailable,
   isWebGPUAvailable: () => isWebGPUAvailable,
   lerpEmotion: () => lerpEmotion,
@@ -26583,15 +26588,18 @@ __export(index_exports, {
   nukeBrowserCaches: () => nukeBrowserCaches,
   parseHuggingFaceUrl: () => parseHuggingFaceUrl,
   preloadModels: () => preloadModels,
+  remapWav2ArkitToLam: () => remapWav2ArkitToLam,
   resetLoggingConfig: () => resetLoggingConfig,
   resolveBackend: () => resolveBackend,
   scanForInvalidCaches: () => scanForInvalidCaches,
   setLogLevel: () => setLogLevel,
   setLoggingEnabled: () => setLoggingEnabled,
   shouldEnableWasmProxy: () => shouldEnableWasmProxy,
+  shouldUseCpuLipSync: () => shouldUseCpuLipSync,
   shouldUseNativeASR: () => shouldUseNativeASR,
   shouldUseServerLipSync: () => shouldUseServerLipSync,
   supportsVADWorker: () => supportsVADWorker,
+  symmetrizeBlendshapes: () => symmetrizeBlendshapes,
   validateCachedResponse: () => validateCachedResponse
 });
 module.exports = __toCommonJS(index_exports);
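
The new exports point at a CPU lip-sync path (Wav2ArkitCpuInference, createLipSync, shouldUseCpuLipSync, remapWav2ArkitToLam) built around ARKit blendshape weights. The diff exposes only the names, not their signatures, so the sketch below is a generic illustration of what symmetrizing Left/Right ARKit blendshape pairs usually means; the function name and behaviour are assumptions, not the package's symmetrizeBlendshapes.

    // Hypothetical sketch: average each Left/Right ARKit pair so both sides of the
    // face move together. Not the package's implementation.
    function symmetrizePairs(weights: Record<string, number>): Record<string, number> {
      const out: Record<string, number> = { ...weights };
      for (const name of Object.keys(weights)) {
        if (!name.endsWith("Left")) continue;
        const mirror = name.slice(0, -4) + "Right";
        if (mirror in weights) {
          const avg = (weights[name] + weights[mirror]) / 2;
          out[name] = avg;
          out[mirror] = avg;
        }
      }
      return out;
    }
    // symmetrizePairs({ mouthSmileLeft: 0.8, mouthSmileRight: 0.4 }) -> both become 0.6
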
@@ -26832,6 +26840,19 @@ var AudioScheduler = class {
   async initialize() {
     console.log("[AudioScheduler] Ready for lazy initialization");
   }
+  /**
+   * Eagerly create and warm up the AudioContext
+   *
+   * Call this when a playback session starts (e.g., when AI response begins).
+   * The AudioContext needs time to initialize the audio hardware — on Windows
+   * this can take 50-100ms. By warming up early (before audio data arrives),
+   * the context is fully ready when schedule() is first called.
+   *
+   * Must be called after a user gesture (click/tap) for autoplay policy.
+   */
+  async warmup() {
+    await this.ensureContext();
+  }
   /**
    * Ensure AudioContext is created and ready
    * Called lazily on first schedule() - requires user gesture
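
The new warmup() simply forces the lazy ensureContext() path to run early. For readers unfamiliar with the pattern, a simplified version using only the standard Web Audio API is sketched below; it assumes the same lazy-creation approach the comment describes and is not the scheduler's actual code.

    // Simplified sketch of lazy AudioContext creation plus an explicit warmup.
    class MiniScheduler {
      private context: AudioContext | null = null;

      // Created on first use; must run after a user gesture for autoplay policy.
      private async ensureContext(): Promise<AudioContext> {
        if (!this.context) {
          this.context = new AudioContext();
        }
        if (this.context.state === "suspended") {
          await this.context.resume();
        }
        return this.context;
      }

      // Calling this when the session starts hides the 50-100 ms hardware spin-up.
      async warmup(): Promise<void> {
        await this.ensureContext();
      }
    }
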
@@ -26862,7 +26883,7 @@ var AudioScheduler = class {
     const ctx = await this.ensureContext();
     const channels = this.options.channels ?? 1;
     if (!this.isPlaying) {
-      this.nextPlayTime = ctx.currentTime;
+      this.nextPlayTime = ctx.currentTime + 0.05;
       this.isPlaying = true;
     }
     const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
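
Starting the first buffer exactly at ctx.currentTime is fragile: by the time source.start() executes, that timestamp may already be in the past, and the Web Audio API clamps it to "now", which can clip the start of playback. The added 0.05 gives 50 ms of headroom before the first chunk. A generic scheduling sketch with that margin (names and values illustrative):

    // Schedule back-to-back chunks with a small lead time on the first one.
    function scheduleChunk(ctx: AudioContext, samples: Float32Array,
                           state: { nextPlayTime: number }): number {
      const buffer = ctx.createBuffer(1, samples.length, ctx.sampleRate);
      buffer.copyToChannel(samples, 0);
      const source = ctx.createBufferSource();
      source.buffer = buffer;
      source.connect(ctx.destination);
      // Never start in the past; keep ~50 ms of headroom when playback begins.
      const when = Math.max(state.nextPlayTime, ctx.currentTime + 0.05);
      source.start(when);
      state.nextPlayTime = when + buffer.duration; // queue chunks back-to-back
      return when;
    }
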
@@ -26936,8 +26957,19 @@ var AudioScheduler = class {
   }
   /**
    * Reset scheduler state for new playback session
+   * Stops any orphaned sources that weren't cleaned up by cancelAll()
    */
   reset() {
+    if (this.context) {
+      const now = this.context.currentTime;
+      for (const { source, gainNode } of this.scheduledSources) {
+        try {
+          gainNode.gain.setValueAtTime(0, now);
+          source.stop(now);
+        } catch {
+        }
+      }
+    }
     this.nextPlayTime = 0;
     this.isPlaying = false;
     this.scheduledSources = [];
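
The cleanup loop can only stop sources the scheduler still knows about, so each scheduled source is evidently kept in this.scheduledSources together with its gain node until it finishes. A sketch of that bookkeeping is below; the entry shape matches the destructuring above, everything else is an assumption rather than the package's code.

    // Assumed bookkeeping behind scheduledSources: keep each live source with its
    // gain node so reset() can silence and stop it, and drop it once it ends.
    interface Scheduled { source: AudioBufferSourceNode; gainNode: GainNode; }

    function track(ctx: AudioContext, source: AudioBufferSourceNode,
                   scheduled: Scheduled[]): void {
      const gainNode = ctx.createGain();
      source.connect(gainNode).connect(ctx.destination);
      const entry: Scheduled = { source, gainNode };
      scheduled.push(entry);
      // Remove the entry once playback finishes so reset() only touches live sources.
      source.onended = () => {
        const i = scheduled.indexOf(entry);
        if (i !== -1) scheduled.splice(i, 1);
      };
    }
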
@@ -27065,7 +27097,7 @@ var LAMPipeline = class {
     newBuffer.set(this.buffer, 0);
     newBuffer.set(samples, this.buffer.length);
     this.buffer = newBuffer;
-
+    while (this.buffer.length >= this.REQUIRED_SAMPLES) {
       await this.processBuffer(lam);
     }
   }
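
Switching the check to a while loop means one large push drains the accumulator in fixed-size windows instead of leaving whole windows waiting for the next network chunk. The generic pattern looks like the sketch below; the window size and processing step are assumptions for illustration, not LAMPipeline internals.

    // Generic drain loop: append new samples, then consume full windows until
    // fewer than one window remains in the buffer.
    const WINDOW = 16000; // e.g. 1 s of audio at 16 kHz (illustrative)

    async function pushSamples(state: { buffer: Float32Array }, samples: Float32Array,
                               process: (window: Float32Array) => Promise<void>): Promise<void> {
      const merged = new Float32Array(state.buffer.length + samples.length);
      merged.set(state.buffer, 0);
      merged.set(samples, state.buffer.length);
      state.buffer = merged;
      while (state.buffer.length >= WINDOW) {
        await process(state.buffer.subarray(0, WINDOW));
        state.buffer = state.buffer.slice(WINDOW); // keep the remainder for next time
      }
    }
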
@@ -27218,12 +27250,20 @@ var LAMPipeline = class {
 };
 
 // src/audio/SyncedAudioPipeline.ts
+function pcm16ToFloat32(buffer) {
+  const byteLen = buffer.byteLength & ~1;
+  const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
+  const float32 = new Float32Array(int16.length);
+  for (let i = 0; i < int16.length; i++) {
+    float32[i] = int16[i] / 32768;
+  }
+  return float32;
+}
 var SyncedAudioPipeline = class extends EventEmitter {
   constructor(options) {
     super();
     this.options = options;
-    this.
-    this.bufferedChunks = [];
+    this.playbackStarted = false;
     this.monitorInterval = null;
     this.frameAnimationId = null;
     const sampleRate = options.sampleRate ?? 16e3;
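pcm16ToFloat32 above masks off a trailing odd byte (byteLength & ~1) so a half-sample at the end of a chunk cannot throw. A small worked example (not package code; the function is given an ArrayBuffer):

// Little-endian Int16 PCM: [0x00 0x40] = 16384, [0x00 0xc0] = -16384.
const bytes = new Uint8Array([0x00, 0x40, 0x00, 0xc0, 0x7f]); // 2.5 samples
const f32 = pcm16ToFloat32(bytes.buffer);
// f32[0] === 0.5   (16384 / 32768)
// f32[1] === -0.5  (-16384 / 32768)
// the dangling 0x7f byte is ignored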
@@ -27234,11 +27274,6 @@ var SyncedAudioPipeline = class extends EventEmitter {
     });
     this.lamPipeline = new LAMPipeline({
       sampleRate,
-      onInference: (frameCount) => {
-        if (this.waitingForFirstLAM) {
-          this.onFirstLAMComplete();
-        }
-      },
       onError: (error) => {
         this.emit("error", error);
       }
@@ -27254,25 +27289,24 @@ var SyncedAudioPipeline = class extends EventEmitter {
    * Start a new playback session
    *
    * Resets all state and prepares for incoming audio chunks.
-   *
+   * Audio will be scheduled immediately as chunks arrive (no buffering).
    */
   start() {
+    this.stopMonitoring();
     this.scheduler.reset();
     this.coalescer.reset();
     this.lamPipeline.reset();
-    this.
-    this.
+    this.playbackStarted = false;
+    this.scheduler.warmup();
     this.startFrameLoop();
     this.startMonitoring();
   }
   /**
    * Receive audio chunk from network
    *
-   *
-   *
-   *
-   * - Audio scheduling waits until first LAM completes
-   * - Then all buffered audio is scheduled together with LAM frames
+   * Audio-first design: schedules audio immediately, LAM runs in background.
+   * This prevents LAM inference (50-300ms) from blocking audio scheduling,
+   * which caused audible stuttering with continuous audio streams.
    *
    * @param chunk - Uint8Array containing Int16 PCM audio
    */
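A wiring sketch for the new audio-first flow (not from the package docs: the chunk-receiving method's name is not visible in this hunk, so push below is a placeholder; on() is assumed from the EventEmitter base; ws and lam are placeholder objects):

const pipeline = new SyncedAudioPipeline({ sampleRate: 16000, lam });
pipeline.on("playback_start", (t) => console.log("first chunk scheduled at", t));
pipeline.on("error", (err) => console.error(err));
pipeline.start(); // resets state and warms up the AudioContext
ws.onmessage = (e) => pipeline.push(new Uint8Array(e.data)); // Int16 PCM chunks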
@@ -27281,51 +27315,15 @@ var SyncedAudioPipeline = class extends EventEmitter {
     if (!combined) {
       return;
     }
-    const
-    const
-
-
-
-    if (this.waitingForFirstLAM) {
-      this.bufferedChunks.push(combined);
-      const estimatedTime = this.scheduler.getCurrentTime();
-      await this.lamPipeline.push(float32, estimatedTime, this.options.lam);
-    } else {
-      const scheduleTime = await this.scheduler.schedule(float32);
-      await this.lamPipeline.push(float32, scheduleTime, this.options.lam);
+    const float32 = pcm16ToFloat32(combined);
+    const scheduleTime = await this.scheduler.schedule(float32);
+    if (!this.playbackStarted) {
+      this.playbackStarted = true;
+      this.emit("playback_start", scheduleTime);
     }
-
-
-
-   *
-   * This is the critical synchronization point:
-   * - LAM frames are now ready in the queue
-   * - Schedule all buffered audio chunks
-   * - Adjust LAM frame timestamps to match actual schedule time
-   * - Audio and LAM start playing together, perfectly synchronized
-   */
-  async onFirstLAMComplete() {
-    this.waitingForFirstLAM = false;
-    const beforeSchedule = this.scheduler.getCurrentTime();
-    let actualStartTime = beforeSchedule;
-    for (let i = 0; i < this.bufferedChunks.length; i++) {
-      const buffer = this.bufferedChunks[i];
-      const int16 = new Int16Array(buffer);
-      const float32 = new Float32Array(int16.length);
-      for (let j = 0; j < int16.length; j++) {
-        float32[j] = int16[j] / 32768;
-      }
-      const scheduleTime = await this.scheduler.schedule(float32);
-      if (i === 0) {
-        actualStartTime = scheduleTime;
-      }
-    }
-    const timeOffset = actualStartTime - beforeSchedule;
-    if (timeOffset !== 0) {
-      this.lamPipeline.adjustTimestamps(timeOffset);
-    }
-    this.bufferedChunks = [];
-    this.emit("playback_start", actualStartTime);
+    this.lamPipeline.push(float32, scheduleTime, this.options.lam).catch((err) => {
+      this.emit("error", err);
+    });
   }
   /**
    * End of audio stream
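The LAM push above is deliberately not awaited; a standalone sketch of the same fire-and-forget pattern (not package code):

// schedule() stays on the latency-critical path; push() runs in the background
// and a rejected promise becomes an "error" event instead of an unhandled rejection.
async function scheduleThenAnalyze(scheduler, lamPipeline, emitter, float32, lamOptions) {
  const scheduleTime = await scheduler.schedule(float32);
  lamPipeline.push(float32, scheduleTime, lamOptions).catch((err) => {
    emitter.emit("error", err);
  });
  return scheduleTime;
}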
@@ -27357,10 +27355,9 @@ var SyncedAudioPipeline = class extends EventEmitter {
   async stop(fadeOutMs = 50) {
     this.stopMonitoring();
     await this.scheduler.cancelAll(fadeOutMs);
-    this.bufferedChunks = [];
     this.coalescer.reset();
     this.lamPipeline.reset();
-    this.
+    this.playbackStarted = false;
     this.emit("playback_complete", void 0);
   }
   /**
@@ -27417,8 +27414,7 @@ var SyncedAudioPipeline = class extends EventEmitter {
    */
   getState() {
     return {
-
-      bufferedChunks: this.bufferedChunks.length,
+      playbackStarted: this.playbackStarted,
       coalescerFill: this.coalescer.fillLevel,
       lamFill: this.lamPipeline.fillLevel,
       queuedFrames: this.lamPipeline.queuedFrameCount,
@@ -27434,7 +27430,6 @@ var SyncedAudioPipeline = class extends EventEmitter {
     this.scheduler.dispose();
     this.coalescer.reset();
     this.lamPipeline.reset();
-    this.bufferedChunks = [];
   }
 };
 
@@ -28876,12 +28871,12 @@ var Logger = class _Logger {
 };
 var loggerCache = /* @__PURE__ */ new Map();
 function createLogger(module2) {
-  let
-  if (!
-
-  loggerCache.set(module2,
+  let logger13 = loggerCache.get(module2);
+  if (!logger13) {
+    logger13 = new Logger(module2);
+    loggerCache.set(module2, logger13);
   }
-  return
+  return logger13;
 }
 var noopLogger = {
   module: "noop",
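The rewritten createLogger memoizes one Logger per module name; the observable effect, as a tiny sketch (not package code):

const a = createLogger("AudioScheduler");
const b = createLogger("AudioScheduler");
console.log(a === b); // true: repeated calls share one cached instance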
@@ -28925,7 +28920,7 @@ function hasWebGPUApi() {
   return "gpu" in navigator && navigator.gpu !== void 0;
 }
 function getRecommendedBackend() {
-  if (isIOS()) {
+  if (isSafari() || isIOS()) {
     return "wasm";
   }
   return "webgpu";
@@ -28969,6 +28964,14 @@ function shouldEnableWasmProxy() {
   }
   return true;
 }
+function isSafari() {
+  if (typeof navigator === "undefined") return false;
+  const ua = navigator.userAgent.toLowerCase();
+  return /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
+}
+function shouldUseCpuLipSync() {
+  return isSafari() || isIOS();
+}
 function isSpeechRecognitionAvailable() {
   if (typeof window === "undefined") return false;
   return "SpeechRecognition" in window || "webkitSpeechRecognition" in window;
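Rough expectations for the isSafari() user-agent check above (illustrative, truncated UA strings, not package tests): Chrome also ships a "Safari" token, so it is excluded by the chrome/crios/fxios/chromium/edg exclusion list.

const cases = [
  ["Mozilla/5.0 (Macintosh) ... Version/17.4 Safari/605.1.15", true],      // desktop Safari
  ["Mozilla/5.0 (Windows NT 10.0) ... Chrome/124.0 Safari/537.36", false], // Chrome
  ["Mozilla/5.0 (iPhone) ... CriOS/124.0 ... Safari/604.1", false],        // Chrome on iOS
];
for (const [ua, expected] of cases) {
  const lower = ua.toLowerCase();
  const detected = /safari/.test(lower) && !/chrome|crios|fxios|chromium|edg/.test(lower);
  console.assert(detected === expected, ua);
}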
@@ -29115,8 +29118,7 @@ function isOnnxRuntimeLoaded() {
   return ortInstance !== null;
 }
 
-// src/inference/
-var logger2 = createLogger("Wav2Vec2");
+// src/inference/blendshapeUtils.ts
 var LAM_BLENDSHAPES = [
   "browDownLeft",
   "browDownRight",
@@ -29171,40 +29173,7 @@ var LAM_BLENDSHAPES = [
   "noseSneerRight",
   "tongueOut"
 ];
-var
-  "<pad>",
-  "<s>",
-  "</s>",
-  "<unk>",
-  "|",
-  "E",
-  "T",
-  "A",
-  "O",
-  "N",
-  "I",
-  "H",
-  "S",
-  "R",
-  "D",
-  "L",
-  "U",
-  "M",
-  "W",
-  "C",
-  "F",
-  "G",
-  "Y",
-  "P",
-  "B",
-  "V",
-  "K",
-  "'",
-  "X",
-  "J",
-  "Q",
-  "Z"
-];
+var ARKIT_BLENDSHAPES = LAM_BLENDSHAPES;
 var ARKIT_SYMMETRIC_PAIRS = [
   ["jawLeft", "jawRight"],
   ["mouthLeft", "mouthRight"],
@@ -29240,6 +29209,107 @@ function symmetrizeBlendshapes(frame) {
|
|
|
29240
29209
|
}
|
|
29241
29210
|
return result;
|
|
29242
29211
|
}
|
|
29212
|
+
var WAV2ARKIT_BLENDSHAPES = [
|
|
29213
|
+
"browDownLeft",
|
|
29214
|
+
"browDownRight",
|
|
29215
|
+
"browInnerUp",
|
|
29216
|
+
"browOuterUpLeft",
|
|
29217
|
+
"browOuterUpRight",
|
|
29218
|
+
"cheekPuff",
|
|
29219
|
+
"cheekSquintLeft",
|
|
29220
|
+
"cheekSquintRight",
|
|
29221
|
+
"eyeBlinkLeft",
|
|
29222
|
+
"eyeBlinkRight",
|
|
29223
|
+
"eyeLookDownLeft",
|
|
29224
|
+
"eyeLookDownRight",
|
|
29225
|
+
"eyeLookInLeft",
|
|
29226
|
+
"eyeLookInRight",
|
|
29227
|
+
"eyeLookOutLeft",
|
|
29228
|
+
"eyeLookOutRight",
|
|
29229
|
+
"eyeLookUpLeft",
|
|
29230
|
+
"eyeLookUpRight",
|
|
29231
|
+
"eyeSquintLeft",
|
|
29232
|
+
"eyeSquintRight",
|
|
29233
|
+
"eyeWideLeft",
|
|
29234
|
+
"eyeWideRight",
|
|
29235
|
+
"jawForward",
|
|
29236
|
+
"jawLeft",
|
|
29237
|
+
"jawOpen",
|
|
29238
|
+
"mouthFrownLeft",
|
|
29239
|
+
"mouthFrownRight",
|
|
29240
|
+
"mouthFunnel",
|
|
29241
|
+
"mouthLeft",
|
|
29242
|
+
"mouthLowerDownLeft",
|
|
29243
|
+
"mouthLowerDownRight",
|
|
29244
|
+
"mouthPressLeft",
|
|
29245
|
+
"mouthPressRight",
|
|
29246
|
+
"mouthPucker",
|
|
29247
|
+
"mouthRight",
|
|
29248
|
+
"mouthRollLower",
|
|
29249
|
+
"mouthRollUpper",
|
|
29250
|
+
"mouthShrugLower",
|
|
29251
|
+
"mouthShrugUpper",
|
|
29252
|
+
"mouthSmileLeft",
|
|
29253
|
+
"mouthSmileRight",
|
|
29254
|
+
"mouthStretchLeft",
|
|
29255
|
+
"mouthStretchRight",
|
|
29256
|
+
"mouthUpperUpLeft",
|
|
29257
|
+
"mouthUpperUpRight",
|
|
29258
|
+
"noseSneerLeft",
|
|
29259
|
+
"noseSneerRight",
|
|
29260
|
+
"tongueOut",
|
|
29261
|
+
"mouthClose",
|
|
29262
|
+
"mouthDimpleLeft",
|
|
29263
|
+
"mouthDimpleRight",
|
|
29264
|
+
"jawRight"
|
|
29265
|
+
];
|
|
29266
|
+
var REMAP_WAV2ARKIT_TO_LAM = WAV2ARKIT_BLENDSHAPES.map(
|
|
29267
|
+
(name) => LAM_BLENDSHAPES.indexOf(name)
|
|
29268
|
+
);
|
|
29269
|
+
function remapWav2ArkitToLam(frame) {
|
|
29270
|
+
const result = new Float32Array(52);
|
|
29271
|
+
for (let i = 0; i < 52; i++) {
|
|
29272
|
+
result[REMAP_WAV2ARKIT_TO_LAM[i]] = frame[i];
|
|
29273
|
+
}
|
|
29274
|
+
return result;
|
|
29275
|
+
}
|
|
29276
|
+
|
|
29277
|
+
// src/inference/Wav2Vec2Inference.ts
|
|
29278
|
+
var logger2 = createLogger("Wav2Vec2");
|
|
29279
|
+
var CTC_VOCAB = [
|
|
29280
|
+
"<pad>",
|
|
29281
|
+
"<s>",
|
|
29282
|
+
"</s>",
|
|
29283
|
+
"<unk>",
|
|
29284
|
+
"|",
|
|
29285
|
+
"E",
|
|
29286
|
+
"T",
|
|
29287
|
+
"A",
|
|
29288
|
+
"O",
|
|
29289
|
+
"N",
|
|
29290
|
+
"I",
|
|
29291
|
+
"H",
|
|
29292
|
+
"S",
|
|
29293
|
+
"R",
|
|
29294
|
+
"D",
|
|
29295
|
+
"L",
|
|
29296
|
+
"U",
|
|
29297
|
+
"M",
|
|
29298
|
+
"W",
|
|
29299
|
+
"C",
|
|
29300
|
+
"F",
|
|
29301
|
+
"G",
|
|
29302
|
+
"Y",
|
|
29303
|
+
"P",
|
|
29304
|
+
"B",
|
|
29305
|
+
"V",
|
|
29306
|
+
"K",
|
|
29307
|
+
"'",
|
|
29308
|
+
"X",
|
|
29309
|
+
"J",
|
|
29310
|
+
"Q",
|
|
29311
|
+
"Z"
|
|
29312
|
+
];
|
|
29243
29313
|
var Wav2Vec2Inference = class {
|
|
29244
29314
|
constructor(config) {
|
|
29245
29315
|
this.session = null;
|
|
@@ -29478,6 +29548,7 @@ var Wav2Vec2Inference = class {
       blendshapes,
       asrLogits,
       text,
+      numFrames: numA2EFrames,
       numA2EFrames,
       numASRFrames,
       inferenceTimeMs
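The added numFrames field mirrors numA2EFrames, so the Wav2Vec2 result carries the same frame-count key as the new Wav2ArkitCpuInference result further down; a usage sketch (not package code):

const result = await lipSync.infer(audioSamples);
const frameCount = result.numFrames; // same key on either lip-sync backend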
@@ -29845,8 +29916,293 @@ var WhisperInference = class _WhisperInference {
|
|
|
29845
29916
|
}
|
|
29846
29917
|
};
|
|
29847
29918
|
|
|
29919
|
+
// src/inference/Wav2ArkitCpuInference.ts
|
|
29920
|
+
var logger5 = createLogger("Wav2ArkitCpu");
|
|
29921
|
+
var Wav2ArkitCpuInference = class {
|
|
29922
|
+
constructor(config) {
|
|
29923
|
+
this.session = null;
|
|
29924
|
+
this.ort = null;
|
|
29925
|
+
this._backend = "wasm";
|
|
29926
|
+
this.isLoading = false;
|
|
29927
|
+
// Inference queue for handling concurrent calls
|
|
29928
|
+
this.inferenceQueue = Promise.resolve();
|
|
29929
|
+
this.config = config;
|
|
29930
|
+
}
|
|
29931
|
+
get backend() {
|
|
29932
|
+
return this.session ? this._backend : null;
|
|
29933
|
+
}
|
|
29934
|
+
get isLoaded() {
|
|
29935
|
+
return this.session !== null;
|
|
29936
|
+
}
|
|
29937
|
+
/**
|
|
29938
|
+
* Load the ONNX model
|
|
29939
|
+
*/
|
|
29940
|
+
async load() {
|
|
29941
|
+
if (this.isLoading) {
|
|
29942
|
+
throw new Error("Model is already loading");
|
|
29943
|
+
}
|
|
29944
|
+
if (this.session) {
|
|
29945
|
+
throw new Error("Model already loaded. Call dispose() first.");
|
|
29946
|
+
}
|
|
29947
|
+
this.isLoading = true;
|
|
29948
|
+
const startTime = performance.now();
|
|
29949
|
+
const telemetry = getTelemetry();
|
|
29950
|
+
const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
|
|
29951
|
+
"model.url": this.config.modelUrl,
|
|
29952
|
+
"model.backend_requested": this.config.backend || "wasm"
|
|
29953
|
+
});
|
|
29954
|
+
try {
|
|
29955
|
+
const preference = this.config.backend || "wasm";
|
|
29956
|
+
logger5.info("Loading ONNX Runtime...", { preference });
|
|
29957
|
+
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
29958
|
+
this.ort = ort;
|
|
29959
|
+
this._backend = backend;
|
|
29960
|
+
logger5.info("ONNX Runtime loaded", { backend: this._backend });
|
|
29961
|
+
const cache = getModelCache();
|
|
29962
|
+
const modelUrl = this.config.modelUrl;
|
|
29963
|
+
const isCached = await cache.has(modelUrl);
|
|
29964
|
+
let modelBuffer;
|
|
29965
|
+
if (isCached) {
|
|
29966
|
+
logger5.debug("Loading model from cache", { modelUrl });
|
|
29967
|
+
modelBuffer = await cache.get(modelUrl);
|
|
29968
|
+
if (!modelBuffer) {
|
|
29969
|
+
logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
29970
|
+
await cache.delete(modelUrl);
|
|
29971
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
29972
|
+
}
|
|
29973
|
+
} else {
|
|
29974
|
+
logger5.debug("Fetching and caching model", { modelUrl });
|
|
29975
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
29976
|
+
}
|
|
29977
|
+
if (!modelBuffer) {
|
|
29978
|
+
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
29979
|
+
}
|
|
29980
|
+
logger5.debug("Creating ONNX session", {
|
|
29981
|
+
size: formatBytes(modelBuffer.byteLength),
|
|
29982
|
+
backend: this._backend
|
|
29983
|
+
});
|
|
29984
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
29985
|
+
const modelData = new Uint8Array(modelBuffer);
|
|
29986
|
+
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
29987
|
+
const loadTimeMs = performance.now() - startTime;
|
|
29988
|
+
logger5.info("Model loaded successfully", {
|
|
29989
|
+
backend: this._backend,
|
|
29990
|
+
loadTimeMs: Math.round(loadTimeMs),
|
|
29991
|
+
inputs: this.session.inputNames,
|
|
29992
|
+
outputs: this.session.outputNames
|
|
29993
|
+
});
|
|
29994
|
+
span?.setAttributes({
|
|
29995
|
+
"model.backend": this._backend,
|
|
29996
|
+
"model.load_time_ms": loadTimeMs,
|
|
29997
|
+
"model.cached": isCached
|
|
29998
|
+
});
|
|
29999
|
+
span?.end();
|
|
30000
|
+
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
30001
|
+
model: "wav2arkit_cpu",
|
|
30002
|
+
backend: this._backend
|
|
30003
|
+
});
|
|
30004
|
+
logger5.debug("Running warmup inference");
|
|
30005
|
+
const warmupStart = performance.now();
|
|
30006
|
+
const silentAudio = new Float32Array(16e3);
|
|
30007
|
+
await this.infer(silentAudio);
|
|
30008
|
+
const warmupTimeMs = performance.now() - warmupStart;
|
|
30009
|
+
logger5.info("Warmup inference complete", {
|
|
30010
|
+
warmupTimeMs: Math.round(warmupTimeMs),
|
|
30011
|
+
backend: this._backend
|
|
30012
|
+
});
|
|
30013
|
+
telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
|
|
30014
|
+
model: "wav2arkit_cpu",
|
|
30015
|
+
backend: this._backend
|
|
30016
|
+
});
|
|
30017
|
+
return {
|
|
30018
|
+
backend: this._backend,
|
|
30019
|
+
loadTimeMs,
|
|
30020
|
+
inputNames: [...this.session.inputNames],
|
|
30021
|
+
outputNames: [...this.session.outputNames]
|
|
30022
|
+
};
|
|
30023
|
+
} catch (error) {
|
|
30024
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
30025
|
+
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
30026
|
+
model: "wav2arkit_cpu",
|
|
30027
|
+
error_type: "load_failed"
|
|
30028
|
+
});
|
|
30029
|
+
throw error;
|
|
30030
|
+
} finally {
|
|
30031
|
+
this.isLoading = false;
|
|
30032
|
+
}
|
|
30033
|
+
}
|
|
30034
|
+
/**
|
|
30035
|
+
* Run inference on raw audio
|
|
30036
|
+
*
|
|
30037
|
+
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
30038
|
+
* Output frames = ceil(30 * numSamples / 16000).
|
|
30039
|
+
*
|
|
30040
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
30041
|
+
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
30042
|
+
*/
|
|
30043
|
+
async infer(audioSamples, _identityIndex) {
|
|
30044
|
+
if (!this.session) {
|
|
30045
|
+
throw new Error("Model not loaded. Call load() first.");
|
|
30046
|
+
}
|
|
30047
|
+
const audioCopy = new Float32Array(audioSamples);
|
|
30048
|
+
const feeds = {
|
|
30049
|
+
"audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
|
|
30050
|
+
};
|
|
30051
|
+
return this.queueInference(feeds, audioCopy.length);
|
|
30052
|
+
}
|
|
30053
|
+
/**
|
|
30054
|
+
* Queue inference to serialize ONNX session calls
|
|
30055
|
+
*/
|
|
30056
|
+
queueInference(feeds, inputSamples) {
|
|
30057
|
+
return new Promise((resolve, reject) => {
|
|
30058
|
+
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
30059
|
+
const telemetry = getTelemetry();
|
|
30060
|
+
const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
|
|
30061
|
+
"inference.backend": this._backend,
|
|
30062
|
+
"inference.input_samples": inputSamples
|
|
30063
|
+
});
|
|
30064
|
+
try {
|
|
30065
|
+
const startTime = performance.now();
|
|
30066
|
+
const results = await this.session.run(feeds);
|
|
30067
|
+
const inferenceTimeMs = performance.now() - startTime;
|
|
30068
|
+
const blendshapeOutput = results["blendshapes"];
|
|
30069
|
+
if (!blendshapeOutput) {
|
|
30070
|
+
throw new Error("Missing blendshapes output from model");
|
|
30071
|
+
}
|
|
30072
|
+
const blendshapeData = blendshapeOutput.data;
|
|
30073
|
+
const numFrames = blendshapeOutput.dims[1];
|
|
30074
|
+
const numBlendshapes = blendshapeOutput.dims[2];
|
|
30075
|
+
const blendshapes = [];
|
|
30076
|
+
for (let f = 0; f < numFrames; f++) {
|
|
30077
|
+
const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
|
|
30078
|
+
const remapped = remapWav2ArkitToLam(rawFrame);
|
|
30079
|
+
blendshapes.push(symmetrizeBlendshapes(remapped));
|
|
30080
|
+
}
|
|
30081
|
+
logger5.trace("Inference completed", {
|
|
30082
|
+
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
30083
|
+
numFrames,
|
|
30084
|
+
inputSamples
|
|
30085
|
+
});
|
|
30086
|
+
span?.setAttributes({
|
|
30087
|
+
"inference.duration_ms": inferenceTimeMs,
|
|
30088
|
+
"inference.frames": numFrames
|
|
30089
|
+
});
|
|
30090
|
+
span?.end();
|
|
30091
|
+
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
30092
|
+
model: "wav2arkit_cpu",
|
|
30093
|
+
backend: this._backend
|
|
30094
|
+
});
|
|
30095
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
30096
|
+
model: "wav2arkit_cpu",
|
|
30097
|
+
backend: this._backend,
|
|
30098
|
+
status: "success"
|
|
30099
|
+
});
|
|
30100
|
+
resolve({
|
|
30101
|
+
blendshapes,
|
|
30102
|
+
numFrames,
|
|
30103
|
+
inferenceTimeMs
|
|
30104
|
+
});
|
|
30105
|
+
} catch (err) {
|
|
30106
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
30107
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
30108
|
+
model: "wav2arkit_cpu",
|
|
30109
|
+
backend: this._backend,
|
|
30110
|
+
status: "error"
|
|
30111
|
+
});
|
|
30112
|
+
reject(err);
|
|
30113
|
+
}
|
|
30114
|
+
});
|
|
30115
|
+
});
|
|
30116
|
+
}
|
|
30117
|
+
/**
|
|
30118
|
+
* Dispose of the model and free resources
|
|
30119
|
+
*/
|
|
30120
|
+
async dispose() {
|
|
30121
|
+
if (this.session) {
|
|
30122
|
+
await this.session.release();
|
|
30123
|
+
this.session = null;
|
|
30124
|
+
}
|
|
30125
|
+
}
|
|
30126
|
+
};
|
|
30127
|
+
|
|
30128
|
+
// src/inference/createLipSync.ts
|
|
30129
|
+
var logger6 = createLogger("createLipSync");
|
|
30130
|
+
function createLipSync(config) {
|
|
30131
|
+
const mode = config.mode ?? "auto";
|
|
30132
|
+
const fallbackOnError = config.fallbackOnError ?? true;
|
|
30133
|
+
let useCpu;
|
|
30134
|
+
if (mode === "cpu") {
|
|
30135
|
+
useCpu = true;
|
|
30136
|
+
logger6.info("Forcing CPU lip sync model (wav2arkit_cpu)");
|
|
30137
|
+
} else if (mode === "gpu") {
|
|
30138
|
+
useCpu = false;
|
|
30139
|
+
logger6.info("Forcing GPU lip sync model (Wav2Vec2)");
|
|
30140
|
+
} else {
|
|
30141
|
+
useCpu = shouldUseCpuLipSync();
|
|
30142
|
+
logger6.info("Auto-detected lip sync model", {
|
|
30143
|
+
useCpu,
|
|
30144
|
+
isSafari: isSafari()
|
|
30145
|
+
});
|
|
30146
|
+
}
|
|
30147
|
+
if (useCpu) {
|
|
30148
|
+
logger6.info("Creating Wav2ArkitCpuInference (1.8MB, WASM)");
|
|
30149
|
+
return new Wav2ArkitCpuInference({
|
|
30150
|
+
modelUrl: config.cpuModelUrl
|
|
30151
|
+
});
|
|
30152
|
+
}
|
|
30153
|
+
const gpuInstance = new Wav2Vec2Inference({
|
|
30154
|
+
modelUrl: config.gpuModelUrl,
|
|
30155
|
+
backend: config.gpuBackend ?? "auto",
|
|
30156
|
+
numIdentityClasses: config.numIdentityClasses
|
|
30157
|
+
});
|
|
30158
|
+
if (fallbackOnError) {
|
|
30159
|
+
logger6.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
30160
|
+
return new LipSyncWithFallback(gpuInstance, config);
|
|
30161
|
+
}
|
|
30162
|
+
logger6.info("Creating Wav2Vec2Inference (no fallback)");
|
|
30163
|
+
return gpuInstance;
|
|
30164
|
+
}
|
|
30165
|
+
var LipSyncWithFallback = class {
|
|
30166
|
+
constructor(gpuInstance, config) {
|
|
30167
|
+
this.hasFallenBack = false;
|
|
30168
|
+
this.implementation = gpuInstance;
|
|
30169
|
+
this.config = config;
|
|
30170
|
+
}
|
|
30171
|
+
get backend() {
|
|
30172
|
+
return this.implementation.backend;
|
|
30173
|
+
}
|
|
30174
|
+
get isLoaded() {
|
|
30175
|
+
return this.implementation.isLoaded;
|
|
30176
|
+
}
|
|
30177
|
+
async load() {
|
|
30178
|
+
try {
|
|
30179
|
+
return await this.implementation.load();
|
|
30180
|
+
} catch (error) {
|
|
30181
|
+
logger6.warn("GPU model load failed, falling back to CPU model", {
|
|
30182
|
+
error: error instanceof Error ? error.message : String(error)
|
|
30183
|
+
});
|
|
30184
|
+
try {
|
|
30185
|
+
await this.implementation.dispose();
|
|
30186
|
+
} catch {
|
|
30187
|
+
}
|
|
30188
|
+
this.implementation = new Wav2ArkitCpuInference({
|
|
30189
|
+
modelUrl: this.config.cpuModelUrl
|
|
30190
|
+
});
|
|
30191
|
+
this.hasFallenBack = true;
|
|
30192
|
+
logger6.info("Fallback to Wav2ArkitCpuInference successful");
|
|
30193
|
+
return await this.implementation.load();
|
|
30194
|
+
}
|
|
30195
|
+
}
|
|
30196
|
+
async infer(audioSamples, identityIndex) {
|
|
30197
|
+
return this.implementation.infer(audioSamples, identityIndex);
|
|
30198
|
+
}
|
|
30199
|
+
async dispose() {
|
|
30200
|
+
return this.implementation.dispose();
|
|
30201
|
+
}
|
|
30202
|
+
};
|
|
30203
|
+
|
|
29848
30204
|
// src/inference/SileroVADInference.ts
|
|
29849
|
-
var
|
|
30205
|
+
var logger7 = createLogger("SileroVAD");
|
|
29850
30206
|
var SileroVADInference = class {
|
|
29851
30207
|
constructor(config) {
|
|
29852
30208
|
this.session = null;
|
|
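For orientation, a usage sketch of the createLipSync factory added above (the model URLs are placeholders, not real package assets; the field names follow the config object read by the factory):

const lipSync = createLipSync({
  mode: "auto",            // "cpu" | "gpu" | "auto" (Safari/iOS resolve to CPU)
  fallbackOnError: true,   // wrap the GPU model so a failed load falls back to CPU
  cpuModelUrl: "https://example.com/models/wav2arkit_cpu.onnx",
  gpuModelUrl: "https://example.com/models/wav2vec2_lam.onnx",
  gpuBackend: "auto",
});
await lipSync.load();      // LipSyncWithFallback may swap in the CPU model here
const { blendshapes, numFrames } = await lipSync.infer(float32Audio);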
@@ -29918,23 +30274,23 @@ var SileroVADInference = class {
|
|
|
29918
30274
|
"model.sample_rate": this.config.sampleRate
|
|
29919
30275
|
});
|
|
29920
30276
|
try {
|
|
29921
|
-
|
|
30277
|
+
logger7.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
29922
30278
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
29923
30279
|
this.ort = ort;
|
|
29924
30280
|
this._backend = backend;
|
|
29925
|
-
|
|
30281
|
+
logger7.info("ONNX Runtime loaded", { backend: this._backend });
|
|
29926
30282
|
const cache = getModelCache();
|
|
29927
30283
|
const modelUrl = this.config.modelUrl;
|
|
29928
30284
|
const isCached = await cache.has(modelUrl);
|
|
29929
30285
|
let modelBuffer;
|
|
29930
30286
|
if (isCached) {
|
|
29931
|
-
|
|
30287
|
+
logger7.debug("Loading model from cache", { modelUrl });
|
|
29932
30288
|
modelBuffer = await cache.get(modelUrl);
|
|
29933
30289
|
} else {
|
|
29934
|
-
|
|
30290
|
+
logger7.debug("Fetching and caching model", { modelUrl });
|
|
29935
30291
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
29936
30292
|
}
|
|
29937
|
-
|
|
30293
|
+
logger7.debug("Creating ONNX session", {
|
|
29938
30294
|
size: formatBytes(modelBuffer.byteLength),
|
|
29939
30295
|
backend: this._backend
|
|
29940
30296
|
});
|
|
@@ -29943,7 +30299,7 @@ var SileroVADInference = class {
|
|
|
29943
30299
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
29944
30300
|
this.reset();
|
|
29945
30301
|
const loadTimeMs = performance.now() - startTime;
|
|
29946
|
-
|
|
30302
|
+
logger7.info("Model loaded successfully", {
|
|
29947
30303
|
backend: this._backend,
|
|
29948
30304
|
loadTimeMs: Math.round(loadTimeMs),
|
|
29949
30305
|
sampleRate: this.config.sampleRate,
|
|
@@ -30096,7 +30452,7 @@ var SileroVADInference = class {
|
|
|
30096
30452
|
this.preSpeechBuffer.shift();
|
|
30097
30453
|
}
|
|
30098
30454
|
}
|
|
30099
|
-
|
|
30455
|
+
logger7.trace("Skipping VAD inference - audio too quiet", {
|
|
30100
30456
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
30101
30457
|
threshold: MIN_ENERGY_THRESHOLD
|
|
30102
30458
|
});
|
|
@@ -30150,7 +30506,7 @@ var SileroVADInference = class {
|
|
|
30150
30506
|
if (isSpeech && !this.wasSpeaking) {
|
|
30151
30507
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
30152
30508
|
this.preSpeechBuffer = [];
|
|
30153
|
-
|
|
30509
|
+
logger7.debug("Speech started with pre-speech buffer", {
|
|
30154
30510
|
preSpeechChunks: preSpeechChunks.length,
|
|
30155
30511
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
30156
30512
|
});
|
|
@@ -30163,7 +30519,7 @@ var SileroVADInference = class {
|
|
|
30163
30519
|
this.preSpeechBuffer = [];
|
|
30164
30520
|
}
|
|
30165
30521
|
this.wasSpeaking = isSpeech;
|
|
30166
|
-
|
|
30522
|
+
logger7.trace("VAD inference completed", {
|
|
30167
30523
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
30168
30524
|
isSpeech,
|
|
30169
30525
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -30219,7 +30575,7 @@ var SileroVADInference = class {
|
|
|
30219
30575
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
30220
30576
|
|
|
30221
30577
|
// src/inference/SileroVADWorker.ts
|
|
30222
|
-
var
|
|
30578
|
+
var logger8 = createLogger("SileroVADWorker");
|
|
30223
30579
|
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
30224
30580
|
var LOAD_TIMEOUT_MS = 1e4;
|
|
30225
30581
|
var INFERENCE_TIMEOUT_MS = 1e3;
|
|
@@ -30482,7 +30838,7 @@ var SileroVADWorker = class {
|
|
|
30482
30838
|
this.handleWorkerMessage(event.data);
|
|
30483
30839
|
};
|
|
30484
30840
|
worker.onerror = (error) => {
|
|
30485
|
-
|
|
30841
|
+
logger8.error("Worker error", { error: error.message });
|
|
30486
30842
|
for (const [, resolver] of this.pendingResolvers) {
|
|
30487
30843
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
30488
30844
|
}
|
|
@@ -30558,9 +30914,9 @@ var SileroVADWorker = class {
|
|
|
30558
30914
|
"model.sample_rate": this.config.sampleRate
|
|
30559
30915
|
});
|
|
30560
30916
|
try {
|
|
30561
|
-
|
|
30917
|
+
logger8.info("Creating VAD worker...");
|
|
30562
30918
|
this.worker = this.createWorker();
|
|
30563
|
-
|
|
30919
|
+
logger8.info("Loading model in worker...", {
|
|
30564
30920
|
modelUrl: this.config.modelUrl,
|
|
30565
30921
|
sampleRate: this.config.sampleRate
|
|
30566
30922
|
});
|
|
@@ -30576,7 +30932,7 @@ var SileroVADWorker = class {
|
|
|
30576
30932
|
);
|
|
30577
30933
|
this._isLoaded = true;
|
|
30578
30934
|
const loadTimeMs = performance.now() - startTime;
|
|
30579
|
-
|
|
30935
|
+
logger8.info("VAD worker loaded successfully", {
|
|
30580
30936
|
backend: "wasm",
|
|
30581
30937
|
loadTimeMs: Math.round(loadTimeMs),
|
|
30582
30938
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -30683,7 +31039,7 @@ var SileroVADWorker = class {
|
|
|
30683
31039
|
if (isSpeech && !this.wasSpeaking) {
|
|
30684
31040
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
30685
31041
|
this.preSpeechBuffer = [];
|
|
30686
|
-
|
|
31042
|
+
logger8.debug("Speech started with pre-speech buffer", {
|
|
30687
31043
|
preSpeechChunks: preSpeechChunks.length,
|
|
30688
31044
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
30689
31045
|
});
|
|
@@ -30696,7 +31052,7 @@ var SileroVADWorker = class {
|
|
|
30696
31052
|
this.preSpeechBuffer = [];
|
|
30697
31053
|
}
|
|
30698
31054
|
this.wasSpeaking = isSpeech;
|
|
30699
|
-
|
|
31055
|
+
logger8.trace("VAD worker inference completed", {
|
|
30700
31056
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
30701
31057
|
isSpeech,
|
|
30702
31058
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -30764,18 +31120,18 @@ var SileroVADWorker = class {
|
|
|
30764
31120
|
};
|
|
30765
31121
|
|
|
30766
31122
|
// src/inference/createSileroVAD.ts
|
|
30767
|
-
var
|
|
31123
|
+
var logger9 = createLogger("createSileroVAD");
|
|
30768
31124
|
function supportsVADWorker() {
|
|
30769
31125
|
if (typeof Worker === "undefined") {
|
|
30770
|
-
|
|
31126
|
+
logger9.debug("Worker not supported: Worker constructor undefined");
|
|
30771
31127
|
return false;
|
|
30772
31128
|
}
|
|
30773
31129
|
if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
|
|
30774
|
-
|
|
31130
|
+
logger9.debug("Worker not supported: URL.createObjectURL unavailable");
|
|
30775
31131
|
return false;
|
|
30776
31132
|
}
|
|
30777
31133
|
if (typeof Blob === "undefined") {
|
|
30778
|
-
|
|
31134
|
+
logger9.debug("Worker not supported: Blob constructor unavailable");
|
|
30779
31135
|
return false;
|
|
30780
31136
|
}
|
|
30781
31137
|
return true;
|
|
@@ -30785,19 +31141,19 @@ function createSileroVAD(config) {
|
|
|
30785
31141
|
let useWorker;
|
|
30786
31142
|
if (config.useWorker !== void 0) {
|
|
30787
31143
|
useWorker = config.useWorker;
|
|
30788
|
-
|
|
31144
|
+
logger9.debug("Worker preference explicitly set", { useWorker });
|
|
30789
31145
|
} else {
|
|
30790
31146
|
const workerSupported = supportsVADWorker();
|
|
30791
31147
|
const onMobile = isMobile();
|
|
30792
31148
|
useWorker = workerSupported && !onMobile;
|
|
30793
|
-
|
|
31149
|
+
logger9.debug("Auto-detected Worker preference", {
|
|
30794
31150
|
useWorker,
|
|
30795
31151
|
workerSupported,
|
|
30796
31152
|
onMobile
|
|
30797
31153
|
});
|
|
30798
31154
|
}
|
|
30799
31155
|
if (useWorker) {
|
|
30800
|
-
|
|
31156
|
+
logger9.info("Creating SileroVADWorker (off-main-thread)");
|
|
30801
31157
|
const worker = new SileroVADWorker({
|
|
30802
31158
|
modelUrl: config.modelUrl,
|
|
30803
31159
|
sampleRate: config.sampleRate,
|
|
@@ -30809,7 +31165,7 @@ function createSileroVAD(config) {
|
|
|
30809
31165
|
}
|
|
30810
31166
|
return worker;
|
|
30811
31167
|
}
|
|
30812
|
-
|
|
31168
|
+
logger9.info("Creating SileroVADInference (main thread)");
|
|
30813
31169
|
return new SileroVADInference(config);
|
|
30814
31170
|
}
|
|
30815
31171
|
var VADWorkerWithFallback = class {
|
|
@@ -30835,7 +31191,7 @@ var VADWorkerWithFallback = class {
|
|
|
30835
31191
|
try {
|
|
30836
31192
|
return await this.implementation.load();
|
|
30837
31193
|
} catch (error) {
|
|
30838
|
-
|
|
31194
|
+
logger9.warn("Worker load failed, falling back to main thread", {
|
|
30839
31195
|
error: error instanceof Error ? error.message : String(error)
|
|
30840
31196
|
});
|
|
30841
31197
|
try {
|
|
@@ -30844,7 +31200,7 @@ var VADWorkerWithFallback = class {
|
|
|
30844
31200
|
}
|
|
30845
31201
|
this.implementation = new SileroVADInference(this.config);
|
|
30846
31202
|
this.hasFallenBack = true;
|
|
30847
|
-
|
|
31203
|
+
logger9.info("Fallback to SileroVADInference successful");
|
|
30848
31204
|
return await this.implementation.load();
|
|
30849
31205
|
}
|
|
30850
31206
|
}
|
|
@@ -30866,7 +31222,7 @@ var VADWorkerWithFallback = class {
|
|
|
30866
31222
|
};
|
|
30867
31223
|
|
|
30868
31224
|
// src/inference/Emotion2VecInference.ts
|
|
30869
|
-
var
|
|
31225
|
+
var logger10 = createLogger("Emotion2Vec");
|
|
30870
31226
|
var EMOTION2VEC_LABELS = ["neutral", "happy", "angry", "sad"];
|
|
30871
31227
|
var Emotion2VecInference = class {
|
|
30872
31228
|
constructor(config) {
|
|
@@ -30908,28 +31264,28 @@ var Emotion2VecInference = class {
|
|
|
30908
31264
|
"model.backend_requested": this.config.backend
|
|
30909
31265
|
});
|
|
30910
31266
|
try {
|
|
30911
|
-
|
|
31267
|
+
logger10.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
30912
31268
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
30913
31269
|
this.ort = ort;
|
|
30914
31270
|
this._backend = backend;
|
|
30915
|
-
|
|
30916
|
-
|
|
31271
|
+
logger10.info("ONNX Runtime loaded", { backend: this._backend });
|
|
31272
|
+
logger10.info("Checking model cache...");
|
|
30917
31273
|
const cache = getModelCache();
|
|
30918
31274
|
const modelUrl = this.config.modelUrl;
|
|
30919
31275
|
const isCached = await cache.has(modelUrl);
|
|
30920
|
-
|
|
31276
|
+
logger10.info("Cache check complete", { modelUrl, isCached });
|
|
30921
31277
|
let modelBuffer;
|
|
30922
31278
|
if (isCached) {
|
|
30923
|
-
|
|
31279
|
+
logger10.info("Loading model from cache...", { modelUrl });
|
|
30924
31280
|
modelBuffer = await cache.get(modelUrl);
|
|
30925
|
-
|
|
31281
|
+
logger10.info("Model loaded from cache", { size: formatBytes(modelBuffer.byteLength) });
|
|
30926
31282
|
} else {
|
|
30927
|
-
|
|
31283
|
+
logger10.info("Fetching model (not cached)...", { modelUrl });
|
|
30928
31284
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
30929
|
-
|
|
31285
|
+
logger10.info("Model fetched and cached", { size: formatBytes(modelBuffer.byteLength) });
|
|
30930
31286
|
}
|
|
30931
|
-
|
|
30932
|
-
|
|
31287
|
+
logger10.info("Creating ONNX session (this may take a while for large models)...");
|
|
31288
|
+
logger10.debug("Creating ONNX session", {
|
|
30933
31289
|
size: formatBytes(modelBuffer.byteLength),
|
|
30934
31290
|
backend: this._backend
|
|
30935
31291
|
});
|
|
@@ -30937,7 +31293,7 @@ var Emotion2VecInference = class {
|
|
|
30937
31293
|
const modelData = new Uint8Array(modelBuffer);
|
|
30938
31294
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
30939
31295
|
const loadTimeMs = performance.now() - startTime;
|
|
30940
|
-
|
|
31296
|
+
logger10.info("Model loaded successfully", {
|
|
30941
31297
|
backend: this._backend,
|
|
30942
31298
|
loadTimeMs: Math.round(loadTimeMs),
|
|
30943
31299
|
sampleRate: this.config.sampleRate,
|
|
@@ -31049,7 +31405,7 @@ var Emotion2VecInference = class {
|
|
|
31049
31405
|
});
|
|
31050
31406
|
}
|
|
31051
31407
|
const inferenceTimeMs = performance.now() - startTime;
|
|
31052
|
-
|
|
31408
|
+
logger10.debug("Emotion inference completed", {
|
|
31053
31409
|
numFrames,
|
|
31054
31410
|
dominant: dominant.emotion,
|
|
31055
31411
|
confidence: Math.round(dominant.confidence * 100),
|
|
@@ -31126,7 +31482,7 @@ var Emotion2VecInference = class {
|
|
|
31126
31482
|
Emotion2VecInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
31127
31483
|
|
|
31128
31484
|
// src/inference/SafariSpeechRecognition.ts
|
|
31129
|
-
var
|
|
31485
|
+
var logger11 = createLogger("SafariSpeech");
|
|
31130
31486
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
31131
31487
|
constructor(config = {}) {
|
|
31132
31488
|
this.recognition = null;
|
|
@@ -31145,7 +31501,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31145
31501
|
interimResults: config.interimResults ?? true,
|
|
31146
31502
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
31147
31503
|
};
|
|
31148
|
-
|
|
31504
|
+
logger11.debug("SafariSpeechRecognition created", {
|
|
31149
31505
|
language: this.config.language,
|
|
31150
31506
|
continuous: this.config.continuous
|
|
31151
31507
|
});
|
|
@@ -31206,7 +31562,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31206
31562
|
*/
|
|
31207
31563
|
async start() {
|
|
31208
31564
|
if (this.isListening) {
|
|
31209
|
-
|
|
31565
|
+
logger11.warn("Already listening");
|
|
31210
31566
|
return;
|
|
31211
31567
|
}
|
|
31212
31568
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -31236,7 +31592,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31236
31592
|
this.isListening = true;
|
|
31237
31593
|
this.startTime = performance.now();
|
|
31238
31594
|
this.accumulatedText = "";
|
|
31239
|
-
|
|
31595
|
+
logger11.info("Speech recognition started", {
|
|
31240
31596
|
language: this.config.language
|
|
31241
31597
|
});
|
|
31242
31598
|
span?.end();
|
|
@@ -31251,7 +31607,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31251
31607
|
*/
|
|
31252
31608
|
async stop() {
|
|
31253
31609
|
if (!this.isListening || !this.recognition) {
|
|
31254
|
-
|
|
31610
|
+
logger11.warn("Not currently listening");
|
|
31255
31611
|
return {
|
|
31256
31612
|
text: this.accumulatedText,
|
|
31257
31613
|
language: this.config.language,
|
|
@@ -31280,7 +31636,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31280
31636
|
if (this.recognition && this.isListening) {
|
|
31281
31637
|
this.recognition.abort();
|
|
31282
31638
|
this.isListening = false;
|
|
31283
|
-
|
|
31639
|
+
logger11.info("Speech recognition aborted");
|
|
31284
31640
|
}
|
|
31285
31641
|
}
|
|
31286
31642
|
/**
|
|
@@ -31311,7 +31667,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31311
31667
|
this.isListening = false;
|
|
31312
31668
|
this.resultCallbacks = [];
|
|
31313
31669
|
this.errorCallbacks = [];
|
|
31314
|
-
|
|
31670
|
+
logger11.debug("SafariSpeechRecognition disposed");
|
|
31315
31671
|
}
|
|
31316
31672
|
/**
|
|
31317
31673
|
* Set up event handlers for the recognition instance
|
|
@@ -31339,7 +31695,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31339
31695
|
confidence: alternative.confidence
|
|
31340
31696
|
};
|
|
31341
31697
|
this.emitResult(speechResult);
|
|
31342
|
-
|
|
31698
|
+
logger11.trace("Speech result", {
|
|
31343
31699
|
text: text.substring(0, 50),
|
|
31344
31700
|
isFinal,
|
|
31345
31701
|
confidence: alternative.confidence
|
|
@@ -31349,12 +31705,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31349
31705
|
span?.end();
|
|
31350
31706
|
} catch (error) {
|
|
31351
31707
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
31352
|
-
|
|
31708
|
+
logger11.error("Error processing speech result", { error });
|
|
31353
31709
|
}
|
|
31354
31710
|
};
|
|
31355
31711
|
this.recognition.onerror = (event) => {
|
|
31356
31712
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
31357
|
-
|
|
31713
|
+
logger11.error("Speech recognition error", { error: event.error, message: event.message });
|
|
31358
31714
|
this.emitError(error);
|
|
31359
31715
|
if (this.stopRejecter) {
|
|
31360
31716
|
this.stopRejecter(error);
|
|
@@ -31364,7 +31720,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31364
31720
|
};
|
|
31365
31721
|
this.recognition.onend = () => {
|
|
31366
31722
|
this.isListening = false;
|
|
31367
|
-
|
|
31723
|
+
logger11.info("Speech recognition ended", {
|
|
31368
31724
|
totalText: this.accumulatedText.length,
|
|
31369
31725
|
durationMs: performance.now() - this.startTime
|
|
31370
31726
|
});
|
|
@@ -31381,13 +31737,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31381
31737
|
}
|
|
31382
31738
|
};
|
|
31383
31739
|
this.recognition.onstart = () => {
|
|
31384
|
-
|
|
31740
|
+
logger11.debug("Speech recognition started by browser");
|
|
31385
31741
|
};
|
|
31386
31742
|
this.recognition.onspeechstart = () => {
|
|
31387
|
-
|
|
31743
|
+
logger11.debug("Speech detected");
|
|
31388
31744
|
};
|
|
31389
31745
|
this.recognition.onspeechend = () => {
|
|
31390
|
-
|
|
31746
|
+
logger11.debug("Speech ended");
|
|
31391
31747
|
};
|
|
31392
31748
|
}
|
|
31393
31749
|
/**
|
|
@@ -31398,7 +31754,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31398
31754
|
try {
|
|
31399
31755
|
callback(result);
|
|
31400
31756
|
} catch (error) {
|
|
31401
|
-
|
|
31757
|
+
logger11.error("Error in result callback", { error });
|
|
31402
31758
|
}
|
|
31403
31759
|
}
|
|
31404
31760
|
}
|
|
@@ -31410,7 +31766,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
31410
31766
|
try {
|
|
31411
31767
|
callback(error);
|
|
31412
31768
|
} catch (callbackError) {
|
|
31413
|
-
|
|
31769
|
+
logger11.error("Error in error callback", { error: callbackError });
|
|
31414
31770
|
}
|
|
31415
31771
|
}
|
|
31416
31772
|
}
|
|
@@ -32833,12 +33189,12 @@ async function isHuggingFaceCDNReachable(testUrl = HF_CDN_TEST_URL) {
|
|
|
32833
33189
|
}
|
|
32834
33190
|
|
|
32835
33191
|
// src/utils/transformersCacheClear.ts
|
|
32836
|
-
var
|
|
33192
|
+
var logger12 = createLogger("TransformersCache");
|
|
32837
33193
|
async function clearTransformersCache(options) {
|
|
32838
33194
|
const verbose = options?.verbose ?? true;
|
|
32839
33195
|
const additionalPatterns = options?.additionalPatterns ?? [];
|
|
32840
33196
|
if (!("caches" in window)) {
|
|
32841
|
-
|
|
33197
|
+
logger12.warn("Cache API not available in this environment");
|
|
32842
33198
|
return [];
|
|
32843
33199
|
}
|
|
32844
33200
|
try {
|
|
@@ -32856,18 +33212,18 @@ async function clearTransformersCache(options) {
|
|
|
32856
33212
|
);
|
|
32857
33213
|
if (shouldDelete) {
|
|
32858
33214
|
if (verbose) {
|
|
32859
|
-
|
|
33215
|
+
logger12.info("Deleting cache", { cacheName });
|
|
32860
33216
|
}
|
|
32861
33217
|
const deleted = await caches.delete(cacheName);
|
|
32862
33218
|
if (deleted) {
|
|
32863
33219
|
deletedCaches.push(cacheName);
|
|
32864
33220
|
} else if (verbose) {
|
|
32865
|
-
|
|
33221
|
+
logger12.warn("Failed to delete cache", { cacheName });
|
|
32866
33222
|
}
|
|
32867
33223
|
}
|
|
32868
33224
|
}
|
|
32869
33225
|
if (verbose) {
|
|
32870
|
-
|
|
33226
|
+
logger12.info("Cache clearing complete", {
|
|
32871
33227
|
totalCaches: cacheNames.length,
|
|
32872
33228
|
deletedCount: deletedCaches.length,
|
|
32873
33229
|
deletedCaches
|
|
@@ -32875,35 +33231,35 @@ async function clearTransformersCache(options) {
|
|
|
32875
33231
|
}
|
|
32876
33232
|
return deletedCaches;
|
|
32877
33233
|
} catch (error) {
|
|
32878
|
-
|
|
33234
|
+
logger12.error("Error clearing caches", { error });
|
|
32879
33235
|
throw error;
|
|
32880
33236
|
}
|
|
32881
33237
|
}
|
|
32882
33238
|
async function clearSpecificCache(cacheName) {
|
|
32883
33239
|
if (!("caches" in window)) {
|
|
32884
|
-
|
|
33240
|
+
logger12.warn("Cache API not available in this environment");
|
|
32885
33241
|
return false;
|
|
32886
33242
|
}
|
|
32887
33243
|
try {
|
|
32888
33244
|
const deleted = await caches.delete(cacheName);
|
|
32889
|
-
|
|
33245
|
+
logger12.info("Cache deletion attempt", { cacheName, deleted });
|
|
32890
33246
|
return deleted;
|
|
32891
33247
|
} catch (error) {
|
|
32892
|
-
|
|
33248
|
+
logger12.error("Error deleting cache", { cacheName, error });
|
|
32893
33249
|
return false;
|
|
32894
33250
|
}
|
|
32895
33251
|
}
|
|
32896
33252
|
async function listCaches() {
|
|
32897
33253
|
if (!("caches" in window)) {
|
|
32898
|
-
|
|
33254
|
+
logger12.warn("Cache API not available in this environment");
|
|
32899
33255
|
return [];
|
|
32900
33256
|
}
|
|
32901
33257
|
try {
|
|
32902
33258
|
const cacheNames = await caches.keys();
|
|
32903
|
-
|
|
33259
|
+
logger12.debug("Available caches", { cacheNames });
|
|
32904
33260
|
return cacheNames;
|
|
32905
33261
|
} catch (error) {
|
|
32906
|
-
|
|
33262
|
+
logger12.error("Error listing caches", { error });
|
|
32907
33263
|
return [];
|
|
32908
33264
|
}
|
|
32909
33265
|
}
|
|
@@ -32945,7 +33301,7 @@ async function validateCachedResponse(cacheName, requestUrl) {
|
|
|
32945
33301
|
reason: valid ? "Valid response" : `Invalid: status=${response.status}, contentType=${contentType}, isHtml=${isHtml || looksLikeHtml}`
|
|
32946
33302
|
};
|
|
32947
33303
|
} catch (error) {
|
|
32948
|
-
|
|
33304
|
+
logger12.error("Error validating cached response", { cacheName, requestUrl, error });
|
|
32949
33305
|
return {
|
|
32950
33306
|
exists: false,
|
|
32951
33307
|
valid: false,
|
|
@@ -32982,7 +33338,7 @@ async function scanForInvalidCaches() {
|
|
|
32982
33338
|
}
|
|
32983
33339
|
}
|
|
32984
33340
|
}
|
|
32985
|
-
|
|
33341
|
+
logger12.info("Cache scan complete", {
|
|
32986
33342
|
totalCaches: cacheNames.length,
|
|
32987
33343
|
scannedEntries,
|
|
32988
33344
|
invalidCount: invalidEntries.length
|
|
@@ -32993,13 +33349,13 @@ async function scanForInvalidCaches() {
|
|
|
32993
33349
|
invalidEntries
|
|
32994
33350
|
};
|
|
32995
33351
|
} catch (error) {
|
|
32996
|
-
|
|
33352
|
+
logger12.error("Error scanning caches", { error });
|
|
32997
33353
|
throw error;
|
|
32998
33354
|
}
|
|
32999
33355
|
}
|
|
33000
33356
|
async function nukeBrowserCaches(preventRecreation = false) {
|
|
33001
33357
|
if (!("caches" in window)) {
|
|
33002
|
-
|
|
33358
|
+
logger12.warn("Cache API not available in this environment");
|
|
33003
33359
|
return 0;
|
|
33004
33360
|
}
|
|
33005
33361
|
try {
|
|
@@ -33011,17 +33367,17 @@ async function nukeBrowserCaches(preventRecreation = false) {
|
|
|
33011
33367
|
deletedCount++;
|
|
33012
33368
|
}
|
|
33013
33369
|
}
|
|
33014
|
-
|
|
33370
|
+
logger12.info("All browser caches cleared", {
|
|
33015
33371
|
totalDeleted: deletedCount
|
|
33016
33372
|
});
|
|
33017
33373
|
if (preventRecreation) {
|
|
33018
33374
|
const { env: env3 } = await Promise.resolve().then(() => (init_transformers_web(), transformers_web_exports));
|
|
33019
33375
|
env3.useBrowserCache = false;
|
|
33020
|
-
|
|
33376
|
+
logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
|
|
33021
33377
|
}
|
|
33022
33378
|
return deletedCount;
|
|
33023
33379
|
} catch (error) {
|
|
33024
|
-
|
|
33380
|
+
logger12.error("Error nuking caches", { error });
|
|
33025
33381
|
throw error;
|
|
33026
33382
|
}
|
|
33027
33383
|
}
|