xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +473 -31
- xinference/client/restful/async_restful_client.py +178 -8
- xinference/client/restful/restful_client.py +151 -3
- xinference/core/supervisor.py +99 -53
- xinference/core/worker.py +10 -0
- xinference/deploy/cmdline.py +15 -0
- xinference/model/audio/core.py +21 -6
- xinference/model/audio/indextts2.py +166 -0
- xinference/model/audio/model_spec.json +58 -21
- xinference/model/image/model_spec.json +159 -90
- xinference/model/image/stable_diffusion/core.py +13 -4
- xinference/model/llm/__init__.py +6 -2
- xinference/model/llm/llm_family.json +1299 -174
- xinference/model/llm/mlx/distributed_models/core.py +41 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
- xinference/model/llm/sglang/core.py +44 -11
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
- xinference/model/llm/transformers/chatglm.py +3 -0
- xinference/model/llm/transformers/core.py +129 -36
- xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
- xinference/model/llm/transformers/utils.py +23 -0
- xinference/model/llm/utils.py +48 -32
- xinference/model/llm/vllm/core.py +207 -72
- xinference/model/utils.py +74 -31
- xinference/thirdparty/audiotools/__init__.py +10 -0
- xinference/thirdparty/audiotools/core/__init__.py +4 -0
- xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
- xinference/thirdparty/audiotools/core/display.py +194 -0
- xinference/thirdparty/audiotools/core/dsp.py +390 -0
- xinference/thirdparty/audiotools/core/effects.py +647 -0
- xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
- xinference/thirdparty/audiotools/core/loudness.py +320 -0
- xinference/thirdparty/audiotools/core/playback.py +252 -0
- xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
- xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
- xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
- xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
- xinference/thirdparty/audiotools/core/util.py +671 -0
- xinference/thirdparty/audiotools/core/whisper.py +97 -0
- xinference/thirdparty/audiotools/data/__init__.py +3 -0
- xinference/thirdparty/audiotools/data/datasets.py +517 -0
- xinference/thirdparty/audiotools/data/preprocess.py +81 -0
- xinference/thirdparty/audiotools/data/transforms.py +1592 -0
- xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
- xinference/thirdparty/audiotools/metrics/distance.py +131 -0
- xinference/thirdparty/audiotools/metrics/quality.py +159 -0
- xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
- xinference/thirdparty/audiotools/ml/__init__.py +5 -0
- xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
- xinference/thirdparty/audiotools/ml/decorators.py +440 -0
- xinference/thirdparty/audiotools/ml/experiment.py +90 -0
- xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
- xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
- xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
- xinference/thirdparty/audiotools/post.py +140 -0
- xinference/thirdparty/audiotools/preference.py +600 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
- xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
- xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
- xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
- xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
- xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
- xinference/thirdparty/indextts/__init__.py +0 -0
- xinference/thirdparty/indextts/cli.py +65 -0
- xinference/thirdparty/indextts/gpt/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
- xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
- xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
- xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
- xinference/thirdparty/indextts/gpt/model.py +713 -0
- xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
- xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
- xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
- xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
- xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
- xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
- xinference/thirdparty/indextts/infer.py +690 -0
- xinference/thirdparty/indextts/infer_v2.py +739 -0
- xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
- xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
- xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
- xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
- xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
- xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
- xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
- xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
- xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
- xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
- xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
- xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
- xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
- xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
- xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
- xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
- xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
- xinference/thirdparty/indextts/utils/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/arch_util.py +120 -0
- xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
- xinference/thirdparty/indextts/utils/common.py +121 -0
- xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
- xinference/thirdparty/indextts/utils/front.py +536 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
- xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
- xinference/thirdparty/indextts/utils/text_utils.py +41 -0
- xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
- xinference/thirdparty/indextts/utils/utils.py +93 -0
- xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
- xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
- xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
- xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
- xinference/thirdparty/melo/text/chinese_mix.py +2 -2
- xinference/types.py +9 -0
- xinference/ui/gradio/media_interface.py +66 -8
- xinference/ui/web/ui/build/asset-manifest.json +6 -6
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
- xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
- xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
- xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
- xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
- xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
- xinference/ui/web/ui/package-lock.json +0 -34
- xinference/ui/web/ui/package.json +0 -1
- xinference/ui/web/ui/src/locales/en.json +9 -3
- xinference/ui/web/ui/src/locales/ja.json +9 -3
- xinference/ui/web/ui/src/locales/ko.json +9 -3
- xinference/ui/web/ui/src/locales/zh.json +9 -3
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
- xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
- xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
- xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
- xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
- xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
- xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
- xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
- xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
- xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
- xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
- xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
- xinference/ui/web/ui/node_modules/select/bower.json +0 -13
- xinference/ui/web/ui/node_modules/select/package.json +0 -29
- xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/multimodal/minicpmv45.py ADDED
@@ -0,0 +1,340 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import torch
+from PIL import Image
+
+from .....core.model import register_batching_multimodal_models
+from .....model.utils import select_device
+from .....types import PytorchModelConfig
+from ....scheduler.request import InferenceRequest
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from ...utils import _decode_image, parse_messages
+from ..core import register_non_default_model
+from .core import PytorchMultiModalModel
+
+logger = logging.getLogger(__name__)
+
+
+@register_batching_multimodal_models("MiniCPM-V-4.5")
+@register_transformer
+@register_non_default_model("MiniCPM-V-4.5")
+class MiniCPMV45Model(PytorchMultiModalModel):
+    @classmethod
+    def match_json(
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        family = model_family.model_family or model_family.model_name
+        if "MiniCPM-V-4.5".lower() in family.lower():
+            return True
+        return False
+
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        # Configure pixel parameters for MiniCPM-V-4.5
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
+    def decide_device(self):
+        device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(device)
+        self._device = (
+            "auto"
+            if self._device == "cuda" and self.quantization is None
+            else self._device
+        )
+
+    def load_processor(self):
+        from transformers import AutoProcessor, AutoTokenizer
+
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+
+    def load_multimodal_model(self):
+        from transformers import AutoModel
+        from transformers.generation import GenerationConfig
+
+        if "int4" in self.model_path:
+            model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
+        else:
+            kwargs = self.apply_bnb_quantization()
+            model = AutoModel.from_pretrained(
+                self.model_path,
+                trust_remote_code=True,
+                torch_dtype=torch.float16,
+                device_map=self._device,
+                **kwargs,
+            )
+        self._model = model.eval()
+        # Specify hyperparameters for generation
+        self._model.generation_config = GenerationConfig.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+        )
+        self._device = self._model.device
+
+    def _message_content_to_chat(self, content):
+        MAX_NUM_FRAMES = 64
+
+        def encode_video(video_path):
+            from decord import VideoReader, cpu
+
+            def uniform_sample(l, n):
+                gap = len(l) / n
+                idxs = [int(i * gap + gap / 2) for i in range(n)]
+                return [l[i] for i in idxs]
+
+            vr = VideoReader(video_path, ctx=cpu(0))
+            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+            frame_idx = [i for i in range(0, len(vr), sample_fps)]
+            if len(frame_idx) > MAX_NUM_FRAMES:
+                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+            frames = vr.get_batch(frame_idx).asnumpy()
+            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+            logger.info(
+                f"Num frames: {len(frames)} when decoding video for {self.model_uid}"
+            )
+            return frames
+
+        def _load_video(_url):
+            frames = None
+            if _url.startswith("data:"):
+                raise RuntimeError("Only video url format is supported")
+            else:
+                frames = encode_video(_url)
+            return frames
+
+        if not isinstance(content, str):
+            texts = []
+            image_urls = []
+            video_urls = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "text":
+                    texts.append(c["text"])
+                elif c_type == "image_url":
+                    image_urls.append(c["image_url"]["url"])
+                elif c_type == "video_url":
+                    video_urls.append(c["video_url"]["url"])
+            image_futures = []
+            with ThreadPoolExecutor() as executor:
+                for image_url in image_urls:
+                    fut = executor.submit(_decode_image, image_url)
+                    image_futures.append(fut)
+            images = [fut.result() for fut in image_futures]
+            frames = []
+            if len(video_urls) > 1:
+                raise RuntimeError("Only one video per message is supported")
+            for v in video_urls:
+                frames = _load_video(v)
+            text = " ".join(texts)
+            return text, images, frames
+        return content, [], []
+
+    def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
+        video_existed = False
+        prompt, _, chat_history = parse_messages(messages)
+
+        content, images_chat, video_frames = self._message_content_to_chat(prompt)
+        if len(video_frames) > 0:
+            video_existed = True
+            images_chat = video_frames
+
+        msgs = []
+        query_to_response: List[Dict] = []
+        for h in chat_history or []:
+            images_history = []
+            role = h["role"]
+            content_h, images_tmp, video_frames_h = self._message_content_to_chat(
+                h["content"]
+            )
+            if images_tmp != []:
+                images_history = images_tmp
+            if len(video_frames_h) > 0:
+                video_existed = True
+                images_history = video_frames_h
+            if len(query_to_response) == 0 and role == "user":
+                query_to_response.append(
+                    {"role": "user", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 1 and role == "assistant":
+                query_to_response.append(
+                    {"role": "assistant", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 2:
+                msgs.extend(query_to_response)
+                query_to_response = []
+        msgs.append({"role": "user", "content": images_chat + [content]})
+        return msgs, video_existed
+
+    def build_inputs_from_messages(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ):
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        # Set decode params for video
+        params = {}
+        if video_existed:
+            params = {"use_image_id": False, "max_slice_nums": 1}
+        return dict(msgs=msgs, image=None, **params)
+
+    def build_generate_kwargs(
+        self,
+        generate_config: Dict,
+    ) -> Dict[str, Any]:
+        return dict(**generate_config)
+
+    def build_streaming_iter(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ) -> Tuple[Iterator, int]:
+        inputs = self.build_inputs_from_messages(messages, generate_config)
+        config = self.build_generate_kwargs(generate_config)
+        chat_iter = self._model.chat(
+            **inputs, **config, tokenizer=self._tokenizer, sampling=True
+        )
+
+        return chat_iter, -1
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to MiniCPM-V-4.5 documentation for generation parameters
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.7
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        top_k = raw_config.get("top_k", None)
+        if top_k is None:
+            raw_config["top_k"] = 100
+        repetition_penalty = raw_config.get("repetition_penalty", None)
+        if repetition_penalty is None:
+            raw_config["repetition_penalty"] = 1.05
+        return raw_config
+
+    def _handle_input_ids_and_images(self, msgs: List[Dict]) -> Dict:
+        """
+        Handle input IDs and images for MiniCPM-V-4.5
+        Based on MiniCPM-V-2.6 implementation with adaptations for 4.5
+        """
+        from copy import deepcopy
+
+        copy_msgs = deepcopy(msgs)
+
+        images = []
+        for i, msg in enumerate(copy_msgs):
+            role = msg["role"]
+            content = msg["content"]
+            assert role in ["user", "assistant"]
+            if i == 0:
+                assert role == "user", "The role of first msg should be user"
+            if isinstance(content, str):
+                content = [content]
+            cur_msgs = []
+            for c in content:
+                if isinstance(c, Image.Image):
+                    images.append(c)
+                    cur_msgs.append("(<image>./</image>)")
+                elif isinstance(c, str):
+                    cur_msgs.append(c)
+            msg["content"] = "\n".join(cur_msgs)
+
+        return {
+            "prompt": self._processor.tokenizer.apply_chat_template(
+                copy_msgs, tokenize=False, add_generation_prompt=True
+            ),
+            "input_image": images,
+        }
+
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        if video_existed:
+            raise RuntimeError(
+                f"Continuous batching does not support video inputs for this model: {self.model_uid}"
+            )
+        return self._handle_input_ids_and_images(msgs)
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        prompts_lists = [x["prompt"] for x in prompts]
+        input_images_lists = [x["input_image"] for x in prompts]
+        inputs = self._processor(
+            prompts_lists,
+            input_images_lists,
+            max_slice_nums=None,
+            use_image_id=None,
+            return_tensors="pt",
+            max_length=8192,
+        ).to(self._model.device)
+        inputs.pop("image_sizes")
+
+        masked_input_ids = inputs["input_ids"] * inputs["attention_mask"]
+        for i in range(masked_input_ids.shape[0]):
+            non_zero_values = masked_input_ids[i][masked_input_ids[i] != 0].tolist()
+            req_list[i].prompt_tokens = non_zero_values
+            req_list[i].extra_kwargs["attention_mask_seq_len"] = len(non_zero_values)
+            req_list[i].padding_len = masked_input_ids.shape[1] - len(non_zero_values)
+
+        model_inputs = {
+            "input_ids": inputs["input_ids"],
+            "image_bound": inputs["image_bound"],
+            "pixel_values": inputs["pixel_values"],
+            "tgt_sizes": inputs["tgt_sizes"],
+        }
+        model_inputs["inputs_embeds"], _ = self._model.get_vllm_embedding(model_inputs)
+
+        return {
+            "inputs_embeds": model_inputs["inputs_embeds"],
+            "attention_mask": inputs["attention_mask"],
+        }
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        """
+        This method is rewritten
+        because the specific inference process is performed by `self._model.llm`,
+        not `self._model` itself
+        """
+        from ..utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        batch_inference_one_step(
+            self, req_list, self.model_uid, self._model.llm, self._tokenizer
+        )
+        self.handle_batch_inference_results(req_list)
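For orientation (not part of the diff): a minimal sketch of driving the newly registered MiniCPM-V-4.5 model through the xinference client. The endpoint URL, engine choice, and generation parameters below are assumptions for illustration, not values taken from this release.

# Illustrative sketch only; assumes a running xinference supervisor at the given URL.
from xinference.client import Client

client = Client("http://localhost:9997")  # assumed endpoint
model_uid = client.launch_model(
    model_name="MiniCPM-V-4.5",   # name registered by the new minicpmv45.py module
    model_type="LLM",
    model_engine="transformers",  # assumed; the new class is a Transformers backend
    model_format="pytorch",
)
model = client.get_model(model_uid)

# The new _message_content_to_chat helper accepts "text", "image_url" and
# "video_url" content parts, so an OpenAI-style multimodal message works here.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this picture."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }
]
print(model.chat(messages=messages, generate_config={"max_tokens": 256}))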
xinference/model/llm/transformers/multimodal/qwen2_vl.py CHANGED
@@ -27,11 +27,19 @@ logger = logging.getLogger(__name__)
 
 
 @register_batching_multimodal_models(
-    "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 @register_transformer
 @register_non_default_model(
-    "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 class Qwen2VLChatModel(PytorchMultiModalModel):
     def _sanitize_model_config(
@@ -47,7 +55,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb", "fp8"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
@@ -56,6 +64,8 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
             return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
+        if "qwen3-vl" in llm_family.lower():
+            return True
         return False
 
     def decide_device(self):
@@ -85,13 +95,19 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
         except ImportError:
             Qwen2_5_VLForConditionalGeneration = None
 
+        try:
+            from transformers import AutoModelForImageTextToText
+        except ImportError:
+            AutoModelForImageTextToText = None
+
         kwargs = self.apply_bnb_quantization()
         llm_family = self.model_family.model_family or self.model_family.model_name
-        model_cls = (
-            Qwen2_5_VLForConditionalGeneration
-            if "qwen2.5" in llm_family
-            else Qwen2VLForConditionalGeneration
-        )
+        if "qwen2.5" in llm_family:
+            model_cls = Qwen2_5_VLForConditionalGeneration
+        elif "qwen3" in llm_family:
+            model_cls = AutoModelForImageTextToText
+        else:
+            model_cls = Qwen2VLForConditionalGeneration
         if model_cls is None:
             raise ImportError("`transformers` version is too old, please upgrade it")
         device = "auto" if self._device == "cuda" else self._device
@@ -118,6 +134,16 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
                 torch_dtype="float16",
                 **kwargs,
             ).eval()
+        elif device == "mps":
+            # MacOS special, see https://github.com/QwenLM/Qwen2.5-VL/issues/761
+            self._model = model_cls.from_pretrained(
+                self.model_path,
+                torch_dtype="bfloat16",
+                device_map=device,
+                attn_implementation="eager",
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            ).eval()
         else:
             self._model = model_cls.from_pretrained(
                 self.model_path,
xinference/model/llm/transformers/utils.py CHANGED
@@ -281,11 +281,34 @@ def _batch_inference_one_step_internal(
             r.append_new_token(token)
 
     if decode_reqs:
+        # Ensure all decode requests have the same kv_cache reference
+        # This prevents batch size mismatches during merging
         decode_kv = decode_reqs[0].kv_cache
+
+        # Verify that all decode requests share the same kv_cache
+        for req in decode_reqs[1:]:
+            if req.kv_cache is not decode_kv:
+                logger.warning(
+                    "Inconsistent kv_cache references detected in decode requests. "
+                    "This may indicate a batching synchronization issue."
+                )
+                # Use the first decode_kv as the reference to maintain consistency
+                req.kv_cache = decode_kv
+
         # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
         merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
+        # Update sequence length information after KV cache merge
+        _, merged_seq_len = get_batch_size_and_seq_len_from_kv_cache(
+            merged_kv_cache, xinf_model_obj
+        )
         for r in valid_req_list:
             r.kv_cache = merged_kv_cache
+            # Update attention mask sequence length to match merged KV cache
+            if "attention_mask_seq_len" in r.extra_kwargs:
+                # Ensure the attention mask length doesn't exceed the merged sequence length
+                r.extra_kwargs["attention_mask_seq_len"] = min(
+                    r.extra_kwargs["attention_mask_seq_len"], merged_seq_len - 1
+                )
         empty_cache()
     else:
         for r in valid_req_list:
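The clamp added above keeps each request's recorded attention-mask length consistent with the KV cache produced by the merge. As a toy illustration of that invariant (not xinference code):

# After merging prefill and decode caches, a request must not claim more attended
# positions than the merged cache actually holds; hypothetical values below.
attention_mask_seq_len = 310   # value carried over from before the merge
merged_seq_len = 300           # length of the merged KV cache
attention_mask_seq_len = min(attention_mask_seq_len, merged_seq_len - 1)
assert attention_mask_seq_len == 299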
xinference/model/llm/utils.py CHANGED
@@ -71,6 +71,12 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-Thinking",
     "Qwen3-Instruct",
     "Qwen3-Coder",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
+    "Qwen3-Next-Instruct",
+    "Qwen3-Next-Thinking",
+    "Qwen3-Omni-Instruct",
+    "Qwen3-Omni-Thinking",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -96,7 +102,6 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
-
     def __init__(self):
         self.model_family = None
         self.model_uid = None
@@ -139,7 +144,7 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
-        if "vision" not in self.model_family.model_ability:  # type: ignore
+        if "vision" not in self.model_family.model_ability and "audio" not in self.model_family.model_ability:  # type: ignore
             messages = self.convert_messages_with_content_list_to_str_conversion(
                 messages
             )
@@ -182,8 +187,7 @@ class ChatModelMixin:
                 return kwargs
             else:
                 raise TypeError(
-                    f"`chat_template_kwargs` but be a JSON parsable str "
-                    f"or dict, got: {kwargs}"
+                    f"`chat_template_kwargs` but be a JSON parsable str or dict, got: {kwargs}"
                 )
         elif reasoning_parser and not reasoning_parser.enable_thinking:
             # hybrid model like qwen3,
@@ -347,9 +351,7 @@ class ChatModelMixin:
         assert choices is not None
         usage = (
             chunk["usage"]
-            if choices[0]["finish_reason"] is not None
-            and reasoning_parser
-            and reasoning_parser.check_content_parser()
+            if choices and choices[0]["finish_reason"] is not None or not choices
             else None
         )
         chat_chunk = {
@@ -798,7 +800,11 @@ class ChatModelMixin:
         chunk_id=None,
         previous_texts: List[str] = [""],
     ):
+        if not c.get("choices"):
+            return c
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
+        tool_result = None
+        finish_reason = None
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls_streaming(
                 [],
@@ -847,15 +853,11 @@ class ChatModelMixin:
                 "tool_calls": tool_calls,
             }
 
-        try:
+        # For tool completion chunks, use None for usage, actual values for stop
+        if finish_reason == "tool_calls":
+            usage = None
+        else:
             usage = c.get("usage")
-            assert "prompt_tokens" in usage
-        except Exception:
-            usage = {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            }
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -880,25 +882,32 @@ class ChatModelMixin:
     ):
         if not self.tool_parser:
             return self._get_final_chat_completion_chunk(c)
-
-        c = self.reasoning_parser.prepare_reasoning_content(c)
+
         _id = str(uuid.uuid4())
         reasoning_content = None
+        content = ""
+
+        # First, process reasoning content if reasoning parser exists
+        text = c["choices"][0]["text"]
         if self.reasoning_parser and self.reasoning_parser.check_content_parser():
-
-            reasoning_content,
+            # Extract reasoning content directly from the original text
+            reasoning_content, processed_content = (
                 self.reasoning_parser.extract_reasoning_content(text)
             )
-
+            # Use the processed content (without thinking tags) for tool parsing
+            if processed_content:
+                text = processed_content
 
+        # Then, extract tool calls from the processed text (without thinking tags)
         tool_calls = []
         failed_contents = []
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls(c)
         else:
-            text = c["choices"][0]["text"]
             tool_result = self.tool_parser.extract_tool_calls(text)
-
+
+        # Process tool results
+        for tool_content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
@@ -911,25 +920,31 @@ class ChatModelMixin:
                     }
                 )
             else:
-                if
-                    failed_contents.append(
-        finish_reason = "tool_calls" if tool_calls else "stop"
+                if tool_content:
+                    failed_contents.append(tool_content)
 
-
+        # Determine the final content
+        if tool_calls:
+            # For tool calls, the main content should be empty or contain only non-tool parts
+            content = "".join(failed_contents) if failed_contents else ""
+        else:
+            # For non-tool calls, use the processed content from reasoning parser
+            content = text
+
+        finish_reason = "tool_calls" if tool_calls else "stop"
 
         m = {
             "role": "assistant",
-            "content": content
+            "content": content,
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
         if reasoning_content is not None:
            m["reasoning_content"] = reasoning_content
 
-
-
-
-        except Exception:
+        # For tool completion chunks, use actual usage values when available
+        usage = c.get("usage")
+        if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
             usage = {
                 "prompt_tokens": -1,
                 "completion_tokens": -1,
@@ -1009,7 +1024,8 @@ class ChatModelMixin:
                 completion_chunk, self.reasoning_parser, previous_texts
             )
             if (
-                "reasoning_content" in chat_chunk["choices"][0]["delta"]
+                chat_chunk["choices"]
+                and "reasoning_content" in chat_chunk["choices"][0]["delta"]
                 and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
             ):
                 yield chat_chunk
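The utils.py changes above extend Qwen-style tool calling to the Qwen3-VL, Qwen3-Next, and Qwen3-Omni families and make streaming chunk handling tolerant of empty `choices`. As an illustration only, a tool-call request against an already-launched Qwen3-family model might look like the sketch below; the model UID, tool schema, and parameters are assumptions, not taken from the diff.

# Illustrative sketch; assumes a Qwen3-family chat model is already launched.
from xinference.client import Client

client = Client("http://localhost:9997")        # assumed endpoint
model = client.get_model("my-qwen3-model-uid")  # hypothetical model UID

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool
            "description": "Look up the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

resp = model.chat(
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    generate_config={"max_tokens": 128},
)
# When the Qwen tool parser finds a <tool_call> block, finish_reason is set to
# "tool_calls" and the call appears under message.tool_calls rather than content.
print(resp["choices"][0])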