xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +473 -31
- xinference/client/restful/async_restful_client.py +178 -8
- xinference/client/restful/restful_client.py +151 -3
- xinference/core/supervisor.py +99 -53
- xinference/core/worker.py +10 -0
- xinference/deploy/cmdline.py +15 -0
- xinference/model/audio/core.py +21 -6
- xinference/model/audio/indextts2.py +166 -0
- xinference/model/audio/model_spec.json +58 -21
- xinference/model/image/model_spec.json +159 -90
- xinference/model/image/stable_diffusion/core.py +13 -4
- xinference/model/llm/__init__.py +6 -2
- xinference/model/llm/llm_family.json +1299 -174
- xinference/model/llm/mlx/distributed_models/core.py +41 -0
- xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
- xinference/model/llm/sglang/core.py +44 -11
- xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
- xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
- xinference/model/llm/transformers/chatglm.py +3 -0
- xinference/model/llm/transformers/core.py +129 -36
- xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
- xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
- xinference/model/llm/transformers/utils.py +23 -0
- xinference/model/llm/utils.py +48 -32
- xinference/model/llm/vllm/core.py +207 -72
- xinference/model/utils.py +74 -31
- xinference/thirdparty/audiotools/__init__.py +10 -0
- xinference/thirdparty/audiotools/core/__init__.py +4 -0
- xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
- xinference/thirdparty/audiotools/core/display.py +194 -0
- xinference/thirdparty/audiotools/core/dsp.py +390 -0
- xinference/thirdparty/audiotools/core/effects.py +647 -0
- xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
- xinference/thirdparty/audiotools/core/loudness.py +320 -0
- xinference/thirdparty/audiotools/core/playback.py +252 -0
- xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
- xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
- xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
- xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
- xinference/thirdparty/audiotools/core/util.py +671 -0
- xinference/thirdparty/audiotools/core/whisper.py +97 -0
- xinference/thirdparty/audiotools/data/__init__.py +3 -0
- xinference/thirdparty/audiotools/data/datasets.py +517 -0
- xinference/thirdparty/audiotools/data/preprocess.py +81 -0
- xinference/thirdparty/audiotools/data/transforms.py +1592 -0
- xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
- xinference/thirdparty/audiotools/metrics/distance.py +131 -0
- xinference/thirdparty/audiotools/metrics/quality.py +159 -0
- xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
- xinference/thirdparty/audiotools/ml/__init__.py +5 -0
- xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
- xinference/thirdparty/audiotools/ml/decorators.py +440 -0
- xinference/thirdparty/audiotools/ml/experiment.py +90 -0
- xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
- xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
- xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
- xinference/thirdparty/audiotools/post.py +140 -0
- xinference/thirdparty/audiotools/preference.py +600 -0
- xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
- xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
- xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
- xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
- xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
- xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
- xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
- xinference/thirdparty/indextts/__init__.py +0 -0
- xinference/thirdparty/indextts/cli.py +65 -0
- xinference/thirdparty/indextts/gpt/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
- xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
- xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
- xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
- xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
- xinference/thirdparty/indextts/gpt/model.py +713 -0
- xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
- xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
- xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
- xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
- xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
- xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
- xinference/thirdparty/indextts/infer.py +690 -0
- xinference/thirdparty/indextts/infer_v2.py +739 -0
- xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
- xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
- xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
- xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
- xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
- xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
- xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
- xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
- xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
- xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
- xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
- xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
- xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
- xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
- xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
- xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
- xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
- xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
- xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
- xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
- xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
- xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
- xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
- xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
- xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
- xinference/thirdparty/indextts/utils/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/arch_util.py +120 -0
- xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
- xinference/thirdparty/indextts/utils/common.py +121 -0
- xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
- xinference/thirdparty/indextts/utils/front.py +536 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
- xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
- xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
- xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
- xinference/thirdparty/indextts/utils/text_utils.py +41 -0
- xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
- xinference/thirdparty/indextts/utils/utils.py +93 -0
- xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
- xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
- xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
- xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
- xinference/thirdparty/melo/text/chinese_mix.py +2 -2
- xinference/types.py +9 -0
- xinference/ui/gradio/media_interface.py +66 -8
- xinference/ui/web/ui/build/asset-manifest.json +6 -6
- xinference/ui/web/ui/build/index.html +1 -1
- xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
- xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
- xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
- xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
- xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
- xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
- xinference/ui/web/ui/package-lock.json +0 -34
- xinference/ui/web/ui/package.json +0 -1
- xinference/ui/web/ui/src/locales/en.json +9 -3
- xinference/ui/web/ui/src/locales/ja.json +9 -3
- xinference/ui/web/ui/src/locales/ko.json +9 -3
- xinference/ui/web/ui/src/locales/zh.json +9 -3
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
- xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
- xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
- xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
- xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
- xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
- xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
- xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
- xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
- xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
- xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
- xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
- xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
- xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
- xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
- xinference/ui/web/ui/node_modules/select/bower.json +0 -13
- xinference/ui/web/ui/node_modules/select/package.json +0 -29
- xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/mlx/distributed_models/core.py

@@ -162,3 +162,44 @@ class DistributedModelMixin:
         self.layers = self.layers[: self.end_idx]
         self.layers[: self.start_idx] = [None] * self.start_idx
         self.num_layers = len(self.layers) - self.start_idx
+
+
+class SafeKVCache:
+    """
+    A safe wrapper around mlx_lm's KVCache that handles None keys gracefully.
+    This is needed because mlx_lm's generate function accesses cache.state
+    before the cache is properly initialized.
+    """
+
+    def __init__(self):
+        from mlx_lm.models.cache import KVCache
+
+        self._cache = KVCache()
+
+    @property
+    def state(self):
+        # Safe access to state property
+        if self._cache.keys is None:
+            return None, None
+        if self._cache.offset == self._cache.keys.shape[2]:
+            return self._cache.keys, self._cache.values
+        else:
+            return (
+                self._cache.keys[..., : self._cache.offset, :],
+                self._cache.values[..., : self._cache.offset, :],
+            )
+
+    @state.setter
+    def state(self, v):
+        # Safe setter for state property
+        if v is None or v[0] is None:
+            self._cache.keys = None
+            self._cache.values = None
+            self._cache.offset = 0
+        else:
+            self._cache.keys, self._cache.values = v
+            self._cache.offset = self._cache.keys.shape[2]
+
+    def __getattr__(self, name):
+        # Delegate all other attributes and methods to the underlying cache
+        return getattr(self._cache, name)
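Note: the wrapper works because the `state` property shadows raw attribute access while `__getattr__` forwards every other lookup to the wrapped `KVCache`. A minimal self-contained sketch of that delegation pattern (a stand-in stub instead of the real mlx_lm class, so it runs without MLX installed; all names here are illustrative, not from the package):

    class _StubKVCache:
        # Stand-in for mlx_lm.models.cache.KVCache: keys start as None.
        def __init__(self):
            self.keys = None
            self.offset = 0

        def reset(self):
            self.keys, self.offset = None, 0

    class SafeWrapper:
        def __init__(self):
            self._cache = _StubKVCache()

        @property
        def state(self):
            # Tolerate an uninitialized cache instead of failing on None keys
            if self._cache.keys is None:
                return None, None
            return self._cache.keys, self._cache.offset

        def __getattr__(self, name):
            # Only invoked for attributes not found on SafeWrapper itself, so
            # reads like `w.offset` and calls like `w.reset()` fall through
            return getattr(self._cache, name)

    w = SafeWrapper()
    print(w.state)   # (None, None) -- safe before any tokens are cached
    w.reset()        # delegated to _StubKVCache.reset via __getattr__

`__getattr__` (unlike `__getattribute__`) fires only when normal lookup fails, which is what makes the selective override of `state` work.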
xinference/model/llm/mlx/distributed_models/qwen2.py

@@ -46,11 +46,10 @@ class Qwen2Model(_Qwen2Model, DistributedModelMixin):
 
         pipeline_rank = self.rank
         pipeline_size = self.world_size
-        if mask is None:
-            mask = create_attention_mask(h, cache)
 
         if cache is None:
             cache = [None] * self.num_layers
+        mask = create_attention_mask(h, cache[0])
 
         # Receive from the previous process in the pipeline
 
xinference/model/llm/sglang/core.py

@@ -73,6 +73,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
     stream: bool
     stream_options: Optional[Union[dict, None]]
     json_schema: Optional[dict]
+    response_format: dict
 
 
 try:
@@ -317,13 +318,16 @@ class SGLANGModel(LLM):
         stream_options = generate_config.get("stream_options")
         generate_config.setdefault("stream_options", stream_options)
         generate_config.setdefault("ignore_eos", False)
-        [… 7 removed lines not rendered in source diff; visible fragment: .pop("json_schema", …]
+        response_format = generate_config.pop("response_format", None)
+        if response_format:
+            json_schema_config = response_format.pop("json_schema", None)
+            json_schema = None
+            if "schema_" in json_schema_config:
+                json_schema = json_schema_config.pop("schema_")
+            elif "schema" in json_schema_config:
+                json_schema = json_schema_config.pop("schema")
+            if json_schema:
+                generate_config.setdefault("json_schema", json.dumps(json_schema))  # type: ignore
 
         return generate_config
 
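Note: this step bridges the OpenAI-style `response_format` field to SGLang's flat `json_schema` parameter; the `schema_`/`schema` fallback exists because pydantic v2 serializes the reserved field name `schema` as `schema_`. A standalone sketch of the same mapping (the input values are hypothetical, and this is a mirror of the hunk, not the shipped function):

    import json

    def map_response_format(generate_config: dict) -> dict:
        # Lift response_format["json_schema"]["schema"] into a flat,
        # JSON-encoded generate_config["json_schema"], as in the hunk above.
        response_format = generate_config.pop("response_format", None)
        if response_format:
            json_schema_config = response_format.pop("json_schema", None)
            json_schema = None
            if "schema_" in json_schema_config:      # pydantic v2 alias
                json_schema = json_schema_config.pop("schema_")
            elif "schema" in json_schema_config:
                json_schema = json_schema_config.pop("schema")
            if json_schema:
                generate_config.setdefault("json_schema", json.dumps(json_schema))
        return generate_config

    cfg = {
        "max_new_tokens": 64,
        "response_format": {
            "type": "json_schema",
            "json_schema": {"name": "city", "schema": {"type": "object"}},
        },
    }
    print(map_response_format(cfg)["json_schema"])   # {"type": "object"}

As reconstructed, the code assumes `json_schema` is present whenever `response_format` is; a `response_format` of `{"type": "json_object"}` with no `json_schema` key would make the `"schema_" in json_schema_config` test raise a TypeError on None.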
@@ -356,22 +360,38 @@ class SGLANGModel(LLM):
 
     @staticmethod
     def _convert_state_to_completion_chunk(
-        request_id: str, model: str, output_text: str
+        request_id: str, model: str, output_text: str, meta_info: Dict
     ) -> CompletionChunk:
+        finish_reason_raw = meta_info.get("finish_reason", None)
+        finish_reason: Optional[str] = None
+        if isinstance(finish_reason_raw, dict) and "type" in finish_reason_raw:
+            finish_reason = (
+                str(finish_reason_raw["type"])
+                if finish_reason_raw["type"] is not None
+                else None
+            )
+        elif isinstance(finish_reason_raw, str):
+            finish_reason = finish_reason_raw
         choices: List[CompletionChoice] = [
             CompletionChoice(
                 text=output_text,
                 index=0,
                 logprobs=None,
-                finish_reason=…  [value elided in source diff]
+                finish_reason=finish_reason,
             )
         ]
+        usage = CompletionUsage(
+            prompt_tokens=meta_info["prompt_tokens"],
+            completion_tokens=meta_info["completion_tokens"],
+            total_tokens=meta_info["prompt_tokens"] + meta_info["completion_tokens"],
+        )
         chunk = CompletionChunk(
             id=request_id,
             object="text_completion",
             created=int(time.time()),
             model=model,
             choices=choices,
+            usage=usage,
         )
         return chunk
 
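Note: SGLang reports `finish_reason` either as a structured dict (e.g. `{"type": "stop", ...}`) or as a bare string, and both converters now normalize it the same way. A compact sketch of that normalization (input shapes assumed from the hunk, extracted here for illustration):

    from typing import Optional

    def normalize_finish_reason(meta_info: dict) -> Optional[str]:
        # Accept {"type": "stop"}-style dicts, bare strings, or absence.
        raw = meta_info.get("finish_reason", None)
        if isinstance(raw, dict) and "type" in raw:
            return str(raw["type"]) if raw["type"] is not None else None
        if isinstance(raw, str):
            return raw
        return None

    print(normalize_finish_reason({"finish_reason": {"type": "stop"}}))  # stop
    print(normalize_finish_reason({"finish_reason": "length"}))          # length
    print(normalize_finish_reason({}))                                   # None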
@@ -379,12 +399,22 @@ class SGLANGModel(LLM):
     def _convert_state_to_completion(
         request_id: str, model: str, output_text: str, meta_info: Dict
     ) -> Completion:
+        finish_reason_raw = meta_info.get("finish_reason", None)
+        finish_reason: Optional[str] = None
+        if isinstance(finish_reason_raw, dict) and "type" in finish_reason_raw:
+            finish_reason = (
+                str(finish_reason_raw["type"])
+                if finish_reason_raw["type"] is not None
+                else None
+            )
+        elif isinstance(finish_reason_raw, str):
+            finish_reason = finish_reason_raw
         choices = [
             CompletionChoice(
                 text=output_text,
                 index=0,
                 logprobs=None,
-                finish_reason=…  [value elided in source diff]
+                finish_reason=finish_reason,
             )
         ]
 
@@ -513,7 +543,10 @@ class SGLANGModel(LLM):
             prompt, image_data, **sanitized_generate_config
         ):
             chunk = self._convert_state_to_completion_chunk(
-                request_id, …  [rest of line elided in source diff]
+                request_id,
+                self.model_uid,
+                output_text=out,
+                meta_info=meta_info,
             )
             complete_response += out
             finish_reason = meta_info["finish_reason"]
xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py

@@ -23,12 +23,27 @@ class DeepseekR1ToolParser(ToolParser):
         Initialize the DeepSeek R1 tool parser.
         """
         super().__init__()
+
+        # Sentinel tokens for streaming mode
+        self.think_start_token: str = "<think>"
+        self.think_end_token: str = "</think>"
+        self.tool_call_start_token: str = "<|tool▁call▁begin|>"
+        self.tool_call_end_token: str = "<|tool▁call▁end|>"
+
         # Regex pattern to match DeepSeek R1 tool call format
         self.tool_calls_regex = (
             r"<\|tool▁call▁begin|>function<\|tool▁sep|>([^\n]+)\n"
             r"```json\n(.*?)\n```<\|tool▁call▁end|>"
         )
 
+        # Regex pattern to match the entire tool-calls wrapper block.
+        # We intentionally do NOT match <think> blocks here so that the
+        # "text before" chunk will include both the think block and any
+        # narrative text up to the tool calls wrapper, yielding exactly two
+        # blocks when there is a single tool calls section:
+        # [before_text_including_think, tool_calls_wrapper_block]
+        self.content_regex = r"(<\|tool▁calls▁begin|>.*?<\|tool▁calls▁end|>)"
+
     def extract_tool_calls(
         self, model_output: str
     ) -> List[Tuple[Optional[str], Optional[str], Optional[dict]]]:
@@ -56,49 +71,96 @@ class DeepseekR1ToolParser(ToolParser):
         >>> print(result)
         [(None, 'get_current_weather', {'location': 'Beijing'})]
         """
-
-        if not …  [rest of line elided in source diff]
-        # No tool calls found, return the original output as content
+        # If no tool call tokens, return original output as content
+        if self.tool_call_start_token not in model_output:
             return [(model_output, None, None)]
 
+        # Get all content blocks (text, thinking blocks, tool calls)
+        function_calls = self._get_function_calls(model_output)
+
         # Use set for deduplication of identical tool calls
         tool_calls = set()
         results: List[Tuple[Optional[str], Optional[str], Optional[dict]]] = []
 
-        for …  [10 more removed lines not rendered in source diff]
+        for content_block in function_calls:
+            # Check if this block is a tool call
+            if (
+                self.tool_call_start_token in content_block
+                and self.tool_call_end_token in content_block
+            ):
+                # Extract function name and arguments from tool call block
+                matches = re.findall(self.tool_calls_regex, content_block, re.DOTALL)
+                if not matches:
+                    # Malformed tool call, treat as regular content
+                    results.append((content_block, None, None))
+                    continue
+
+                func_name, raw_json = matches[0]  # Take the first match
+
+                func_and_args = None
+                try:
+                    # Parse JSON arguments
+                    func_and_args = json.loads(raw_json)
+                    # Create hashable representation for deduplication
+                    arguments_hashable = frozenset(func_and_args.items())
+                    tool_call_tuple = (
+                        None,  # No content error
+                        func_name,
+                        func_and_args,
+                    )
+                except Exception as e:
+                    # JSON parsing failed, treat as raw content
+                    logger.warning(
+                        f"Failed to parse tool call JSON: {raw_json}, error: {e}"
+                    )
+                    tool_call_tuple = (raw_json, None, None)
+                    arguments_hashable = None
+
+                # Create deduplication key
+                dedup_key = (
+                    (func_name, arguments_hashable)
+                    if func_and_args is not None
+                    else raw_json
                 )
-            except Exception as e:
-                # JSON parsing failed, treat as raw content
-                logger.warning(
-                    f"Failed to parse tool call JSON: {raw_json}, error: {e}"
-                )
-                tool_call_tuple = (raw_json, None, None)
-                arguments_hashable = None
-
-            # Create deduplication key
-            dedup_key = (
-                (func_name, arguments_hashable)
-                if func_and_args is not None
-                else raw_json
-            )
 
-        [… 4 removed lines not rendered in source diff …]
+                # Add to results if not already seen
+                if dedup_key not in tool_calls:
+                    tool_calls.add(dedup_key)
+                    results.append(tool_call_tuple)
+            else:
+                # This is regular content (text or thinking block), add as-is
+                if content_block.strip():  # Only add non-empty content
+                    results.append((content_block, None, None))
 
         return results
 
+    def _get_function_calls(self, model_output: str) -> List[str]:
+        """
+        Extract all function calls and content blocks from model output.
+
+        Parses the model output to separate thinking blocks, tool calls,
+        and regular content into individual components.
+
+        Args:
+            model_output (str): The complete model output to parse.
+
+        Returns:
+            List[str]: List of content blocks (text, thinking blocks, tool calls).
+        """
+        functions_calls = []
+        last_end = 0
+        for m in re.finditer(self.content_regex, model_output, re.DOTALL):
+            # Add any text before the current match
+            if m.start() > last_end:
+                functions_calls.append(model_output[last_end : m.start()])
+            # Add the matched content (think or tool_call block)
+            functions_calls.append(m.group(0))
+            last_end = m.end()
+        # Add any remaining text after the last match
+        if last_end < len(model_output):
+            functions_calls.append(model_output[last_end:])
+        return functions_calls
+
     def extract_tool_calls_streaming(
         self, previous_text: List[str], current_text: str, delta_text: str
     ) -> Optional[Any]:
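Note: `_get_function_calls` is a generic splitter: every match of `content_regex` becomes its own block, and the text between matches (including any `<think>` block) is kept verbatim. A runnable sketch of the same splitting; the regex below escapes both `|` delimiters, unlike the regex as rendered in the source above, and the sample output string is invented:

    import re

    content_regex = r"(<\|tool▁calls▁begin\|>.*?<\|tool▁calls▁end\|>)"

    sample = (
        "<think>user wants weather</think>I will call a tool.\n"
        "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather\n"
        '```json\n{"location": "Beijing"}\n```'
        "<|tool▁call▁end|><|tool▁calls▁end|>"
    )

    blocks, last_end = [], 0
    for m in re.finditer(content_regex, sample, re.DOTALL):
        if m.start() > last_end:
            blocks.append(sample[last_end : m.start()])  # text before the wrapper
        blocks.append(m.group(0))                        # the tool-calls wrapper itself
        last_end = m.end()
    if last_end < len(sample):
        blocks.append(sample[last_end:])                 # trailing text, if any

    print(len(blocks))  # 2: [before_text_including_think, tool_calls_wrapper_block]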
xinference/model/llm/tool_parsers/qwen_tool_parser.py

@@ -59,10 +59,28 @@ class QwenToolParser(ToolParser):
         Returns:
             str: Extracted JSON string or original string if no match found.
         """
+        # First try to find complete tool calls
         function_calls = self.tool_call_complete_regex.findall(function_call_str)
-        if len(function_calls) …  [rest of line elided in source diff]
-        return …  [rest of line elided in source diff]
-        [… 1 removed line not rendered in source diff …]
+        if len(function_calls) > 0:
+            return function_calls[-1]
+
+        # If no complete tool calls found, try to extract from incomplete tool calls
+        # Handle cases like <tool_call><tool_call>_city
+        if self.tool_call_start_token in function_call_str:
+            # Extract content between the last tool_call start token and end of string
+            last_start = function_call_str.rfind(self.tool_call_start_token)
+            potential_json = function_call_str[
+                last_start + len(self.tool_call_start_token) :
+            ]
+            # Remove any trailing tool_call end tokens
+            if self.tool_call_end_token in potential_json:
+                potential_json = potential_json.split(self.tool_call_end_token)[0]
+            # Clean up any extra whitespace
+            potential_json = potential_json.strip()
+            if potential_json:
+                return potential_json
+
+        return function_call_str
 
     def _parse_json_function_call_stream(
         self,
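Note: the fallback path salvages arguments from unterminated model output such as a doubled start token with no closing tag, by slicing after the last start token. A self-contained sketch (the `<tool_call>`/`</tool_call>` token values are assumptions here; the parser defines its own):

    tool_call_start_token = "<tool_call>"
    tool_call_end_token = "</tool_call>"

    def salvage(function_call_str: str) -> str:
        # Mirror of the incomplete-call branch above: slice after the LAST
        # start token, trim any end token, and return the remainder.
        if tool_call_start_token in function_call_str:
            last_start = function_call_str.rfind(tool_call_start_token)
            potential_json = function_call_str[
                last_start + len(tool_call_start_token) :
            ]
            if tool_call_end_token in potential_json:
                potential_json = potential_json.split(tool_call_end_token)[0]
            potential_json = potential_json.strip()
            if potential_json:
                return potential_json
        return function_call_str

    print(salvage('<tool_call><tool_call>{"name": "get_city"'))
    # {"name": "get_city"

Using `rfind` matters: with doubled start tokens, slicing from the first occurrence would leave a stray `<tool_call>` prefix in front of the JSON.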
@@ -229,7 +247,14 @@ class QwenToolParser(ToolParser):
         try:
             parsed_json = self._parse_json_function_call(function_call)
             res = json.loads(parsed_json, strict=False)
-            [… 1 removed line not rendered in source diff …]
+            # Validate that we have the required fields
+            if "name" in res and "arguments" in res:
+                results.append((None, res["name"], res["arguments"]))
+            else:
+                logger.warning(
+                    "Invalid tool call format, missing required fields: %s", res
+                )
+                results.append((function_call, None, None))
         except Exception as e:
             logger.error(
                 "Can't parse single qwen tool call output: %s. Error: %s",
xinference/model/llm/transformers/chatglm.py

@@ -472,6 +472,9 @@ class ChatglmPytorchChatModel(PytorchChatModel):
                 r.prompt = self._process_messages(
                     r.prompt, tools=tools, tool_choice=tool_choice
                 )
+                assert isinstance(
+                    r.prompt, list
+                ), "r.prompt must be a list after processing"
                 r.full_prompt = self.get_full_context(
                     r.prompt,
                     self.model_family.chat_template,  # type: ignore
xinference/model/llm/transformers/core.py

@@ -48,6 +48,7 @@ from ..utils import (
 )
 from .utils import (
     _get_pad_param,
+    convert_to_cache_cls,
     get_context_length,
     get_max_src_len,
     pad_prefill_tokens,
@@ -548,31 +549,48 @@ class PytorchModel(LLM):
         So we need pad `0` on the left again.
         """
         data = []
-        [… 1 removed line not rendered in source diff …]
+        # For decode phase, attention mask should match the full KV cache sequence length
+        # All requests in batch should have attention mask of length `seq_length`
+        for r in reqs:
+            # Get the actual sequence length for this request from its tracking
+            if "attention_mask_seq_len" not in r.extra_kwargs:
+                # Initialize with the current sequence length (full KV cache length)
+                r.extra_kwargs["attention_mask_seq_len"] = seq_length
+            else:
+                # Use the previously tracked length, but ensure it doesn't exceed current seq_length
+                tracked_len = r.extra_kwargs["attention_mask_seq_len"]
+                r.extra_kwargs["attention_mask_seq_len"] = min(tracked_len, seq_length)
+
+        # For decode phase after KV cache merge, all requests should have attention mask
+        # that matches the merged sequence length
         for r in reqs:
-            r.extra_kwargs["attention_mask_seq_len"] += 1
             real_len = r.extra_kwargs["attention_mask_seq_len"]
-            pad_len = max_len - real_len
 
-            [… 11 removed lines not rendered in source diff …]
+            # The attention mask should cover the full sequence length
+            if real_len < seq_length:
+                # Pad with zeros on the left to reach full sequence length
+                pad_len = seq_length - real_len
+
+                if self._tokenizer.padding_side == "left":
+                    x = torch.cat(
+                        [
+                            torch.full((pad_len,), 0, dtype=torch.long),
+                            torch.ones((real_len,), dtype=torch.long),
+                        ]
+                    )
+                else:
+                    x = torch.cat(
+                        [
+                            torch.ones((real_len,), dtype=torch.long),
+                            torch.full((pad_len,), 0, dtype=torch.long),
+                        ]
+                    )
             else:
-                [… 3 removed lines not rendered in source diff …]
-                torch.full((pad_len,), 0, dtype=torch.long),
-                ]
-            )
+                # Already at correct length
+                x = torch.ones((real_len,), dtype=torch.long)
+
             data.append(x)
+
         return torch.stack(data).to(self._device)
 
     def build_prefill_position_ids(
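Note: the rewritten mask builder pads every row out to the merged KV-cache length `seq_length`, honoring the tokenizer's padding side instead of assuming left padding. A standalone sketch of the row construction (lengths are hypothetical; `padding_side` stands in for `self._tokenizer.padding_side`):

    import torch

    def build_decode_masks(real_lens, seq_length, padding_side="left"):
        # One mask row per request: real_len ones, zero-padded to seq_length.
        rows = []
        for real_len in real_lens:
            pad_len = seq_length - real_len
            ones = torch.ones((real_len,), dtype=torch.long)
            zeros = torch.full((pad_len,), 0, dtype=torch.long)
            parts = [zeros, ones] if padding_side == "left" else [ones, zeros]
            rows.append(torch.cat(parts))
        return torch.stack(rows)

    print(build_decode_masks([3, 5], seq_length=5))
    # tensor([[0, 0, 1, 1, 1],
    #         [1, 1, 1, 1, 1]])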
@@ -713,30 +731,105 @@ class PytorchModel(LLM):
         from torch.nn.functional import pad
         from transformers import DynamicCache
 
+        # Handle case where past_cache is None
+        if past_cache is None:
+            return new_cache
+
+        # Convert both caches to DynamicCache if not already
+        if not isinstance(past_cache, DynamicCache):
+            past_cache = convert_to_cache_cls(past_cache)
+        if not isinstance(new_cache, DynamicCache):
+            new_cache = convert_to_cache_cls(new_cache)
+
         _, seq_len_idx = self.get_batch_size_and_seq_len_indexes_from_kv()
-        [… 2 removed lines not rendered in source diff …]
+
+        # Handle empty caches
+        if len(past_cache) == 0:
+            return new_cache
+        if len(new_cache) == 0:
+            return past_cache
+
+        # Get first layer seq_len safely
+        past_first = past_cache[0] if len(past_cache) > 0 else (None, None)
+        new_first = new_cache[0] if len(new_cache) > 0 else (None, None)
+
+        if past_first[0] is None or past_first[1] is None:
+            return new_cache
+        if new_first[0] is None or new_first[1] is None:
+            return past_cache
+
+        past_seq_len = past_first[0].shape[seq_len_idx]
+        new_seq_len = new_first[0].shape[seq_len_idx]
+
+        # Pad the shorter cache
         if past_seq_len != new_seq_len:
-            [… 2 removed lines not rendered in source diff …]
+            if past_seq_len > new_seq_len:
+                padding_target = new_cache
+                padding_len = past_seq_len - new_seq_len
+            else:
+                padding_target = past_cache
+                padding_len = new_seq_len - past_seq_len
+
             pad_param = _get_pad_param(seq_len_idx, padding_len)
             for idx in range(len(padding_target)):
                 k = padding_target.key_cache[idx]
                 v = padding_target.value_cache[idx]
-                [… 3 removed lines not rendered in source diff …]
-                padding_target.value_cache[idx] = _v
+                if k is not None and v is not None:
+                    padding_target.key_cache[idx] = pad(k, pad_param)
+                    padding_target.value_cache[idx] = pad(v, pad_param)
 
+        # Merge caches
         ret_kv = DynamicCache()
-        [… 8 removed lines not rendered in source diff …]
+        max_layers = max(len(past_cache), len(new_cache))
+
+        for idx in range(max_layers):
+            past_k = past_cache.key_cache[idx] if idx < len(past_cache) else None
+            past_v = past_cache.value_cache[idx] if idx < len(past_cache) else None
+            new_k = new_cache.key_cache[idx] if idx < len(new_cache) else None
+            new_v = new_cache.value_cache[idx] if idx < len(new_cache) else None
+
+            if past_k is not None and new_k is not None:
+                # Both layers exist - validate tensor dimensions before concatenation
+                if past_k.dim() != new_k.dim():
+                    logger.error(
+                        f"KV cache tensor dimension mismatch at layer {idx}: "
+                        f"past_k.dim()={past_k.dim()}, new_k.dim()={new_k.dim()}"
+                    )
+                    # Use the cache with higher batch size
+                    if past_k.shape[0] >= new_k.shape[0]:
+                        ret_kv.update(past_k, past_v, idx)
+                    else:
+                        ret_kv.update(new_k, new_v, idx)
+                    continue
+
+                if past_k.shape[1:] == new_k.shape[1:]:
+                    # Shapes are compatible, concatenate along batch dimension
+                    ret_kv.update(
+                        torch.cat((new_k, past_k), 0).contiguous(),
+                        torch.cat((new_v, past_v), 0).contiguous(),
+                        idx,
+                    )
+                else:
+                    # Detailed logging for shape mismatch
+                    logger.warning(
+                        f"KV cache shape mismatch at layer {idx}: "
+                        f"past_k.shape={past_k.shape}, new_k.shape={new_k.shape}. "
+                        f"This may be due to inconsistent batch sizes in continuous batching."
+                    )
+
+                    # Choose the cache with larger batch size to preserve more data
+                    if past_k.shape[0] >= new_k.shape[0]:
+                        ret_kv.update(past_k, past_v, idx)
+                    else:
+                        ret_kv.update(new_k, new_v, idx)
+            elif past_k is not None:
+                ret_kv.update(past_k, past_v, idx)
+            elif new_k is not None:
+                ret_kv.update(new_k, new_v, idx)
+            else:
+                # both None, fill with None
+                ret_kv.update(None, None, idx)
 
         return ret_kv
 
     def prepare_batch_inference(self, req_list: List[InferenceRequest]):
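Note: the merge first equalizes sequence lengths by padding along the sequence axis, then concatenates per-layer along the batch axis. A bare-tensor sketch of that rule, assuming the usual [batch, heads, seq, head_dim] layout and left padding (the real code gets the axis from `get_batch_size_and_seq_len_indexes_from_kv` and builds the tuple with `_get_pad_param`):

    import torch
    from torch.nn.functional import pad

    past_k = torch.zeros(2, 4, 7, 16)  # batch=2, seq=7
    new_k = torch.zeros(1, 4, 5, 16)   # batch=1, seq=5

    seq_dim = 2
    padding_len = past_k.shape[seq_dim] - new_k.shape[seq_dim]

    # F.pad's tuple runs from the LAST dim backwards:
    # (head_dim_left, head_dim_right, seq_left, seq_right)
    new_k = pad(new_k, (0, 0, padding_len, 0))   # -> (1, 4, 7, 16)

    merged = torch.cat((new_k, past_k), 0).contiguous()
    print(merged.shape)  # torch.Size([3, 4, 7, 16])

The shape-mismatch branch exists because continuous batching can hand back caches whose non-batch dimensions disagree; rather than crash on `torch.cat`, the code logs the mismatch and keeps the cache with the larger batch.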