xinference 1.6.0.post1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +79 -2
- xinference/client/restful/restful_client.py +65 -3
- xinference/conftest.py +0 -7
- xinference/core/media_interface.py +132 -8
- xinference/core/model.py +44 -6
- xinference/core/scheduler.py +1 -10
- xinference/core/supervisor.py +8 -17
- xinference/core/worker.py +5 -27
- xinference/deploy/cmdline.py +6 -2
- xinference/model/audio/chattts.py +24 -39
- xinference/model/audio/cosyvoice.py +18 -30
- xinference/model/audio/funasr.py +42 -0
- xinference/model/audio/model_spec.json +71 -1
- xinference/model/audio/model_spec_modelscope.json +76 -2
- xinference/model/audio/utils.py +75 -0
- xinference/model/core.py +1 -0
- xinference/model/embedding/__init__.py +74 -18
- xinference/model/embedding/core.py +98 -589
- xinference/model/embedding/embed_family.py +133 -0
- xinference/{thirdparty/omnilmm/train → model/embedding/flag}/__init__.py +1 -1
- xinference/model/embedding/flag/core.py +282 -0
- xinference/model/embedding/model_spec.json +24 -0
- xinference/model/embedding/model_spec_modelscope.json +24 -0
- xinference/model/embedding/sentence_transformers/__init__.py +13 -0
- xinference/model/embedding/sentence_transformers/core.py +399 -0
- xinference/model/embedding/vllm/core.py +95 -0
- xinference/model/image/model_spec.json +30 -3
- xinference/model/image/model_spec_modelscope.json +41 -2
- xinference/model/image/stable_diffusion/core.py +144 -53
- xinference/model/llm/__init__.py +6 -54
- xinference/model/llm/core.py +19 -5
- xinference/model/llm/llama_cpp/core.py +59 -3
- xinference/model/llm/llama_cpp/memory.py +457 -0
- xinference/model/llm/llm_family.json +247 -402
- xinference/model/llm/llm_family.py +88 -16
- xinference/model/llm/llm_family_modelscope.json +260 -421
- xinference/model/llm/llm_family_openmind_hub.json +0 -34
- xinference/model/llm/sglang/core.py +8 -0
- xinference/model/llm/transformers/__init__.py +27 -6
- xinference/model/llm/transformers/chatglm.py +4 -2
- xinference/model/llm/transformers/core.py +49 -28
- xinference/model/llm/transformers/deepseek_v2.py +6 -49
- xinference/model/llm/transformers/gemma3.py +119 -164
- xinference/model/llm/transformers/multimodal/__init__.py +13 -0
- xinference/model/llm/transformers/{cogagent.py → multimodal/cogagent.py} +58 -95
- xinference/model/llm/transformers/multimodal/core.py +205 -0
- xinference/model/llm/transformers/{deepseek_vl2.py → multimodal/deepseek_vl2.py} +59 -120
- xinference/model/llm/transformers/multimodal/gemma3.py +117 -0
- xinference/model/llm/transformers/{glm4v.py → multimodal/glm4v.py} +57 -93
- xinference/model/llm/transformers/multimodal/intern_vl.py +412 -0
- xinference/model/llm/transformers/{minicpmv26.py → multimodal/minicpmv26.py} +55 -102
- xinference/model/llm/transformers/{ovis2.py → multimodal/ovis2.py} +114 -175
- xinference/model/llm/transformers/{qwen-omni.py → multimodal/qwen-omni.py} +82 -167
- xinference/model/llm/transformers/multimodal/qwen2_audio.py +131 -0
- xinference/model/llm/transformers/{qwen2_vl.py → multimodal/qwen2_vl.py} +224 -256
- xinference/model/llm/transformers/opt.py +4 -2
- xinference/model/llm/transformers/utils.py +6 -37
- xinference/model/llm/utils.py +11 -0
- xinference/model/llm/vllm/core.py +7 -0
- xinference/model/rerank/core.py +91 -3
- xinference/model/rerank/model_spec.json +24 -0
- xinference/model/rerank/model_spec_modelscope.json +24 -0
- xinference/model/rerank/utils.py +20 -2
- xinference/model/utils.py +38 -1
- xinference/model/video/diffusers.py +65 -3
- xinference/model/video/model_spec.json +31 -4
- xinference/model/video/model_spec_modelscope.json +32 -4
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.013f296b.css +2 -0
- xinference/web/ui/build/static/css/main.013f296b.css.map +1 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js +3 -0
- xinference/web/ui/build/static/js/main.8a9e3ba0.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/34cfbfb7836e136ba3261cfd411cc554bf99ba24b35dcceebeaa4f008cb3c9dc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/55b9fb40b57fa926e8f05f31c2f96467e76e5ad62f033dca97c03f9e8c4eb4fe.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/567e49df411efb24425d289bb484758cb57067ca54f8b5c67fe4505f698deb96.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/6595880facebca7ceace6f17cf21c3a5a9219a2f52fb0ba9f3cf1131eddbcf6b.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/aa998bc2d9c11853add6b8a2e08f50327f56d8824ccaaec92d6dde1b305f0d85.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c748246b1d7bcebc16153be69f37e955bb2145526c47dd425aeeff70d3004dbc.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/e31234e95d60a5a7883fbcd70de2475dc1c88c90705df1a530abb68f86f80a51.json +1 -0
- xinference/web/ui/src/locales/en.json +21 -8
- xinference/web/ui/src/locales/ja.json +224 -0
- xinference/web/ui/src/locales/ko.json +224 -0
- xinference/web/ui/src/locales/zh.json +21 -8
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/METADATA +14 -11
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/RECORD +93 -100
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/WHEEL +1 -1
- xinference/model/llm/transformers/cogvlm2.py +0 -442
- xinference/model/llm/transformers/cogvlm2_video.py +0 -333
- xinference/model/llm/transformers/deepseek_vl.py +0 -280
- xinference/model/llm/transformers/glm_edge_v.py +0 -213
- xinference/model/llm/transformers/intern_vl.py +0 -526
- xinference/model/llm/transformers/internlm2.py +0 -94
- xinference/model/llm/transformers/minicpmv25.py +0 -193
- xinference/model/llm/transformers/omnilmm.py +0 -132
- xinference/model/llm/transformers/qwen2_audio.py +0 -179
- xinference/model/llm/transformers/qwen_vl.py +0 -360
- xinference/thirdparty/omnilmm/LICENSE +0 -201
- xinference/thirdparty/omnilmm/chat.py +0 -218
- xinference/thirdparty/omnilmm/constants.py +0 -4
- xinference/thirdparty/omnilmm/conversation.py +0 -332
- xinference/thirdparty/omnilmm/model/__init__.py +0 -1
- xinference/thirdparty/omnilmm/model/omnilmm.py +0 -595
- xinference/thirdparty/omnilmm/model/resampler.py +0 -166
- xinference/thirdparty/omnilmm/model/utils.py +0 -578
- xinference/thirdparty/omnilmm/train/train_utils.py +0 -150
- xinference/thirdparty/omnilmm/utils.py +0 -134
- xinference/web/ui/build/static/css/main.337afe76.css +0 -2
- xinference/web/ui/build/static/css/main.337afe76.css.map +0 -1
- xinference/web/ui/build/static/js/main.ae579a97.js +0 -3
- xinference/web/ui/build/static/js/main.ae579a97.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/12e02ee790dbf57ead09a241a93bb5f893393aa36628ca741d44390e836a103f.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2fdc61dcb6a9d1fbcb44be592d0e87d8c3f21297a7327559ef5345665f8343f7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3d596a3e8dd6430d7ce81d164e32c31f8d47cfa5f725c328a298754d78563e14.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5c08e2cd07809ed3e41486b16652253404cbb63a3ff8d0366ee50f57e2413cea.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/8472e58a31720892d534f3febda31f746b25ec4aa60787eef34217b074e67965.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/dc249829767b8abcbc3677e0b07b6d3ecbfdfe6d08cfe23a665eb33373a9aa9d.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/f91af913d7f91c410719ab13136aaed3aaf0f8dda06652f25c42cb5231587398.json +0 -1
- /xinference/{thirdparty/omnilmm → model/embedding/vllm}/__init__.py +0 -0
- /xinference/web/ui/build/static/js/{main.ae579a97.js.LICENSE.txt → main.8a9e3ba0.js.LICENSE.txt} +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/entry_points.txt +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/licenses/LICENSE +0 -0
- {xinference-1.6.0.post1.dist-info → xinference-1.7.0.dist-info}/top_level.txt +0 -0
xinference/model/audio/chattts.py
CHANGED

@@ -71,9 +71,10 @@ class ChatTTSModel:
         import ChatTTS
         import numpy as np
         import torch
-        import torchaudio
         import xxhash

+        from .utils import audio_stream_generator, audio_to_bytes
+
         rnd_spk_emb = None

         if len(voice) > 400:
@@ -105,44 +106,28 @@ class ChatTTSModel:
         )

         assert self._model is not None
+
+        output = self._model.infer(
+            [input], params_infer_code=params_infer_code, stream=stream
+        )
         if stream:
-            iter = self._model.infer(
-                [input], params_infer_code=params_infer_code, stream=True
-            )

-            def _generator():
-
-
-
-
-
-
-
-
-
-
-
-
-                        out.seek(last_pos)
-                        encoded_bytes = out.read()
-                        yield encoded_bytes
-                        last_pos = new_last_pos
-
-            return _generator()
+            def _gen_chunk():
+                for it in output:
+                    for chunk in it:
+                        yield chunk
+
+            return audio_stream_generator(
+                response_format=response_format,
+                sample_rate=24000,
+                output_generator=_gen_chunk(),
+                output_chunk_transformer=lambda c: torch.from_numpy(
+                    np.array([c]).transpose()
+                ),
+            )
         else:
-
-
-
-
-
-                torchaudio.save(
-                    out,
-                    torch.from_numpy(wavs[0]).unsqueeze(0),
-                    24000,
-                    format=response_format,
-                )
-            except:
-                torchaudio.save(
-                    out, torch.from_numpy(wavs[0]), 24000, format=response_format
-                )
-            return out.getvalue()
+            return audio_to_bytes(
+                response_format=response_format,
+                sample_rate=24000,
+                tensor=torch.from_numpy(output[0]).unsqueeze(0),
+            )
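Taken together, the two chattts.py hunks replace the hand-rolled torchaudio StreamWriter loop with a single self._model.infer(..., stream=stream) call whose result is handed to the shared helpers introduced in xinference/model/audio/utils.py later in this diff. The only non-obvious piece is _gen_chunk(): a streaming ChatTTS infer() yields batches that are themselves iterables of waveform chunks, and the helper flattens them. A minimal illustrative sketch (the function name below is a hypothetical stand-in, not part of the diff):

# Illustrative only: flatten the two-level structure produced by a streaming
# ChatTTS infer() call into one flat stream of waveform chunks, which is the
# shape audio_stream_generator() expects to consume.
def flatten_chunks(batches):
    for batch in batches:      # one item per streaming step
        for chunk in batch:    # one waveform chunk per prompt in the batch
            yield chunk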
xinference/model/audio/cosyvoice.py
CHANGED

@@ -13,7 +13,6 @@
 # limitations under the License.
 import io
 import logging
-from io import BytesIO
 from typing import TYPE_CHECKING, Optional

 from ..utils import set_all_random_seed
@@ -132,36 +131,25 @@ class CosyVoiceModel:
         output = self._model.inference_sft(input, voice, stream=stream)

         import torch
-        import torchaudio

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        last_pos = new_last_pos
-
-        def _generator_block():
-            chunks = [o["tts_speech"] for o in output]
-            t = torch.cat(chunks, dim=1)
-            with BytesIO() as out:
-                torchaudio.save(out, t, self._model.sample_rate, format=response_format)
-                return out.getvalue()
-
-        return _generator_stream() if stream else _generator_block()
+        from .utils import audio_stream_generator, audio_to_bytes
+
+        return (
+            audio_stream_generator(
+                response_format=response_format,
+                sample_rate=self._model.sample_rate,
+                output_generator=output,
+                output_chunk_transformer=lambda c: torch.transpose(
+                    c["tts_speech"], 0, 1
+                ),
+            )
+            if stream
+            else audio_to_bytes(
+                response_format=response_format,
+                sample_rate=self._model.sample_rate,
+                tensor=torch.cat([o["tts_speech"] for o in output], dim=1),
+            )
+        )

     def speech(
         self,
xinference/model/audio/funasr.py
CHANGED
@@ -44,6 +44,44 @@ class FunASRModel:
     def model_ability(self):
         return self._model_spec.model_ability

+    def convert_to_openai_format(self, input_data):
+        if "timestamp" not in input_data:
+            return {"task": "transcribe", "text": input_data["text"]}
+        start_time = input_data["timestamp"][0][0] / 1000
+        end_time = input_data["timestamp"][-1][1] / 1000
+        duration = end_time - start_time
+        word_timestamps = []
+        for ts in input_data["timestamp"]:
+            word_timestamps.append({"start": ts[0] / 1000, "end": ts[1] / 1000})
+        if "sentence_info" not in input_data:
+            return {
+                "task": "transcribe",
+                "text": input_data["text"],
+                "words": word_timestamps,
+                "duration": duration,
+            }
+        output = {
+            "task": "transcribe",
+            "duration": duration,
+            "text": input_data["text"],
+            "words": word_timestamps,
+            "segments": [],
+        }
+        for sentence in input_data["sentence_info"]:
+            seg_start = sentence["start"] / 1000
+            seg_end = sentence["end"] / 1000
+            output["segments"].append(
+                {
+                    "id": len(output["segments"]),
+                    "start": seg_start,
+                    "end": seg_end,
+                    "text": sentence["text"],
+                    "speaker": sentence["spk"],
+                }
+            )
+
+        return output
+
     def load(self):
         try:
             from funasr import AutoModel

@@ -103,6 +141,10 @@ class FunASRModel:

         if response_format == "json":
             return {"text": text}
+        elif response_format == "verbose_json":
+            verbose = result[0]
+            verbose["text"] = text
+            return self.convert_to_openai_format(verbose)
         else:
             raise ValueError(f"Unsupported response format: {response_format}")

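The new verbose_json path follows OpenAI's verbose transcription schema: FunASR timestamps arrive in milliseconds and convert_to_openai_format rescales them to seconds, emitting word-level timings plus per-sentence segments (with a speaker id when a spk model is configured). A worked example with made-up values:

# Hypothetical FunASR result (times in milliseconds) passed to
# convert_to_openai_format() as `input_data`:
input_data = {
    "text": "你好 世界",
    "timestamp": [[100, 400], [450, 900]],
    "sentence_info": [{"start": 100, "end": 900, "text": "你好 世界", "spk": 0}],
}

# The method then returns (times in seconds):
expected = {
    "task": "transcribe",
    "duration": 0.8,  # last end (0.9) minus first start (0.1)
    "text": "你好 世界",
    "words": [{"start": 0.1, "end": 0.4}, {"start": 0.45, "end": 0.9}],
    "segments": [
        {"id": 0, "start": 0.1, "end": 0.9, "text": "你好 世界", "speaker": 0}
    ],
}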
xinference/model/audio/model_spec.json
CHANGED

@@ -218,13 +218,83 @@
       "batch_size_s": 300
     }
   },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "26d622993683d7b0c517ee5ec9c1c8bdde76e324",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "b6d8cb81645e34056cd3dda41e5624a740587de3",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_revision": "36abd64af4392fe02bf76453bc86c081cf1ca6da",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc",
+      "spk_model": "cam++"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "seaco-paraformer-zh",
+    "model_family": "funasr",
+    "model_id": "JunHowie/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "42e6be00854cf8de0f40002794f99df2a444fa97",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
   {
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",
     "model_id": "2Noise/ChatTTS",
     "model_revision": "1a3c04a8b0651689bd9242fbb55b1f4b5a9aef84",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
xinference/model/audio/model_spec_modelscope.json
CHANGED

@@ -51,7 +51,7 @@
     "model_name": "paraformer-zh",
     "model_family": "funasr",
     "model_hub": "modelscope",
-    "model_id": "iic/speech_paraformer-
+    "model_id": "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
     "model_revision": "master",
     "model_ability": ["audio2text"],
     "multilingual": false,

@@ -63,6 +63,73 @@
       "batch_size_s": 300
     }
   },
+  {
+    "model_name": "paraformer-zh-hotword",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-long",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "paraformer-zh-spk",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc",
+      "spk_model": "cam++"
+    },
+    "default_transcription_config": {
+      "batch_size_s": 300
+    }
+  },
+  {
+    "model_name": "seaco-paraformer-zh",
+    "model_family": "funasr",
+    "model_hub": "modelscope",
+    "model_id": "iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "model_revision": "master",
+    "model_ability": ["audio2text"],
+    "multilingual": false,
+    "default_model_config": {
+      "vad_model": "fsmn-vad",
+      "punc_model": "ct-punc"
+    },
+    "default_transcription_config": {
+      "hotword": "",
+      "batch_size_s": 300
+    }
+  },
   {
     "model_name": "ChatTTS",
     "model_family": "ChatTTS",

@@ -70,7 +137,14 @@
     "model_id": "AI-ModelScope/ChatTTS",
     "model_revision": "master",
     "model_ability": ["text2audio"],
-    "multilingual": true
+    "multilingual": true,
+    "virtualenv": {
+      "packages": [
+        "ChatTTS>=0.2.1",
+        "#system_torch#",
+        "#system_numpy#"
+      ]
+    }
   },
   {
     "model_name": "CosyVoice-300M",
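With these spec entries registered on both hubs, the new FunASR variants can be launched like any other built-in audio model, and the verbose_json support added in funasr.py above becomes reachable from the client. A hedged sketch of that flow; the endpoint, file name, and exact transcriptions() signature follow the existing RESTful client documentation and were not re-verified against this release:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
# paraformer-zh-spk configures "spk_model": "cam++", so verbose_json segments
# carry a per-sentence "speaker" field.
uid = client.launch_model(model_name="paraformer-zh-spk", model_type="audio")
model = client.get_model(uid)

with open("meeting.wav", "rb") as f:
    result = model.transcriptions(f.read(), response_format="verbose_json")
print(result["segments"][0])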
xinference/model/audio/utils.py
CHANGED
@@ -13,16 +13,30 @@
 # limitations under the License.

 import io
+import logging
+import types
+import wave
+from collections.abc import Callable

 import numpy as np
+import torch

 from .core import AudioModelFamilyV1

+logger = logging.getLogger(__name__)
+

 def get_model_version(audio_model: AudioModelFamilyV1) -> str:
     return audio_model.model_name


+def _extract_pcm_from_wav_bytes(wav_bytes):
+    with io.BytesIO(wav_bytes) as wav_io:
+        with wave.open(wav_io, "rb") as wav_file:
+            num_frames = wav_file.getnframes()
+            return wav_file.readframes(num_frames)
+
+
 def ensure_sample_rate(
     audio: np.ndarray, old_sample_rate: int, sample_rate: int
 ) -> np.ndarray:

@@ -48,3 +62,64 @@ def ensure_sample_rate(
     audio, sr = sf.read(buffer, dtype="float32")

     return audio
+
+
+def audio_stream_generator(
+    response_format: str,
+    sample_rate: int,
+    output_generator: types.GeneratorType,
+    output_chunk_transformer: Callable,
+):
+    import torch
+    import torchaudio
+
+    response_pcm = response_format.lower() == "pcm"
+    with io.BytesIO() as out:
+        if response_pcm:
+            logger.info(
+                f"PCM stream output, num_channels: 1, sample_rate: {sample_rate}"
+            )
+            writer = torchaudio.io.StreamWriter(out, format="wav")
+            writer.add_audio_stream(
+                sample_rate=sample_rate, num_channels=1, format="s16"
+            )
+        else:
+            writer = torchaudio.io.StreamWriter(out, format=response_format)
+            writer.add_audio_stream(sample_rate=sample_rate, num_channels=1)
+        strip_header = True
+        last_pos = 0
+        with writer.open():
+            for chunk in output_generator:
+                trans_chunk = output_chunk_transformer(chunk)
+                if response_pcm:
+                    trans_chunk = trans_chunk.to(torch.float32)
+                    trans_chunk = (
+                        (trans_chunk * 32767).clamp(-32768, 32767).to(torch.int16)
+                    )
+                writer.write_audio_chunk(0, trans_chunk)
+                new_last_pos = out.tell()
+                if new_last_pos != last_pos:
+                    out.seek(last_pos)
+                    encoded_bytes = out.read()
+                    if response_pcm and strip_header:
+                        # http://soundfile.sapp.org/doc/WaveFormat
+                        yield _extract_pcm_from_wav_bytes(encoded_bytes)
+                        strip_header = False
+                    else:
+                        yield encoded_bytes
+                    last_pos = new_last_pos
+
+
+def audio_to_bytes(response_format: str, sample_rate: int, tensor: "torch.Tensor"):
+    import torchaudio
+
+    response_pcm = response_format.lower() == "pcm"
+    with io.BytesIO() as out:
+        if response_pcm:
+            logger.info(f"PCM output, num_channels: 1, sample_rate: {sample_rate}")
+            torchaudio.save(out, tensor, sample_rate, format="wav", encoding="PCM_S")
+            # http://soundfile.sapp.org/doc/WaveFormat
+            return _extract_pcm_from_wav_bytes(out.getvalue())
+        else:
+            torchaudio.save(out, tensor, sample_rate, format=response_format)
+            return out.getvalue()
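These two helpers are what the ChatTTS and CosyVoice hunks above now delegate to. A minimal usage sketch, assuming a mono waveform and the (frames, channels) chunk layout implied by the transformers used in those call sites:

import torch

from xinference.model.audio.utils import audio_stream_generator, audio_to_bytes

# Blocking path: tensor is (channels, samples), as torchaudio.save expects.
wav = torch.zeros(1, 24000)  # one second of silence at 24 kHz, placeholder data
wav_bytes = audio_to_bytes(response_format="wav", sample_rate=24000, tensor=wav)

# Streaming path: each transformed chunk must be (frames, channels) for
# StreamWriter.write_audio_chunk; encoded bytes are yielded incrementally.
def chunks():
    for _ in range(10):
        yield torch.zeros(2400, 1)

for encoded_piece in audio_stream_generator(
    response_format="wav",
    sample_rate=24000,
    output_generator=chunks(),
    output_chunk_transformer=lambda c: c,  # chunks are already (frames, channels)
):
    pass  # forward each already-encoded byte chunk to the HTTP response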
xinference/model/embedding/__init__.py
CHANGED
@@ -16,7 +16,7 @@ import codecs
 import json
 import os
 import warnings
-from typing import Any, Dict
+from typing import Any, Dict, List

 from .core import (
     EMBEDDING_MODEL_DESCRIPTIONS,

@@ -32,9 +32,15 @@ from .custom import (
     register_embedding,
     unregister_embedding,
 )
-
-BUILTIN_EMBEDDING_MODELS
-
+from .embed_family import (
+    BUILTIN_EMBEDDING_MODELS,
+    EMBEDDING_ENGINES,
+    FLAG_EMBEDDER_CLASSES,
+    MODELSCOPE_EMBEDDING_MODELS,
+    SENTENCE_TRANSFORMER_CLASSES,
+    SUPPORTED_ENGINES,
+    VLLM_CLASSES,
+)


 def register_custom_model():

@@ -55,12 +61,56 @@ def register_custom_model():
             warnings.warn(f"{user_defined_embedding_dir}/{f} has error, {e}")


+def generate_engine_config_by_model_name(model_spec: "EmbeddingModelSpec"):
+    model_name = model_spec.model_name
+    engines: Dict[str, List[Dict[str, Any]]] = EMBEDDING_ENGINES.get(
+        model_name, {}
+    )  # structure for engine query
+    for engine in SUPPORTED_ENGINES:
+        CLASSES = SUPPORTED_ENGINES[engine]
+        for cls in CLASSES:
+            # Every engine needs to implement match method
+            if cls.match(model_spec):
+                # we only match the first class for an engine
+                engines[engine] = [
+                    {
+                        "model_name": model_name,
+                        "embedding_class": cls,
+                    }
+                ]
+                break
+    EMBEDDING_ENGINES[model_name] = engines
+
+
+# will be called in xinference/model/__init__.py
 def _install():
-
-
-        "model_spec_modelscope.json"
+    _model_spec_json = os.path.join(os.path.dirname(__file__), "model_spec.json")
+    _model_spec_modelscope_json = os.path.join(
+        os.path.dirname(__file__), "model_spec_modelscope.json"
+    )
+    ################### HuggingFace Model List Info Init ###################
+    BUILTIN_EMBEDDING_MODELS.update(
+        dict(
+            (spec["model_name"], EmbeddingModelSpec(**spec))
+            for spec in json.load(codecs.open(_model_spec_json, "r", encoding="utf-8"))
+        )
+    )
+    for model_name, model_spec in BUILTIN_EMBEDDING_MODELS.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)
+
+    ################### ModelScope Model List Info Init ###################
+    MODELSCOPE_EMBEDDING_MODELS.update(
+        dict(
+            (spec["model_name"], EmbeddingModelSpec(**spec))
+            for spec in json.load(
+                codecs.open(_model_spec_modelscope_json, "r", encoding="utf-8")
+            )
+        )
     )
+    for model_name, model_spec in MODELSCOPE_EMBEDDING_MODELS.items():
+        MODEL_NAME_TO_REVISION[model_name].append(model_spec.model_revision)

+    # TODO: consider support more download hub in future...
     # register model description after recording model revision
     for model_spec_info in [BUILTIN_EMBEDDING_MODELS, MODELSCOPE_EMBEDDING_MODELS]:
         for model_name, model_spec in model_spec_info.items():

@@ -77,16 +127,22 @@ def _install():
                 generate_embedding_description(ud_embedding)
             )

+    from .flag.core import FlagEmbeddingModel
+    from .sentence_transformers.core import SentenceTransformerEmbeddingModel
+    from .vllm.core import VLLMEmbeddingModel

-
-
-
-
-
-
-
-
-
-
+    SENTENCE_TRANSFORMER_CLASSES.extend([SentenceTransformerEmbeddingModel])
+    FLAG_EMBEDDER_CLASSES.extend([FlagEmbeddingModel])
+    VLLM_CLASSES.extend([VLLMEmbeddingModel])
+
+    SUPPORTED_ENGINES["sentence_transformers"] = SENTENCE_TRANSFORMER_CLASSES
+    SUPPORTED_ENGINES["flag"] = FLAG_EMBEDDER_CLASSES
+    SUPPORTED_ENGINES["vllm"] = VLLM_CLASSES
+
+    # Init embedding engine
+    for model_infos in [BUILTIN_EMBEDDING_MODELS, MODELSCOPE_EMBEDDING_MODELS]:
+        for model_spec in model_infos.values():
+            generate_engine_config_by_model_name(model_spec)

-    del
+    del _model_spec_json
+    del _model_spec_modelscope_json
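For orientation, after _install() has run, EMBEDDING_ENGINES maps every built-in (and ModelScope) embedding model name to the engines whose match() accepted it, each entry holding the class to instantiate. A rough sketch of the resulting shape and a query helper; the model name below is a placeholder, and which engines actually match depends on each class's match() implementation:

from xinference.model.embedding.flag.core import FlagEmbeddingModel
from xinference.model.embedding.sentence_transformers.core import (
    SentenceTransformerEmbeddingModel,
)

# Illustrative shape only; real entries are produced by
# generate_engine_config_by_model_name() from the bundled spec JSON files.
example_engines = {
    "my-embedding-model": {
        "sentence_transformers": [
            {
                "model_name": "my-embedding-model",
                "embedding_class": SentenceTransformerEmbeddingModel,
            }
        ],
        "flag": [
            {"model_name": "my-embedding-model", "embedding_class": FlagEmbeddingModel}
        ],
    }
}


def pick_embedding_class(engines, model_name, engine):
    """Return the first embedding class registered for (model_name, engine)."""
    candidates = engines.get(model_name, {}).get(engine, [])
    return candidates[0]["embedding_class"] if candidates else None


cls = pick_embedding_class(example_engines, "my-embedding-model", "flag")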