xinference 0.12.0__py3-none-any.whl → 0.12.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +108 -14
- xinference/client/restful/restful_client.py +78 -5
- xinference/constants.py +1 -0
- xinference/core/cache_tracker.py +48 -28
- xinference/core/event.py +5 -6
- xinference/core/model.py +59 -42
- xinference/core/scheduler.py +46 -18
- xinference/core/supervisor.py +73 -24
- xinference/core/worker.py +68 -2
- xinference/deploy/cmdline.py +86 -2
- xinference/deploy/test/test_cmdline.py +19 -10
- xinference/model/audio/__init__.py +14 -1
- xinference/model/audio/core.py +12 -1
- xinference/model/audio/custom.py +6 -4
- xinference/model/audio/model_spec_modelscope.json +20 -0
- xinference/model/llm/__init__.py +34 -2
- xinference/model/llm/llm_family.json +8 -2
- xinference/model/llm/llm_family.py +86 -1
- xinference/model/llm/llm_family_csghub.json +66 -0
- xinference/model/llm/llm_family_modelscope.json +8 -2
- xinference/model/llm/pytorch/chatglm.py +41 -12
- xinference/model/llm/pytorch/core.py +128 -88
- xinference/model/llm/pytorch/glm4v.py +24 -3
- xinference/model/llm/pytorch/internlm2.py +15 -0
- xinference/model/llm/pytorch/qwen_vl.py +1 -1
- xinference/model/llm/pytorch/utils.py +69 -189
- xinference/model/llm/utils.py +27 -14
- xinference/model/llm/vllm/core.py +10 -4
- xinference/model/rerank/core.py +35 -6
- xinference/model/utils.py +8 -2
- xinference/thirdparty/ChatTTS/experimental/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/experimental/llm.py +40 -0
- xinference/thirdparty/ChatTTS/infer/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/infer/api.py +125 -0
- xinference/thirdparty/ChatTTS/model/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/model/dvae.py +155 -0
- xinference/thirdparty/ChatTTS/model/gpt.py +265 -0
- xinference/thirdparty/ChatTTS/utils/__init__.py +0 -0
- xinference/thirdparty/ChatTTS/utils/gpu_utils.py +23 -0
- xinference/thirdparty/ChatTTS/utils/infer_utils.py +141 -0
- xinference/thirdparty/ChatTTS/utils/io_utils.py +14 -0
- xinference/types.py +28 -0
- xinference/web/ui/build/asset-manifest.json +6 -6
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/css/main.4bafd904.css +2 -0
- xinference/web/ui/build/static/css/main.4bafd904.css.map +1 -0
- xinference/web/ui/build/static/js/main.b80d9c08.js +3 -0
- xinference/web/ui/build/static/js/main.b80d9c08.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/0c2fb5375667931c4a331c99e0d87dc145e8f327cea3f44d6e56f54c7c1d4020.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/131091b25d26b17cdca187d7542a21475c211138d900cf667682260e76ef9463.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/16537795de12c61903b6110c241f62a7855b2d0fc1e7c3d1faa347267f3a6893.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/17b8f071491402d70b146532358b1a612226e5dc7b3e8755a1322d27b4680cee.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/395409bd005e19d48b437c48d88e5126c7865ba9631fe98535333c952e383dc5.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/3da7d55e87882a4af923e187b1351160e34ca102f589086439c15131a227fb6e.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/43991bb67c3136863e6fb37f796466b12eb547a1465408cc77820fddafb3bed3.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/72bcecc71c5267250edeb89608859d449b586f13ff9923a5e70e7172976ec403.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/{15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json → 935efd2867664c58230378fdf2ff1ea85e58d853b7214014e20dfbca8dab7b05.json} +1 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a7109d4425e3d94ca2726fc7020fd33bf5030afd4c9cf4bf71e21776cd70646a.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/c2abe75f04ad82fba68f35ed9cbe2e287762c876684fddccccfa73f739489b65.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f28b83886159d83b84f099b05d607a822dca4dd7f2d8aa6d56fe08bab0b5b086.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/f51bf63ddaa7afd125ef2254a105789333eecc1c94fdf5157a9b88ef7ad0a5bd.json +1 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/METADATA +1 -1
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/RECORD +69 -56
- xinference/web/ui/build/static/css/main.54bca460.css +0 -2
- xinference/web/ui/build/static/css/main.54bca460.css.map +0 -1
- xinference/web/ui/build/static/js/main.551aa479.js +0 -3
- xinference/web/ui/build/static/js/main.551aa479.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1e86938a0cdf706d21e99b21f5d868fa247c0c88b26807047e26dcdc4d9a9db3.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/3e737bcdbcbc407ccd65b90e199ef0c3214b261e8e41dbf14d921384a717d9ee.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +0 -1
- /xinference/web/ui/build/static/js/{main.551aa479.js.LICENSE.txt → main.b80d9c08.js.LICENSE.txt} +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/LICENSE +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/WHEEL +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.12.0.dist-info → xinference-0.12.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/utils.py
CHANGED
@@ -17,11 +17,9 @@ import logging
 import os
 import time
 import uuid
-from threading import Thread
 from typing import Dict, Iterable, Iterator, List, Optional, Tuple
 
 import torch
-from transformers import GenerationConfig, TextIteratorStreamer
 from transformers.cache_utils import DynamicCache
 from transformers.generation.logits_process import (
     LogitsProcessorList,
@@ -126,6 +124,7 @@ def generate_stream(
     stop_str = generate_config.get("stop", None)
     stop_token_ids = generate_config.get("stop_token_ids", None) or []
     stop_token_ids.append(tokenizer.eos_token_id)
+    chunk_id = str(uuid.uuid4())
 
     logits_processor = prepare_logits_processor(
         temperature, repetition_penalty, top_p, top_k
@@ -289,7 +288,7 @@ def generate_stream(
                     text=output, index=0, logprobs=None, finish_reason=None
                 )
                 completion_chunk = CompletionChunk(
-                    id=
+                    id=chunk_id,
                     object="text_completion",
                     created=int(time.time()),
                     model=model_uid,
@@ -327,7 +326,7 @@ def generate_stream(
     )
 
     completion_chunk = CompletionChunk(
-        id=
+        id=chunk_id,
         object="text_completion",
         created=int(time.time()),
         model=model_uid,
@@ -343,7 +342,7 @@ def generate_stream(
 
     if include_usage:
         completion_chunk = CompletionChunk(
-            id=
+            id=chunk_id,
             object="text_completion",
             created=int(time.time()),
             model=model_uid,
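
The hunks above also change how streamed chunks are identified: instead of minting a fresh id for every chunk, generate_stream now creates one chunk_id per request and stamps it on every CompletionChunk it yields, so clients can group all chunks of one stream by id. A minimal sketch of the pattern, assuming a plain dict in place of xinference's CompletionChunk type:

import time
import uuid

def stream_chunks(model_uid, pieces):
    # One id for the whole stream; every chunk of this generation shares it.
    chunk_id = str(uuid.uuid4())
    for text in pieces:
        yield {
            "id": chunk_id,
            "object": "text_completion",
            "created": int(time.time()),
            "model": model_uid,
            "choices": [{"text": text, "index": 0, "finish_reason": None}],
        }

for chunk in stream_chunks("my-model", ["Hel", "lo"]):
    print(chunk["id"], chunk["choices"][0]["text"])
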
@@ -362,178 +361,6 @@ def generate_stream(
     empty_cache()
 
 
-@torch.inference_mode()
-def generate_stream_falcon(
-    model_uid,
-    model,
-    tokenizer,
-    prompt,
-    device,
-    generate_config,
-    judge_sent_end=False,
-) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]:
-    context_len = get_context_length(model.config)
-    stream_interval = generate_config.get("stream_interval", 2)
-    stream = generate_config.get("stream", False)
-    stream_options = generate_config.pop("stream_options", None)
-    include_usage = (
-        stream_options["include_usage"] if isinstance(stream_options, dict) else False
-    )
-    len_prompt = len(prompt)
-
-    temperature = float(generate_config.get("temperature", 1.0))
-    repetition_penalty = float(generate_config.get("repetition_penalty", 1.0))
-    top_p = float(generate_config.get("top_p", 1.0))
-    top_k = int(generate_config.get("top_k", 50))  # -1 means disable
-    max_new_tokens = int(generate_config.get("max_tokens", max_tokens_field.default))
-    echo = bool(generate_config.get("echo", False))
-    stop_str = generate_config.get("stop", None)
-    stop_token_ids = generate_config.get("stop_token_ids", None) or []
-    stop_token_ids.append(tokenizer.eos_token_id)
-
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    input_ids = inputs["input_ids"]
-    attention_mask = inputs["attention_mask"]
-
-    max_src_len = context_len - max_new_tokens - 8
-
-    input_ids = input_ids[-max_src_len:]  # truncate from the left
-    attention_mask = attention_mask[-max_src_len:]  # truncate from the left
-    input_echo_len = len(input_ids)
-
-    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, **decode_config)
-
-    generation_config = GenerationConfig(
-        max_new_tokens=max_new_tokens,
-        do_sample=temperature >= 1e-5,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        no_repeat_ngram_size=10,
-        top_p=top_p,
-        top_k=top_k,
-        eos_token_id=stop_token_ids,
-    )
-
-    generation_kwargs = dict(
-        inputs=input_ids,
-        attention_mask=attention_mask,
-        streamer=streamer,
-        generation_config=generation_config,
-    )
-
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    if echo:
-        # means keep the prompt
-        output = prompt
-    else:
-        output = ""
-
-    last_output_length = 0
-    for i, new_text in enumerate(streamer):
-        output += new_text
-        if i % stream_interval == 0:
-            if echo:
-                rfind_start = len_prompt
-            else:
-                rfind_start = 0
-
-            partially_stopped = False
-            if stop_str:
-                if isinstance(stop_str, str):
-                    pos = output.rfind(stop_str, rfind_start)
-                    if pos != -1:
-                        output = output[:pos]
-                    else:
-                        partially_stopped = is_partial_stop(output, stop_str)
-                elif isinstance(stop_str, Iterable):
-                    for each_stop in stop_str:
-                        pos = output.rfind(each_stop, rfind_start)
-                        if pos != -1:
-                            output = output[:pos]
-                            break
-                        else:
-                            partially_stopped = is_partial_stop(output, each_stop)
-                            if partially_stopped:
-                                break
-                else:
-                    raise ValueError("Invalid stop field type.")
-
-            if stream:
-                output = output.strip("�")
-                tmp_output_length = len(output)
-                output = output[last_output_length:]
-                last_output_length = tmp_output_length
-
-            # prevent yielding partial stop sequence
-            if not partially_stopped:
-                completion_choice = CompletionChoice(
-                    text=output, index=0, logprobs=None, finish_reason=None
-                )
-                completion_chunk = CompletionChunk(
-                    id=str(uuid.uuid1()),
-                    object="text_completion",
-                    created=int(time.time()),
-                    model=model_uid,
-                    choices=[completion_choice],
-                )
-                completion_usage = CompletionUsage(
-                    prompt_tokens=input_echo_len,
-                    completion_tokens=i,
-                    total_tokens=(input_echo_len + i),
-                )
-
-                yield completion_chunk, completion_usage
-    output = output.strip()
-
-    # finish stream event, which contains finish reason
-    if i == max_new_tokens - 1:
-        finish_reason = "length"
-    elif partially_stopped:
-        finish_reason = None
-    else:
-        finish_reason = "stop"
-
-    completion_choice = CompletionChoice(
-        text=output, index=0, logprobs=None, finish_reason=finish_reason
-    )
-    completion_chunk = CompletionChunk(
-        id=str(uuid.uuid1()),
-        object="text_completion",
-        created=int(time.time()),
-        model=model_uid,
-        choices=[completion_choice],
-    )
-    completion_usage = CompletionUsage(
-        prompt_tokens=input_echo_len,
-        completion_tokens=i,
-        total_tokens=(input_echo_len + i),
-    )
-
-    yield completion_chunk, completion_usage
-
-    if include_usage:
-        completion_chunk = CompletionChunk(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=model_uid,
-            choices=[],
-        )
-        completion_usage = CompletionUsage(
-            prompt_tokens=input_echo_len,
-            completion_tokens=i,
-            total_tokens=(input_echo_len + i),
-        )
-        yield completion_chunk, completion_usage
-
-    # clean
-    gc.collect()
-    empty_cache()
-
-
 def _get_token_from_logits(
     req: InferenceRequest, i: int, logits, temperature, repetition_penalty, top_p, top_k
 ):
@@ -568,12 +395,15 @@ def _pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]:
     return [pad] * (max_len - len(x)) + x
 
 
-def _pad_seqs_inplace(seqs: List[List[int]], pad: int):
+def _pad_seqs_inplace(seqs: List[List[int]], reqs: List[InferenceRequest], pad: int):
     max_len = max(len(seq) for seq in seqs)
     n = len(seqs)
     i = 0
     while i < n:
+        prev_seq_len = len(seqs[i])
         seqs[i] = _pad_to_max_length(seqs[i], max_len, pad)
+        padding_len = len(seqs[i]) - prev_seq_len
+        reqs[i].padding_len = padding_len
         i += 1
 
 
@@ -586,6 +416,7 @@ def get_max_src_len(context_len: int, r: InferenceRequest) -> int:
 
 def _get_completion_chunk(
     output: str,
+    chunk_id: str,
     finish_reason: Optional[str],
     model_uid: str,
     r: InferenceRequest,
@@ -601,7 +432,7 @@ def _get_completion_chunk(
         else []
     )
     completion_chunk = CompletionChunk(
-        id=
+        id=chunk_id,
        object="text_completion",
        created=int(time.time()),
        model=model_uid,
@@ -617,14 +448,18 @@ def _get_completion_chunk(
 
 
 def _get_completion(
-    output: str,
+    output: str,
+    chunk_id: str,
+    finish_reason: Optional[str],
+    model_uid: str,
+    r: InferenceRequest,
 ):
     completion_choice = CompletionChoice(
         text=output, index=0, logprobs=None, finish_reason=finish_reason
     )
 
     completion_chunk = CompletionChunk(
-        id=
+        id=chunk_id,
         object="text_completion",
         created=int(time.time()),
         model=model_uid,
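
With the new signature above, left-padding a batch also records how many pad tokens were prepended to each request (reqs[i].padding_len), which the attention-mask helper introduced later in this diff consumes. A standalone sketch of the idea, using plain dicts in place of InferenceRequest:

from typing import Dict, List

def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]:
    # left-pad so the most recent tokens stay right-aligned
    return [pad] * (max_len - len(x)) + x

def pad_seqs_recording_padding(seqs: List[List[int]], reqs: List[Dict], pad: int = 0):
    max_len = max(len(s) for s in seqs)
    for i in range(len(seqs)):
        prev_len = len(seqs[i])
        seqs[i] = pad_to_max_length(seqs[i], max_len, pad)
        # remember how much padding this request received
        reqs[i]["padding_len"] = len(seqs[i]) - prev_len

seqs = [[5, 6, 7], [9]]
reqs = [{}, {}]
pad_seqs_recording_padding(seqs, reqs)
print(seqs)  # [[5, 6, 7], [0, 0, 9]]
print(reqs)  # [{'padding_len': 0}, {'padding_len': 2}]
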
@@ -674,6 +509,25 @@ def _merge_kv_cache(
     return ret_kv.to_legacy_cache()
 
 
+def _get_attention_mask_and_position_ids(kv, reqs: List[InferenceRequest]):
+    batch_size, seq_length, device = (
+        kv[0][0].shape[0],
+        kv[0][0].shape[2],
+        kv[0][0].device,
+    )
+    seq_length = seq_length + 1
+    position_ids = torch.as_tensor([[seq_length - 1]], dtype=torch.long, device=device)
+    attention_mask = torch.ones(
+        (batch_size, seq_length), dtype=torch.long, device=device
+    )
+    padding_lens = torch.as_tensor([r.padding_len for r in reqs])
+    mask = torch.arange(seq_length).expand(
+        batch_size, seq_length
+    ) < padding_lens.unsqueeze(1)
+    attention_mask[mask] = 0
+    return attention_mask, position_ids
+
+
 @torch.inference_mode()
 def _batch_inference_one_step_internal(
     req_list: List[InferenceRequest],
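
The helper added above derives the batch size and cached sequence length from the KV cache, then zeroes out the attention-mask positions that correspond to each request's left padding. A minimal sketch of the same masking trick, assuming only a per-row padding length instead of InferenceRequest objects:

import torch

def padding_aware_attention_mask(padding_lens, seq_length):
    # positions [0, padding_len) are pad tokens and must not be attended to
    batch_size = len(padding_lens)
    attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long)
    pad = torch.as_tensor(padding_lens).unsqueeze(1)               # (batch, 1)
    positions = torch.arange(seq_length).expand(batch_size, seq_length)
    attention_mask[positions < pad] = 0
    return attention_mask

print(padding_aware_attention_mask([0, 2], seq_length=5))
# tensor([[1, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1]])
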
@@ -682,7 +536,9 @@ def _batch_inference_one_step_internal(
     tokenizer,
     device,
     context_len: int,
+    stop_tokens: Tuple[int],
     decode_round: int = 16,
+    require_attention_mask: bool = False,
     bos_flag: str = "<bos_stream>",
     eos_flag: str = "<eos_stream>",
 ):
@@ -692,7 +548,8 @@ def _batch_inference_one_step_internal(
     if not valid_req_list:
         return
     generate_config_mapping: Dict[InferenceRequest, Tuple] = {
-        r: r.get_generate_configs(tokenizer.eos_token_id)
+        r: r.get_generate_configs(tokenizer.eos_token_id, stop_tokens)
+        for r in valid_req_list
     }
     s_time = time.time()
 
@@ -701,7 +558,7 @@ def _batch_inference_one_step_internal(
     decode_reqs = []
     for r in valid_req_list:
         if r.is_prefill:
-            prompts.append(r.full_prompt)
+            prompts.append(r.full_prompt if r.full_prompt is not None else r.prompt)
            prefill_reqs.append(r)
         else:
             decode_reqs.append(r)
@@ -714,7 +571,7 @@ def _batch_inference_one_step_internal(
             max_src_len = get_max_src_len(context_len, req)
             req.prompt_tokens = input_id[-max_src_len:]
             prompt_tokens.append(req.prompt_tokens)
-        _pad_seqs_inplace(prompt_tokens, 0)
+        _pad_seqs_inplace(prompt_tokens, valid_req_list, 0)
         out = model(torch.as_tensor(prompt_tokens, device=device), use_cache=True)
 
         logits = out.logits
@@ -756,10 +613,18 @@ def _batch_inference_one_step_internal(
     # here, only decode phase, just run some rounds
     for _i in range(decode_round):
         decode_tokens: List[List[int]] = [[r.new_tokens[-1]] for r in valid_req_list]
+        inf_kws = {}
+        if require_attention_mask:
+            attention_mask, position_ids = _get_attention_mask_and_position_ids(
+                past_key_values, valid_req_list
+            )
+            inf_kws["position_ids"] = position_ids
+            inf_kws["attention_mask"] = attention_mask
         out = model(
             input_ids=torch.as_tensor(decode_tokens, device=device),
             use_cache=True,
             past_key_values=past_key_values,
+            **inf_kws,
         )
         logits = out.logits
         past_key_values = out.past_key_values
@@ -846,7 +711,7 @@ def _batch_inference_one_step_internal(
                     r.last_output_length += len(output)
 
                     completion_chunk = _get_completion_chunk(
-                        output, r.finish_reason, model_uid, r, False
+                        output, r.chunk_id, r.finish_reason, model_uid, r, False
                     )
                     r.completion.append(completion_chunk)
                     if r.stopped:
@@ -859,7 +724,7 @@ def _batch_inference_one_step_internal(
                 if r.stopped and _i == decode_round - 1 and include_usage:
                     r.completion.append(
                         _get_completion_chunk(
-                            "", r.finish_reason, model_uid, r, True
+                            "", r.chunk_id, r.finish_reason, model_uid, r, True
                        )
                     )
             else:
@@ -878,7 +743,9 @@ def _batch_inference_one_step_internal(
                     if r not in output_mapping
                     else output_mapping[r]
                 )
-                completion = _get_completion(
+                completion = _get_completion(
+                    outputs, r.chunk_id, r.finish_reason, model_uid, r
+                )
                 r.completion = [completion]
 
     e_time = time.time()
@@ -894,12 +761,21 @@ def batch_inference_one_step(
     tokenizer,
     device,
     context_len: int,
+    stop_token_ids: Tuple[int],
+    require_attention_mask: bool = False,
 ):
     from ....core.model import OutOfMemoryError
 
     try:
         _batch_inference_one_step_internal(
-            req_list,
+            req_list,
+            model_uid,
+            model,
+            tokenizer,
+            device,
+            context_len,
+            stop_token_ids,
+            require_attention_mask=require_attention_mask,
         )
     except OutOfMemoryError:
         logger.exception(
@@ -911,4 +787,8 @@ def batch_inference_one_step(
         os._exit(1)
     except Exception as e:
         logger.exception(f"Internal error for batch inference: {e}.")
-        #
+        # If internal error happens, just skip all the requests in this batch.
+        # If not handle here, the client will hang.
+        for r in req_list:
+            r.stopped = True
+            r.error_msg = str(e)
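
The last hunk changes the failure mode of batch inference: on any unexpected exception, every request in the batch is marked stopped and carries the error message, so waiting clients are released instead of hanging. A small sketch of the same pattern, with a toy Request class standing in for InferenceRequest:

import logging
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)

@dataclass
class Request:
    prompt: str
    stopped: bool = False
    error_msg: str = ""
    completion: list = field(default_factory=list)

def run_batch(reqs, step):
    try:
        step(reqs)
    except Exception as e:
        logger.exception("Internal error for batch inference: %s.", e)
        # Mark every request finished with an error so callers stop waiting.
        for r in reqs:
            r.stopped = True
            r.error_msg = str(e)

def failing_step(reqs):
    raise RuntimeError("boom")

reqs = [Request("hi"), Request("hello")]
run_batch(reqs, failing_step)
print([(r.stopped, r.error_msg) for r in reqs])  # [(True, 'boom'), (True, 'boom')]
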
xinference/model/llm/utils.py
CHANGED
@@ -607,7 +607,7 @@ Begin!"""
         return arguments, None, None
 
     @staticmethod
-    def
+    def _eval_glm_chat_arguments(c, tools):
         if isinstance(c[0], str):
             return c[0], None, None
         return None, c[0]["name"], c[0]["parameters"]
@@ -659,9 +659,15 @@ Begin!"""
         family = model_family.model_family or model_family.model_name
         if family in ["gorilla-openfunctions-v1", "gorilla-openfunctions-v2"]:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
-        elif "chatglm3"
-            content, func, args = cls.
-        elif family in [
+        elif family in ["chatglm3", "glm4-chat"]:
+            content, func, args = cls._eval_glm_chat_arguments(c, tools)
+        elif family in [
+            "qwen-chat",
+            "qwen1.5-chat",
+            "qwen1.5-moe-chat",
+            "qwen2-instruct",
+            "qwen2-moe-instruct",
+        ]:
             content, func, args = cls._eval_qwen_chat_arguments(c, tools)
         else:
             raise Exception(
@@ -676,28 +682,35 @@ Begin!"""
         Generates a filter function for Qwen series models to retain outputs after "\nFinal Answer:".
 
         Returns:
-            A function that takes tokens (string output by the model so far) as input
-            returns
+            A function that takes tokens (string output by the model so far) and delta (new tokens added) as input,
+            returns the part after "\nFinal Answer:" if found, else returns delta.
         """
         family = model_family.model_family or model_family.model_name
-        if family in [
+        if family in [
+            "qwen-chat",
+            "qwen1.5-chat",
+            "qwen1.5-moe-chat",
+            "qwen2-instruct",
+            "qwen2-moe-instruct",
+        ]:
             # Encapsulating function to reset 'found' after each call
             found = False
 
-            def
+            def process_tokens(tokens: str, delta: str):
                 nonlocal found
                 # Once "Final Answer:" is found, future tokens are allowed.
                 if found:
-                    return
+                    return delta
                 # Check if the token ends with "\nFinal Answer:" and update `found`.
-
+                final_answer_idx = tokens.lower().rfind("\nfinal answer:")
+                if final_answer_idx != -1:
                     found = True
-
+                    return tokens[final_answer_idx + len("\nfinal answer:") :]
+                return ""
 
-            return
+            return process_tokens
         else:
-
-            return lambda tokens: True
+            return lambda tokens, delta: delta
 
     @classmethod
     def _tool_calls_completion(cls, model_family, model_uid, c, tools):
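
The rewritten filter receives both the accumulated text and the newest delta: while the Qwen ReAct thought/action preamble is still streaming it returns an empty string, once "\nFinal Answer:" appears it returns only what follows the marker, and afterwards it passes every delta straight through. A standalone sketch of the same closure, without xinference's model-family dispatch:

def build_final_answer_filter():
    found = False

    def process_tokens(tokens: str, delta: str) -> str:
        nonlocal found
        if found:
            return delta                      # after the marker, stream as-is
        idx = tokens.lower().rfind("\nfinal answer:")
        if idx != -1:
            found = True
            return tokens[idx + len("\nfinal answer:"):]
        return ""                             # still inside the thought process

    return process_tokens

f = build_final_answer_filter()
print(repr(f("Thought: I need a tool", " tool")))             # ''
print(repr(f("Thought: ...\nFinal Answer: 42", " 42")))       # ' 42'
print(repr(f("Thought: ...\nFinal Answer: 42 is", " is")))    # ' is'
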
xinference/model/llm/vllm/core.py
CHANGED
@@ -444,7 +444,9 @@ class VLLMModel(LLM):
                 _content, func, args = ChatModelMixin._eval_tool_arguments(
                     self.model_family, chunk, tools
                 )
-                choice["text"] =
+                choice["text"] = tools_token_filter(
+                    tokens=previous_texts[0], delta=choice_delta
+                )
                 if func is not None:
                     choice["text"] = None
                     choice["finish_reason"] = "tool_calls"
@@ -458,9 +460,13 @@ class VLLMModel(LLM):
                         ),
                     )
                 ]
-
-
-
+            else:
+                # use a filter function to skip Qwen's react thought process
+                choice["text"] = tools_token_filter(
+                    tokens=previous_texts[0], delta=choice["text"]
+                )
+                if not choice["text"]:
+                    continue
             prompt_tokens = len(_request_output.prompt_token_ids)
             completion_tokens = sum(
                 len(output.token_ids) for output in _request_output.outputs
xinference/model/rerank/core.py
CHANGED
@@ -23,7 +23,7 @@ import numpy as np
 
 from ...constants import XINFERENCE_CACHE_DIR
 from ...device_utils import empty_cache
-from ...types import Document, DocumentObj, Rerank
+from ...types import Document, DocumentObj, Rerank, RerankTokens
 from ..core import CacheableModelSpec, ModelDescription
 from ..utils import is_model_cached
 
@@ -121,11 +121,17 @@ class RerankModel:
         if model_spec.type == "unknown":
             model_spec.type = self._auto_detect_type(model_path)
 
+    @staticmethod
+    def _get_tokenizer(model_path):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        return tokenizer
+
     @staticmethod
     def _auto_detect_type(model_path):
         """This method may not be stable due to the fact that the tokenizer name may be changed.
         Therefore, we only use this method for unknown model types."""
-        from transformers import AutoTokenizer
 
         type_mapper = {
             "LlamaTokenizerFast": "LLM-based layerwise",
@@ -133,12 +139,13 @@ class RerankModel:
             "XLMRobertaTokenizerFast": "normal",
         }
 
-        tokenizer =
+        tokenizer = RerankModel._get_tokenizer(model_path)
         rerank_type = type_mapper.get(type(tokenizer).__name__)
         if rerank_type is None:
-
-            f"Can't determine the rerank type based on the tokenizer {tokenizer}"
+            logger.warning(
+                f"Can't determine the rerank type based on the tokenizer {tokenizer}, use normal type by default."
             )
+            return "normal"
         return rerank_type
 
     def load(self):
@@ -185,6 +192,7 @@ class RerankModel:
         top_n: Optional[int],
         max_chunks_per_doc: Optional[int],
         return_documents: Optional[bool],
+        return_len: Optional[bool],
         **kwargs,
     ) -> Rerank:
         self._counter += 1
@@ -223,7 +231,28 @@ class RerankModel:
             )
             for arg in sim_scores_argsort
         ]
-
+        if return_len:
+            tokenizer = self._get_tokenizer(self._model_path)
+            input_len = sum([len(tokenizer.tokenize(t)) for t in documents])
+
+            # Rerank Model output is just score or documents
+            # while return_documents = True
+            output_len = input_len
+
+        # api_version, billed_units, warnings
+        # is for Cohere API compatibility, set to None
+        metadata = {
+            "api_version": None,
+            "billed_units": None,
+            "tokens": (
+                RerankTokens(input_tokens=input_len, output_tokens=output_len)
+                if return_len
+                else None
+            ),
+            "warnings": None,
+        }
+
+        return Rerank(id=str(uuid.uuid1()), results=docs, meta=metadata)
 
 
 def get_cache_dir(model_spec: RerankModelSpec):
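
With return_len enabled, the rerank response now carries a Cohere-style meta block whose tokens field reports input and output token counts; because a rerank model only returns scores (plus, optionally, the documents themselves), output_tokens is reported equal to input_tokens. A hedged sketch of the counting step only, with a trivial whitespace tokenizer standing in for the model's real tokenizer:

def count_rerank_tokens(tokenizer, documents):
    input_tokens = sum(len(tokenizer.tokenize(d)) for d in documents)
    # scores-only output: report output tokens the same as input tokens
    return {"input_tokens": input_tokens, "output_tokens": input_tokens}

class WhitespaceTokenizer:
    def tokenize(self, text):
        return text.split()

print(count_rerank_tokens(WhitespaceTokenizer(), ["a first doc", "second doc"]))
# {'input_tokens': 5, 'output_tokens': 5}
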
xinference/model/utils.py
CHANGED
@@ -42,14 +42,20 @@ def is_locale_chinese_simplified() -> bool:
 
 
 def download_from_modelscope() -> bool:
-    if os.environ.get(XINFERENCE_ENV_MODEL_SRC)
-        return
+    if os.environ.get(XINFERENCE_ENV_MODEL_SRC):
+        return os.environ.get(XINFERENCE_ENV_MODEL_SRC) == "modelscope"
     elif is_locale_chinese_simplified():
         return True
     else:
         return False
 
 
+def download_from_csghub() -> bool:
+    if os.environ.get(XINFERENCE_ENV_MODEL_SRC) == "csghub":
+        return True
+    return False
+
+
 def symlink_local_file(path: str, local_dir: str, relpath: str) -> str:
     from huggingface_hub.file_download import _create_symlink
 
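
These helpers choose the download hub from the environment variable behind the XINFERENCE_ENV_MODEL_SRC constant (defined in xinference/constants.py), falling back to the simplified-Chinese locale check for ModelScope. A minimal sketch of the same dispatch; the variable name "XINFERENCE_MODEL_SRC" below is a stand-in for illustration, not a value taken from this diff:

import os

MODEL_SRC_ENV = "XINFERENCE_MODEL_SRC"  # assumed name, for illustration only

def download_from_modelscope_sketch(locale_is_zh_cn: bool) -> bool:
    src = os.environ.get(MODEL_SRC_ENV)
    if src:
        # an explicit setting always wins; only "modelscope" selects ModelScope
        return src == "modelscope"
    # otherwise fall back to the locale heuristic
    return locale_is_zh_cn

def download_from_csghub_sketch() -> bool:
    return os.environ.get(MODEL_SRC_ENV) == "csghub"

os.environ[MODEL_SRC_ENV] = "csghub"
print(download_from_modelscope_sketch(locale_is_zh_cn=True))  # False
print(download_from_csghub_sketch())                          # True
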
xinference/thirdparty/ChatTTS/experimental/__init__.py
File without changes
xinference/thirdparty/ChatTTS/experimental/llm.py
ADDED
@@ -0,0 +1,40 @@
+
+from openai import OpenAI
+
+prompt_dict = {
+    'kimi': [ {"role": "system", "content": "你是 Kimi,由 Moonshot AI 提供的人工智能助手,你更擅长中文和英文的对话。"},
+              {"role": "user", "content": "你好,请注意你现在生成的文字要按照人日常生活的口吻,你的回复将会后续用TTS模型转为语音,并且请把回答控制在100字以内。并且标点符号仅包含逗号和句号,将数字等转为文字回答。"},
+              {"role": "assistant", "content": "好的,我现在生成的文字将按照人日常生活的口吻, 并且我会把回答控制在一百字以内, 标点符号仅包含逗号和句号,将阿拉伯数字等转为中文文字回答。下面请开始对话。"},],
+    'deepseek': [
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "你好,请注意你现在生成的文字要按照人日常生活的口吻,你的回复将会后续用TTS模型转为语音,并且请把回答控制在100字以内。并且标点符号仅包含逗号和句号,将数字等转为文字回答。"},
+        {"role": "assistant", "content": "好的,我现在生成的文字将按照人日常生活的口吻, 并且我会把回答控制在一百字以内, 标点符号仅包含逗号和句号,将阿拉伯数字等转为中文文字回答。下面请开始对话。"},],
+    'deepseek_TN': [
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "你好,现在我们在处理TTS的文本输入,下面将会给你输入一段文本,请你将其中的阿拉伯数字等等转为文字表达,并且输出的文本里仅包含逗号和句号这两个标点符号"},
+        {"role": "assistant", "content": "好的,我现在对TTS的文本输入进行处理。这一般叫做text normalization。下面请输入"},
+        {"role": "user", "content": "We paid $123 for this desk."},
+        {"role": "assistant", "content": "We paid one hundred and twenty three dollars for this desk."},
+        {"role": "user", "content": "详询请拨打010-724654"},
+        {"role": "assistant", "content": "详询请拨打零幺零,七二四六五四"},
+        {"role": "user", "content": "罗森宣布将于7月24日退市,在华门店超6000家!"},
+        {"role": "assistant", "content": "罗森宣布将于七月二十四日退市,在华门店超过六千家。"},
+    ],
+}
+
+class llm_api:
+    def __init__(self, api_key, base_url, model):
+        self.client = OpenAI(
+            api_key = api_key,
+            base_url = base_url,
+        )
+        self.model = model
+    def call(self, user_question, temperature = 0.3, prompt_version='kimi', **kwargs):
+
+        completion = self.client.chat.completions.create(
+            model = self.model,
+            messages = prompt_dict[prompt_version]+[{"role": "user", "content": user_question},],
+            temperature = temperature,
+            **kwargs
+        )
+        return completion.choices[0].message.content

xinference/thirdparty/ChatTTS/infer/__init__.py
File without changes
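
The new ChatTTS helper wraps an OpenAI-compatible chat endpoint and prepends one of the canned prompt templates ('kimi', 'deepseek', or 'deepseek_TN' for text normalization) before the user's question. A hedged usage sketch; the import path is assumed from the file location, and the key, endpoint, and model name are placeholders, not values from the package:

# Assumed import path based on where the file lives in the wheel.
from xinference.thirdparty.ChatTTS.experimental.llm import llm_api

client = llm_api(
    api_key="sk-placeholder",                 # hypothetical key
    base_url="https://api.example.com/v1",    # any OpenAI-compatible endpoint
    model="my-chat-model",                    # hypothetical model name
)

# 'deepseek_TN' selects the text-normalization template, which rewrites digits
# and symbols into words so the output is TTS-friendly.
print(client.call("We paid $123 for this desk.", prompt_version="deepseek_TN"))
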