xinference-0.7.4-py3-none-any.whl → xinference-0.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (34)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +22 -8
  3. xinference/client/oscar/actor_client.py +78 -8
  4. xinference/client/restful/restful_client.py +86 -0
  5. xinference/core/model.py +14 -7
  6. xinference/core/supervisor.py +12 -0
  7. xinference/deploy/cmdline.py +16 -0
  8. xinference/deploy/test/test_cmdline.py +1 -0
  9. xinference/model/embedding/model_spec.json +40 -0
  10. xinference/model/llm/__init__.py +14 -1
  11. xinference/model/llm/llm_family.json +10 -1
  12. xinference/model/llm/llm_family.py +38 -2
  13. xinference/model/llm/llm_family_modelscope.json +10 -1
  14. xinference/model/llm/pytorch/chatglm.py +1 -0
  15. xinference/model/llm/pytorch/core.py +1 -1
  16. xinference/model/llm/pytorch/utils.py +50 -18
  17. xinference/model/llm/utils.py +2 -2
  18. xinference/model/llm/vllm/core.py +13 -4
  19. xinference/model/multimodal/core.py +1 -1
  20. xinference/model/multimodal/qwen_vl.py +34 -2
  21. xinference/web/ui/build/asset-manifest.json +3 -3
  22. xinference/web/ui/build/index.html +1 -1
  23. xinference/web/ui/build/static/js/{main.31d347d8.js → main.236e72e7.js} +3 -3
  24. xinference/web/ui/build/static/js/main.236e72e7.js.map +1 -0
  25. xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +1 -0
  26. {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/METADATA +9 -2
  27. {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/RECORD +32 -32
  28. xinference/web/ui/build/static/js/main.31d347d8.js.map +0 -1
  29. xinference/web/ui/node_modules/.cache/babel-loader/ca8515ecefb4a06c5305417bfd9c04e13cf6b9103f52a47c925921b26c0a9f9d.json +0 -1
  30. /xinference/web/ui/build/static/js/{main.31d347d8.js.LICENSE.txt → main.236e72e7.js.LICENSE.txt} +0 -0
  31. {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/LICENSE +0 -0
  32. {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/WHEEL +0 -0
  33. {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/entry_points.txt +0 -0
  34. {xinference-0.7.4.dist-info → xinference-0.7.5.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,7 @@ import os
 import platform
 import shutil
 from threading import Lock
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
 
 from pydantic import BaseModel, Field, Protocol, ValidationError, validator
 from pydantic.error_wrappers import ErrorWrapper
@@ -41,6 +41,8 @@ logger = logging.getLogger(__name__)
 
 DEFAULT_CONTEXT_LENGTH = 2048
 BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {}
+BUILTIN_LLM_MODEL_CHAT_FAMILIES: Set[str] = set()
+BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()
 
 
 class GgmlLLMSpecV1(BaseModel):
@@ -105,6 +107,8 @@ class LLMFamilyV1(BaseModel):
     model_lang: List[str]
     model_ability: List[Literal["embed", "generate", "chat"]]
     model_description: Optional[str]
+    # reason for not required str here: legacy registration
+    model_family: Optional[str]
     model_specs: List["LLMSpecV1"]
     prompt_style: Optional["PromptStyleV1"]
 
@@ -134,7 +138,39 @@ class CustomLLMFamilyV1(LLMFamilyV1):
            )
        except (ValueError, TypeError, UnicodeDecodeError) as e:
            raise ValidationError([ErrorWrapper(e, loc=ROOT_KEY)], cls)
-        llm_spec = cls.parse_obj(obj)
+        llm_spec: CustomLLMFamilyV1 = cls.parse_obj(obj)
+
+        # check model_family
+        if llm_spec.model_family is None:
+            raise ValueError(
+                f"You must specify `model_family` when registering custom LLM models."
+            )
+        assert isinstance(llm_spec.model_family, str)
+        if (
+            llm_spec.model_family != "other"
+            and "chat" in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_CHAT_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for chat model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_CHAT_FAMILIES))}"
+            )
+        if (
+            llm_spec.model_family != "other"
+            and "chat" not in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_GENERATE_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for generate model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_GENERATE_FAMILIES))}"
+            )
+        # set prompt style when it is the builtin model family
+        if (
+            llm_spec.prompt_style is None
+            and llm_spec.model_family != "other"
+            and "chat" in llm_spec.model_ability
+        ):
+            llm_spec.prompt_style = llm_spec.model_family
 
        # handle prompt style when user choose existing style
        if llm_spec.prompt_style is not None and isinstance(llm_spec.prompt_style, str):
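
Note: under the new validation, a custom registration must carry `model_family`. A minimal sketch of a spec that would now pass (all names and values below are illustrative, and it assumes "chatglm3" is collected into BUILTIN_LLM_MODEL_CHAT_FAMILIES):

    custom_family = {
        "version": 1,
        "model_name": "my-chatglm3-finetune",  # hypothetical
        "model_lang": ["en", "zh"],
        "model_ability": ["chat"],
        "model_family": "chatglm3",  # must be "other" or a builtin family name
        "model_specs": [],           # specs elided for brevity
    }
    # With a builtin chat family and no prompt_style, the family's builtin
    # prompt style is filled in automatically; an unknown family name would
    # raise ValueError during parse_raw.
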
@@ -331,6 +331,15 @@
         "roles": [
           "user",
           "assistant"
+        ],
+        "stop_token_ids": [
+          64795,
+          64797,
+          2
+        ],
+        "stop":[
+          "<|user|>",
+          "<|observation|>"
         ]
       }
     },
@@ -357,7 +366,7 @@
       ],
       "model_hub": "modelscope",
       "model_id": "ZhipuAI/chatglm3-6b-32k",
-      "model_revision": "v1.0.0"
+      "model_revision": "master"
     }
   ],
   "prompt_style": {
@@ -58,6 +58,7 @@ class ChatglmPytorchChatModel(PytorchChatModel):
        tokenizer = AutoTokenizer.from_pretrained(
            self.model_path,
            trust_remote_code=kwargs["trust_remote_code"],
+            encode_special_tokens=True,
            revision=kwargs["revision"],
        )
        model = AutoModel.from_pretrained(
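
Note: `encode_special_tokens=True` is a ChatGLM3 tokenizer option that makes markers such as `<|user|>` encode to single special-token ids instead of ordinary text pieces, which is presumably what lets the new stop_token_ids take effect. A hedged illustration (ids are assumptions based on the chatglm3-6b tokenizer):

    tok = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True, encode_special_tokens=True
    )
    # Expected: the encoding of "<|user|>" now contains the single id 64795,
    # matching the stop_token_ids added to llm_family.json above.
    print(tok.encode("<|user|>"))
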
@@ -409,7 +409,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
    ) -> PytorchGenerateConfig:
        generate_config = super()._sanitize_generate_config(generate_config)
        if (
-            generate_config.get("stop", None) is None
+            (not generate_config.get("stop"))
            and self.model_family.prompt_style
            and self.model_family.prompt_style.stop
        ):
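
Note: the switch from `is None` to a falsiness check matters when a caller passes an explicitly empty stop list:

    generate_config = {"stop": []}
    generate_config.get("stop", None) is None  # False: old check skipped the family default
    not generate_config.get("stop")            # True: new check falls back to prompt_style.stop
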
@@ -527,10 +527,12 @@ def generate_stream_chatglm(
    top_p = float(generate_config.get("top_p", 1.0))
    max_new_tokens = int(generate_config.get("max_tokens", 256))
    echo = generate_config.get("echo", False)
+    stop_str = generate_config.get("stop", None)
+    eos_token_id = generate_config.get("stop_token_ids", [])
+    eos_token_id.append(tokenizer.eos_token_id)
 
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    input_echo_len = len(inputs["input_ids"][0])
-
    gen_kwargs = {
        "max_length": max_new_tokens + input_echo_len,
        "do_sample": True if temperature > 1e-5 else False,
@@ -543,7 +545,9 @@
 
    total_len = 0
    last_response_length = 0
-    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
+    for total_ids in model.stream_generate(
+        **inputs, eos_token_id=eos_token_id, **gen_kwargs
+    ):
        total_ids = total_ids.tolist()[0]
        total_len = len(total_ids)
        if echo:
@@ -553,29 +557,57 @@
        response = tokenizer.decode(output_ids)
        response = process_response(response)
 
+        partially_stopped = False
+        stopped = False
+        if stop_str:
+            if isinstance(stop_str, str):
+                pos = response.rfind(stop_str, 0)
+                if pos != -1:
+                    response = response[:pos]
+                    stopped = True
+                else:
+                    partially_stopped = is_partial_stop(response, stop_str)
+            elif isinstance(stop_str, Iterable):
+                for each_stop in stop_str:
+                    pos = response.rfind(each_stop, 0)
+                    if pos != -1:
+                        response = response[:pos]
+                        stopped = True
+                        break
+                    else:
+                        partially_stopped = is_partial_stop(response, each_stop)
+                        if partially_stopped:
+                            break
+            else:
+                raise ValueError("Invalid stop field type.")
+
        if stream:
            response = response.strip("�")
            tmp_response_length = len(response)
            response = response[last_response_length:]
            last_response_length = tmp_response_length
 
-        completion_choice = CompletionChoice(
-            text=response, index=0, logprobs=None, finish_reason=None
-        )
-        completion_chunk = CompletionChunk(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
-            prompt_tokens=input_echo_len,
-            completion_tokens=(total_len - input_echo_len),
-            total_tokens=total_len,
-        )
+        if not partially_stopped:
+            completion_choice = CompletionChoice(
+                text=response, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=input_echo_len,
+                completion_tokens=(total_len - input_echo_len),
+                total_tokens=total_len,
+            )
+
+            yield completion_chunk, completion_usage
 
-        yield completion_chunk, completion_usage
+        if stopped:
+            break
 
    if total_len - input_echo_len == max_new_tokens - 1:
        finish_reason = "length"
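
Note: `is_partial_stop` is not shown in this diff; its expected role is to detect when the tail of the streamed text could still grow into a stop string, so that fragment is held back until the next iteration. A minimal sketch under that assumption (not necessarily the actual implementation):

    def is_partial_stop(output: str, stop_str: str) -> bool:
        # True if some suffix of `output` is a prefix of `stop_str`,
        # i.e. upcoming tokens might complete the stop string.
        for i in range(min(len(output), len(stop_str)), 0, -1):
            if stop_str.startswith(output[-i:]):
                return True
        return False
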
@@ -141,7 +141,7 @@ class ChatModelMixin:
            return ret
        elif prompt_style.style_name == "CHATGLM3":
            prompts = (
-                [f"<|system|>\n{prompt_style.system_prompt}"]
+                [f"<|system|>\n {prompt_style.system_prompt}"]
                if prompt_style.system_prompt
                else []
            )
@@ -155,7 +155,7 @@ class ChatModelMixin:
                if content:
                    if role == "tool":
                        role = "observation"
-                    prompts.append(f"<|{role}|>\n{content}")
+                    prompts.append(f"<|{role}|>\n {content}")
                else:
                    prompts.append(f"<|{role}|>")
            return "\n".join(prompts)
@@ -37,6 +37,7 @@ from ....types import (
    CompletionUsage,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..llm_family import CustomLLMFamilyV1
 from ..utils import ChatModelMixin
 
 logger = logging.getLogger(__name__)
@@ -197,8 +198,12 @@ class VLLMModel(LLM):
        # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
        if "4" not in quantization:
            return False
-        if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
+                return False
        if "generate" not in llm_family.model_ability:
            return False
        return VLLM_INSTALLED
@@ -329,8 +334,12 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
        # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
        if "4" not in quantization:
            return False
-        if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
        if "chat" not in llm_family.model_ability:
            return False
        return VLLM_INSTALLED
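
Note: the effect of the two `isinstance` branches is that a custom registration is matched against the vLLM whitelists by its `model_family` rather than its arbitrary `model_name`. A sketch of the dispatch (names illustrative):

    def vllm_match_key(llm_family):
        # Custom families match on model_family, builtin ones on model_name,
        # so a fine-tune registered with model_family="llama-2-chat" can
        # still be served by the vLLM backend.
        if isinstance(llm_family, CustomLLMFamilyV1):
            return llm_family.model_family
        return llm_family.model_name
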
@@ -96,7 +96,7 @@ class LVLMDescription(ModelDescription):
 
    def to_dict(self):
        return {
-            "model_type": "LVLM",
+            "model_type": "multimodal",
            "address": self.address,
            "accelerators": self.devices,
            "model_name": self._model_family.model_name,
@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import base64
+import logging
 import operator
+import tempfile
 import time
 import uuid
 from typing import Dict, Iterator, List, Optional, Union
+from urllib.parse import urlparse
 
 from ...types import (
    ChatCompletion,
@@ -26,6 +29,8 @@ from ...types import (
 )
 from ..utils import select_device
 from .core import LVLM, LVLMFamilyV1, LVLMSpecV1
+logger = logging.getLogger(__name__)
+
 
 class QwenVLChat(LVLM):
    def __init__(self, *args, **kwargs):
@@ -67,9 +72,32 @@ class QwenVLChat(LVLM):
        )
 
    def _message_content_to_qwen(self, content) -> str:
+        def _ensure_url(_url):
+            try:
+                if _url.startswith("data:"):
+                    raise "Not a valid url."
+                parsed = urlparse(_url)
+                if not parsed.scheme:
+                    raise "Not a valid url."
+                return _url
+            except Exception:
+                logging.info("Parse url by base64 decoder.")
+                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
+                # e.g. f"data:image/jpeg;base64,{base64_image}"
+                _type, data = _url.split(";")
+                _, ext = _type.split("/")
+                data = data[len("base64,") :]
+                data = base64.b64decode(data.encode("utf-8"))
+
+                with tempfile.NamedTemporaryFile(suffix=f".{ext}", delete=False) as f:
+                    f.write(data)
+                logging.info("Dump base64 data to %s", f.name)
+                return f.name
+
        if not isinstance(content, str):
+            # TODO(codingl2k1): Optimize _ensure_url
            content = [
-                {"image": c["image_url"]["url"], "type": "image"}
+                {"image": _ensure_url(c["image_url"]["url"]), "type": "image"}
                if c.get("type") == "image_url"
                else c
                for c in content
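
Note: `_ensure_url` accepts either a fetchable URL or an OpenAI-style base64 data URL, dumping the latter to a temp file for the Qwen-VL tokenizer. A sketch of building such a data URL on the caller's side (file name is made up):

    import base64

    with open("example.jpeg", "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    image_url = f"data:image/jpeg;base64,{b64}"
    # _ensure_url(image_url)                      -> path of a temp .jpeg file
    # _ensure_url("https://example.com/cat.jpg")  -> returned unchanged
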
@@ -85,6 +113,10 @@ class QwenVLChat(LVLM):
        chat_history: Optional[List[Dict]] = None,
        generate_config: Optional[Dict] = None,
    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        if generate_config and generate_config.get("stream"):
+            raise Exception(
+                f"Chat with model {self.model_family.model_name} does not support stream."
+            )
        prompt = self._message_content_to_qwen(prompt)
        # Convert openai history to qwen vl history
        qwen_history = []
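
Note: with the new guard, chat requests against qwen-vl-chat must not ask for streaming. A hedged sketch of the caller-side shape (model handle and content are made up):

    messages_content = [
        {"type": "text", "text": "What is in this image?"},
        {"type": "image_url", "image_url": {"url": image_url}},
    ]
    model.chat(prompt=messages_content, generate_config={})                # ok
    model.chat(prompt=messages_content, generate_config={"stream": True})  # raises
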
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.31d347d8.js",
+    "main.js": "./static/js/main.236e72e7.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.31d347d8.js.map": "./static/js/main.31d347d8.js.map"
+    "main.236e72e7.js.map": "./static/js/main.236e72e7.js.map"
   },
   "entrypoints": [
-    "static/js/main.31d347d8.js"
+    "static/js/main.236e72e7.js"
   ]
 }
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.31d347d8.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.236e72e7.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>