xinference 0.15.4__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

This release of xinference has been flagged as potentially problematic.

Files changed (38)
  1. xinference/__init__.py +0 -4
  2. xinference/_version.py +3 -3
  3. xinference/constants.py +4 -4
  4. xinference/core/model.py +89 -18
  5. xinference/core/scheduler.py +10 -7
  6. xinference/core/utils.py +9 -0
  7. xinference/deploy/supervisor.py +4 -0
  8. xinference/model/__init__.py +4 -0
  9. xinference/model/image/scheduler/__init__.py +13 -0
  10. xinference/model/image/scheduler/flux.py +533 -0
  11. xinference/model/image/stable_diffusion/core.py +6 -31
  12. xinference/model/image/utils.py +39 -3
  13. xinference/model/llm/__init__.py +2 -0
  14. xinference/model/llm/llm_family.json +169 -1
  15. xinference/model/llm/llm_family_modelscope.json +108 -0
  16. xinference/model/llm/transformers/chatglm.py +104 -0
  17. xinference/model/llm/transformers/core.py +37 -111
  18. xinference/model/llm/transformers/deepseek_v2.py +0 -226
  19. xinference/model/llm/transformers/internlm2.py +3 -95
  20. xinference/model/llm/transformers/opt.py +68 -0
  21. xinference/model/llm/transformers/utils.py +4 -284
  22. xinference/model/llm/utils.py +2 -2
  23. xinference/model/llm/vllm/core.py +16 -1
  24. xinference/utils.py +2 -3
  25. xinference/web/ui/build/asset-manifest.json +3 -3
  26. xinference/web/ui/build/index.html +1 -1
  27. xinference/web/ui/build/static/js/{main.e51a356d.js → main.f7da0140.js} +3 -3
  28. xinference/web/ui/build/static/js/main.f7da0140.js.map +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/331312668fa8bd3d7401818f4a25fa98135d7f61371cd6bfff78b18cf4fbdd92.json +1 -0
  30. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/METADATA +36 -4
  31. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/RECORD +36 -33
  32. xinference/web/ui/build/static/js/main.e51a356d.js.map +0 -1
  33. xinference/web/ui/node_modules/.cache/babel-loader/4385c1095eefbff0a8ec3b2964ba6e5a66a05ab31be721483ca2f43e2a91f6ff.json +0 -1
  34. /xinference/web/ui/build/static/js/{main.e51a356d.js.LICENSE.txt → main.f7da0140.js.LICENSE.txt} +0 -0
  35. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/LICENSE +0 -0
  36. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/WHEEL +0 -0
  37. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/entry_points.txt +0 -0
  38. {xinference-0.15.4.dist-info → xinference-0.16.0.dist-info}/top_level.txt +0 -0
xinference/model/llm/transformers/deepseek_v2.py

@@ -12,24 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import uuid
-from typing import Dict, Iterator, List, Optional, Union
 
 import torch
 
-from ....types import (
-    ChatCompletion,
-    ChatCompletionChunk,
-    Completion,
-    CompletionChunk,
-    PytorchGenerateConfig,
-)
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import (
-    generate_chat_completion,
-    generate_completion,
-    generate_completion_chunk,
-)
 from .core import PytorchChatModel, PytorchModel
 
 logger = logging.getLogger(__name__)
@@ -80,95 +66,6 @@ class DeepSeekV2PytorchModel(PytorchModel):
             return False
         return True
 
-    def generate(
-        self, prompt: str, generate_config: Optional[PytorchGenerateConfig] = None
-    ) -> Union[Completion, Iterator[CompletionChunk]]:
-        input_tensor = self._tokenizer(prompt, return_tensors="pt")
-        generate_config = self._sanitize_generate_config(generate_config)
-        default_generate_config = self._model.generation_config
-        generate_kwargs = {
-            "input_ids": input_tensor["input_ids"].cuda(),
-            "attention_mask": input_tensor["attention_mask"].cuda(),
-            "temperature": float(
-                generate_config.get("temperature", default_generate_config.temperature)
-            ),
-            "repetition_penalty": float(generate_config.get("repetition_penalty", 1.0)),
-            "top_p": float(generate_config.get("top_p", default_generate_config.top_p)),
-            "top_k": int(generate_config.get("top_k", -1)),
-            "max_new_tokens": generate_config.get("max_tokens", 512),
-            "bos_token_id": default_generate_config.bos_token_id,
-            "do_sample": default_generate_config.do_sample,
-            "eos_token_id": default_generate_config.eos_token_id,
-        }
-
-        stream = generate_config.get("stream", False)
-        if stream:
-            return self._generate_stream(generate_kwargs, input_tensor)
-        else:
-            return self._generate(generate_kwargs, input_tensor)
-
-    def _generate(self, generate_kwargs, input_ids) -> Completion:
-        prompt_tokens = len(input_ids[0])
-        logger.info(f"generate_kwargs:{generate_kwargs}")
-        generation_output = self._model.generate(**generate_kwargs)
-        completion_tokens = len(generation_output[0])
-        response = self._tokenizer.decode(
-            generation_output[0], skip_special_tokens=True
-        )
-        return generate_completion(
-            self.model_uid,
-            response,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
-
-    def _generate_stream(self, generate_kwargs, input_ids):
-        from threading import Thread
-
-        from transformers import TextIteratorStreamer
-
-        # Initialize the streamer
-        streamer = TextIteratorStreamer(
-            self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
-        )
-        # Define the generation configuration
-        generate_kwargs["streamer"] = streamer
-        # Start the model chat in a separate thread
-        thread = Thread(
-            target=self._model.generate,
-            kwargs=generate_kwargs,
-        )
-        thread.start()
-
-        completion_id = str(uuid.uuid1())
-        prompt_tokens = len(input_ids[0])
-        total_tokens, completion_tokens = 0, 0
-        # Loop through the streamer to get the new text as it is generated
-        for i, new_text in enumerate(streamer):
-            completion_tokens = i
-            total_tokens = prompt_tokens + completion_tokens
-            yield generate_completion_chunk(
-                chunk_text=new_text,
-                finish_reason=None,
-                chunk_id=completion_id,
-                model_uid=self.model_uid,
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-            )
-        yield generate_completion_chunk(
-            chunk_text=None,
-            finish_reason="stop",
-            chunk_id=completion_id,
-            model_uid=self.model_uid,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=total_tokens,
-            has_choice=True,
-            has_content=False,
-        )
-
 
 class DeepSeekV2PytorchChatModel(PytorchChatModel):
     def _load_model(self, **kwargs):
@@ -215,126 +112,3 @@ class DeepSeekV2PytorchChatModel(PytorchChatModel):
         if "chat" not in llm_family.model_ability:
             return False
         return True
-
-    def chat(
-        self,
-        messages: List[Dict],
-        generate_config: Optional[PytorchGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        assert self.model_family.chat_template is not None
-        full_prompt = self.get_full_context(
-            messages,
-            self.model_family.chat_template,
-            tokenizer=self._tokenizer,
-        )
-        input_tensor = self._tokenizer.encode(
-            full_prompt,
-            padding=False,
-            truncation=False,
-            max_length=None,
-            add_special_tokens=False,
-            return_tensors="pt",
-        )
-
-        generate_config = self._sanitize_generate_config(generate_config)
-        default_generate_config = self._model.generation_config
-        generate_kwargs = {
-            "input_ids": input_tensor.cuda(),
-            "temperature": float(
-                generate_config.get("temperature", default_generate_config.temperature)
-            ),
-            "repetition_penalty": float(generate_config.get("repetition_penalty", 1.0)),
-            "top_p": float(generate_config.get("top_p", default_generate_config.top_p)),
-            "top_k": int(generate_config.get("top_k", -1)),
-            "max_new_tokens": generate_config.get("max_tokens", 512),
-            "bos_token_id": default_generate_config.bos_token_id,
-            "do_sample": default_generate_config.do_sample,
-            "eos_token_id": default_generate_config.eos_token_id,
-        }
-
-        stream = generate_config.get("stream", False)
-        stream_options = generate_config.get("stream_options", None)
-        include_usage = (
-            stream_options["include_usage"]
-            if isinstance(stream_options, dict)
-            else False
-        )
-        if stream:
-            chunk = self._generate_stream(generate_kwargs, input_tensor, include_usage)
-            return self._to_chat_completion_chunks(chunk)
-        else:
-            return self._generate(generate_kwargs, input_tensor)
-
-    def _generate(self, generate_kwargs, input_ids) -> ChatCompletion:
-        prompt_tokens = len(input_ids[0])
-        generation_output = self._model.generate(**generate_kwargs)
-        completion_tokens = len(generation_output[0])
-        response = self._tokenizer.decode(
-            generation_output[0][input_ids.shape[1] :], skip_special_tokens=True
-        )
-        return generate_chat_completion(
-            self.model_uid,
-            response,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=prompt_tokens + completion_tokens,
-        )
-
-    def _generate_stream(self, generate_kwargs, input_ids, include_usage):
-        from threading import Thread
-
-        from transformers import TextIteratorStreamer
-
-        # Initialize the streamer
-        streamer = TextIteratorStreamer(
-            self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
-        )
-        # Define the generation configuration
-        generate_kwargs["streamer"] = streamer
-        # Start the model chat in a separate thread
-        thread = Thread(
-            target=self._model.generate,
-            kwargs=generate_kwargs,
-        )
-        thread.start()
-
-        completion_id = str(uuid.uuid1())
-        prompt_tokens = len(input_ids[0])
-        total_tokens, completion_tokens = 0, 0
-        # Loop through the streamer to get the new text as it is generated
-        for i, new_text in enumerate(streamer):
-            completion_tokens = max(completion_tokens, len(streamer.token_cache))
-            total_tokens = prompt_tokens + completion_tokens
-            yield generate_completion_chunk(
-                chunk_text=new_text,
-                finish_reason=None,
-                chunk_id=completion_id,
-                model_uid=self.model_uid,
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-            )
-        yield generate_completion_chunk(
-            chunk_text=None,
-            finish_reason="stop",
-            chunk_id=completion_id,
-            model_uid=self.model_uid,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=total_tokens,
-            has_choice=True,
-            has_content=False,
-        )
-
-        if include_usage:
-            yield generate_completion_chunk(
-                chunk_text=None,
-                finish_reason=None,
-                chunk_id=completion_id,
-                model_uid=self.model_uid,
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-                has_choice=False,
-                has_content=False,
-            )
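Note: both removed `_generate_stream` paths above are instances of the standard transformers threaded-streamer recipe. A minimal, self-contained sketch of that recipe follows; the function and variable names are illustrative and not taken from the package, and `model`/`tokenizer` are assumed to be an already loaded transformers causal LM and its tokenizer.

```python
# Minimal sketch of the threaded TextIteratorStreamer pattern used by the
# removed streaming methods (illustrative, not xinference's implementation).
from threading import Thread

from transformers import TextIteratorStreamer


def stream_generate(model, tokenizer, prompt: str, **generate_kwargs):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
    )
    # model.generate blocks until generation finishes, so it runs in a
    # background thread while the caller consumes decoded text pieces.
    thread = Thread(
        target=model.generate,
        kwargs={**inputs, "streamer": streamer, **generate_kwargs},
    )
    thread.start()
    for new_text in streamer:
        yield new_text
    thread.join()
```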
xinference/model/llm/transformers/internlm2.py

@@ -11,13 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import uuid
-from typing import Any, Dict, Iterator, List, Optional, Union
+
+from typing import List, Optional
 
 from ....core.scheduler import InferenceRequest
-from ....types import ChatCompletion, ChatCompletionChunk, LoRA, PytorchGenerateConfig
+from ....types import LoRA
 from ..llm_family import LLMFamilyV1, LLMSpecV1
-from ..utils import generate_chat_completion, generate_completion_chunk, parse_messages
 from .core import PytorchChatModel, PytorchModelConfig
 
 
@@ -93,94 +92,3 @@ class Internlm2PytorchChatModel(PytorchChatModel):
         if top_p is None:
             raw_config["top_p"] = 0.8
         return raw_config
-
-    def chat(
-        self,
-        messages: List[Dict],
-        generate_config: Optional[PytorchGenerateConfig] = None,
-    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
-        kwargs: Dict[str, Any] = {}
-        generate_config = generate_config or {}
-        temperature = generate_config.get("temperature")
-        if temperature is not None:
-            kwargs["temperature"] = float(temperature)
-        top_p = generate_config.get("top_p")
-        if top_p is not None:
-            kwargs["top_p"] = float(top_p)
-        max_new_tokens = generate_config.get("max_tokens")
-        if max_new_tokens is not None:
-            kwargs["max_length"] = int(max_new_tokens)
-
-        stream = generate_config.get("stream", False)
-        stream_options = generate_config.pop("stream_options", None)
-        include_usage = (
-            stream_options["include_usage"]
-            if isinstance(stream_options, dict)
-            else False
-        )
-
-        prompt, system_prompt, chat_history = parse_messages(messages)
-        if chat_history:
-            input_history = [
-                (chat_history[i]["content"], (chat_history[i + 1]["content"]))
-                for i in range(0, len(chat_history), 2)
-            ]
-        else:
-            input_history = []
-        if system_prompt:
-            kwargs["meta_instruction"] = system_prompt
-        if stream:
-
-            def _stream_generator():
-                last_chunk_text_length = 0
-                chunk_id = "chat-" + str(uuid.uuid1())
-                prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
-                inputs = self._tokenizer([prompt], return_tensors="pt")
-                inputs = inputs.to(self._model.device)
-                prompt_tokens = len(inputs["input_ids"][0])
-                for chunk_text, _ in self._model.stream_chat(
-                    self._tokenizer, prompt, input_history, **kwargs
-                ):
-                    completion_tokens = completion_tokens + 1
-                    total_tokens = prompt_tokens + completion_tokens
-                    chunk_text = chunk_text[last_chunk_text_length:]
-                    last_chunk_text_length += len(chunk_text)
-
-                    yield generate_completion_chunk(
-                        chunk_text,
-                        finish_reason=None,
-                        chunk_id=chunk_id,
-                        model_uid=self.model_uid,
-                        prompt_tokens=prompt_tokens,
-                        completion_tokens=completion_tokens,
-                        total_tokens=total_tokens,
-                    )
-                yield generate_completion_chunk(
-                    None,
-                    finish_reason="stop",
-                    chunk_id=chunk_id,
-                    model_uid=self.model_uid,
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    total_tokens=total_tokens,
-                    has_choice=True,
-                    has_content=False,
-                )
-                if include_usage:
-                    yield generate_completion_chunk(
-                        None,
-                        finish_reason=None,
-                        chunk_id=chunk_id,
-                        model_uid=self.model_uid,
-                        prompt_tokens=prompt_tokens,
-                        completion_tokens=completion_tokens,
-                        total_tokens=total_tokens,
-                        has_choice=False,
-                    )
-
-            return self._to_chat_completion_chunks(_stream_generator())
-        else:
-            response, _ = self._model.chat(
-                self._tokenizer, prompt, input_history, **kwargs
-            )
-            return generate_chat_completion(self.model_uid, response)
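The removed `chat` method wrapped InternLM2's own remote-code `chat`/`stream_chat` helpers, which expect history as (query, response) tuples. A compact, illustrative sketch of that pairing step (assuming strictly alternating user/assistant turns, as the removed comprehension did):

```python
# Illustrative only: convert flat alternating chat messages into the
# (query, response) history tuples expected by InternLM2's chat helpers,
# mirroring what the removed chat() built from parse_messages output.
from typing import Dict, List, Tuple


def to_history(chat_history: List[Dict[str, str]]) -> List[Tuple[str, str]]:
    return [
        (chat_history[i]["content"], chat_history[i + 1]["content"])
        for i in range(0, len(chat_history), 2)
    ]
```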
xinference/model/llm/transformers/opt.py

@@ -0,0 +1,68 @@
+# Copyright 2022-2024 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from builtins import classmethod
+from typing import List, Optional
+
+from ....core.scheduler import InferenceRequest
+from ....types import LoRA
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchModel, PytorchModelConfig
+
+
+class OptPytorchModel(PytorchModel):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        pytorch_model_config: Optional[PytorchModelConfig] = None,
+        peft_model: Optional[List[LoRA]] = None,
+    ):
+        super().__init__(
+            model_uid,
+            model_family,
+            model_spec,
+            quantization,
+            model_path,
+            pytorch_model_config=pytorch_model_config,
+            peft_model=peft_model,
+        )
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format != "pytorch":
+            return False
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family != "opt":
+            return False
+        return True
+
+    def build_prefill_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        """
+        Mainly for UT.
+        Transformers code in `main` branch supports `position_ids` parameter (https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#L1076),
+        while in release branch, it doesn't (https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/opt/modeling_opt.py#L886).
+        """
+        return None
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
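The two overrides above return None because, as the docstring notes, released transformers builds of OPT do not accept a `position_ids` argument. For context, a position-ids builder for a model whose forward does accept the argument would typically produce something like the following; this is an illustrative sketch, not the package's base-class implementation, and it ignores padding offsets for brevity.

```python
# Illustrative sketch: the kind of tensor a prefill position-ids builder
# yields for models whose forward() accepts `position_ids`. The OPT subclass
# above opts out by returning None instead.
import torch


def build_prefill_position_ids_example(batch_size: int, seq_length: int) -> torch.Tensor:
    # positions 0 .. seq_length - 1, repeated for every request in the batch
    return (
        torch.arange(seq_length, dtype=torch.long)
        .unsqueeze(0)
        .repeat(batch_size, 1)
    )
```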