xinference 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. The information is provided for informational purposes only.

Potentially problematic release: this version of xinference might be problematic.

Files changed (56)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +30 -0
  3. xinference/client/restful/restful_client.py +29 -0
  4. xinference/core/cache_tracker.py +12 -1
  5. xinference/core/chat_interface.py +10 -4
  6. xinference/core/model.py +2 -2
  7. xinference/core/supervisor.py +30 -2
  8. xinference/core/utils.py +12 -0
  9. xinference/core/worker.py +4 -1
  10. xinference/deploy/cmdline.py +126 -0
  11. xinference/deploy/test/test_cmdline.py +24 -0
  12. xinference/fields.py +3 -1
  13. xinference/model/llm/__init__.py +2 -0
  14. xinference/model/llm/ggml/chatglm.py +98 -13
  15. xinference/model/llm/ggml/llamacpp.py +49 -2
  16. xinference/model/llm/llm_family.json +633 -9
  17. xinference/model/llm/llm_family.py +84 -10
  18. xinference/model/llm/llm_family_modelscope.json +337 -10
  19. xinference/model/llm/memory.py +332 -0
  20. xinference/model/llm/pytorch/chatglm.py +48 -0
  21. xinference/model/llm/pytorch/core.py +25 -6
  22. xinference/model/llm/pytorch/deepseek_vl.py +35 -9
  23. xinference/model/llm/pytorch/intern_vl.py +387 -0
  24. xinference/model/llm/pytorch/internlm2.py +32 -1
  25. xinference/model/llm/pytorch/qwen_vl.py +38 -11
  26. xinference/model/llm/pytorch/utils.py +38 -1
  27. xinference/model/llm/pytorch/yi_vl.py +42 -14
  28. xinference/model/llm/sglang/core.py +31 -9
  29. xinference/model/llm/utils.py +38 -5
  30. xinference/model/llm/vllm/core.py +87 -5
  31. xinference/model/rerank/core.py +23 -1
  32. xinference/model/utils.py +17 -7
  33. xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
  34. xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
  35. xinference/thirdparty/llava/mm_utils.py +3 -2
  36. xinference/thirdparty/llava/model/llava_arch.py +1 -1
  37. xinference/thirdparty/omnilmm/chat.py +6 -5
  38. xinference/types.py +10 -1
  39. xinference/web/ui/build/asset-manifest.json +3 -3
  40. xinference/web/ui/build/index.html +1 -1
  41. xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
  42. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  46. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/METADATA +10 -8
  47. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/RECORD +52 -50
  48. xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
  49. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  50. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  51. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
  52. /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
  53. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
  54. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
  55. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
  56. {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/ggml/chatglm.py

@@ -108,10 +108,11 @@ class ChatglmCppChatModel(LLM):
 
     @staticmethod
     def _convert_raw_text_chunks_to_chat(
-        tokens: Iterator[Any], model_name: str
+        tokens: Iterator[Any], model_name: str, include_usage: bool, input_ids
     ) -> Iterator[ChatCompletionChunk]:
+        request_id = str(uuid.uuid4())
         yield {
-            "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
+            "id": "chat" + f"cmpl-{request_id}",
             "model": model_name,
             "object": "chat.completion.chunk",
             "created": int(time.time()),
@@ -125,9 +126,13 @@ class ChatglmCppChatModel(LLM):
                 }
             ],
         }
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
         for token in tokens:
+            prompt_tokens = len(input_ids)
+            completion_tokens = completion_tokens + 1
+            total_tokens = prompt_tokens + completion_tokens
             yield {
-                "id": "chat" + f"cmpl-{str(uuid.uuid4())}",
+                "id": "chat" + f"cmpl-{request_id}",
                 "model": model_name,
                 "object": "chat.completion.chunk",
                 "created": int(time.time()),
@@ -143,6 +148,35 @@ class ChatglmCppChatModel(LLM):
                     }
                 ],
             }
+        # stop
+        yield {
+            "id": "chat" + f"cmpl-{request_id}",
+            "model": model_name,
+            "object": "chat.completion.chunk",
+            "created": int(time.time()),
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "",
+                    },
+                    "finish_reason": "stop",
+                }
+            ],
+        }
+        if include_usage:
+            yield {
+                "id": "chat" + f"cmpl-{request_id}",
+                "model": model_name,
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "choices": [],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                },
+            }
 
     @classmethod
     def _convert_raw_text_completion_to_chat(
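For orientation, the converter above now gives every stream a fixed request id, a terminating chunk with an empty delta and finish_reason "stop", and, only when include_usage is requested, one final usage-only chunk. The shapes below are illustrative placeholders (id, model uid, timestamp, and token counts are made up), not output copied from the code:

# Illustrative placeholders only; not emitted verbatim by the code above.
stop_chunk = {
    "id": "chatcmpl-<uuid4>",            # same request_id on every chunk of one stream
    "model": "my-chatglm-model",         # hypothetical model uid
    "object": "chat.completion.chunk",
    "created": 1715000000,
    "choices": [
        {"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}
    ],
}

usage_chunk = {                          # only yielded when include_usage is True
    "id": "chatcmpl-<uuid4>",
    "model": "my-chatglm-model",
    "object": "chat.completion.chunk",
    "created": 1715000000,
    "choices": [],                       # no delta, just the token accounting
    "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46},
}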
@@ -273,7 +307,7 @@ class ChatglmCppChatModel(LLM):
 
         params = {
             "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
+            "max_context_length": generate_config.get("max_tokens", 1024),
             "top_k": generate_config.get("top_k"),
             "top_p": generate_config.get("top_p"),
             "temperature": generate_config.get("temperature"),
@@ -286,13 +320,27 @@ class ChatglmCppChatModel(LLM):
         assert self._llm is not None
         chat_history_messages = self._to_chatglm_chat_messages(chat_history_list)
 
-        if generate_config["stream"]:
+        stream = generate_config.get("stream")
+        stream_options = generate_config.get("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+
+        if stream:
             it = self._llm.chat(
                 chat_history_messages,
                 **params,
             )
             assert not isinstance(it, str)
-            return self._convert_raw_text_chunks_to_chat(it, self.model_uid)
+            input_ids = self._llm.tokenizer.encode_messages(
+                chat_history_messages, params["max_context_length"]
+            )
+            return self._convert_raw_text_chunks_to_chat(
+                it, self.model_uid, include_usage, input_ids
+            )
+
         else:
             c = self._llm.chat(
                 chat_history_messages,
@@ -320,11 +368,13 @@ class ChatglmCppChatModel(LLM):
 
     @staticmethod
     def _convert_str_to_completion_chunk(
-        tokens: Iterator[str], model_name: str
+        tokens: Iterator[str], model_name: str, include_usage: bool, input_ids
     ) -> Iterator[CompletionChunk]:
-        for token in tokens:
+        request_id = str(uuid.uuid4())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        for i, token in enumerate(tokens):
             yield {
-                "id": "generate" + f"-{str(uuid.uuid4())}",
+                "id": "generate" + f"-{request_id}",
                 "model": model_name,
                 "object": "text_completion",
                 "created": int(time.time()),
@@ -332,6 +382,32 @@ class ChatglmCppChatModel(LLM):
                     {"index": 0, "text": token, "finish_reason": None, "logprobs": None}
                 ],
             }
+            prompt_tokens = len(input_ids)
+            completion_tokens = i
+            total_tokens = prompt_tokens + completion_tokens
+        # stop
+        yield {
+            "id": "chat" + f"cmpl-{request_id}",
+            "model": model_name,
+            "object": "text_completion",
+            "created": int(time.time()),
+            "choices": [
+                {"index": 0, "text": "", "finish_reason": "stop", "logprobs": None}
+            ],
+        }
+        if include_usage:
+            yield {
+                "id": "chat" + f"cmpl-{request_id}",
+                "model": model_name,
+                "object": "text_completion",
+                "created": int(time.time()),
+                "choices": [],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                },
+            }
 
     def generate(
         self,
@@ -344,7 +420,7 @@ class ChatglmCppChatModel(LLM):
 
         params = {
             "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
+            "max_context_length": generate_config.get("max_tokens", 1024),
             "top_k": generate_config.get("top_k"),
             "top_p": generate_config.get("top_p"),
             "temperature": generate_config.get("temperature"),
@@ -355,14 +431,23 @@ class ChatglmCppChatModel(LLM):
         params = {k: v for k, v in params.items() if v is not None}
 
         assert self._llm is not None
-
-        if generate_config["stream"]:
+        stream = generate_config.get("stream")
+        stream_options = generate_config.get("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+        if stream:
             it = self._llm.generate(
                 prompt,
                 **params,
             )
             assert not isinstance(it, str)
-            return self._convert_str_to_completion_chunk(it, self.model_uid)
+            input_ids = self._llm.tokenizer.encode(prompt, params["max_context_length"])
+            return self._convert_str_to_completion_chunk(
+                it, self.model_uid, include_usage, input_ids
+            )
         else:
             c = self._llm.generate(
                 prompt,
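Taken together, the chatglm.cpp changes above wire the OpenAI-style stream_options flag through both chat and generate. A minimal client-side sketch against Xinference's OpenAI-compatible /v1/chat/completions endpoint follows; the host, port, and the model uid "chatglm3-ggml" are placeholders for a locally running server and launched model, and it assumes the REST layer forwards stream_options as this release's fields.py/types.py changes suggest:

import json

import requests

resp = requests.post(
    "http://127.0.0.1:9997/v1/chat/completions",
    json={
        "model": "chatglm3-ggml",                       # placeholder model uid
        "messages": [{"role": "user", "content": "Say hello."}],
        "stream": True,
        "stream_options": {"include_usage": True},      # ask for the final usage chunk
    },
    stream=True,
)
for raw in resp.iter_lines():
    if not raw or not raw.startswith(b"data:"):
        continue
    payload = raw[len(b"data:"):].strip()
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    if chunk.get("usage") and not chunk.get("choices"):
        print("usage:", chunk["usage"])                 # emitted once, after the stop chunk

If stream_options is omitted, the stream still ends with the empty stop chunk but no usage chunk is appended, since include_usage defaults to False.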
xinference/model/llm/ggml/llamacpp.py

@@ -14,6 +14,7 @@
 import datetime
 import logging
 import os
+import time
 from typing import Iterable, Iterator, List, Optional, Union
 
 from ....types import (
@@ -22,6 +23,7 @@ from ....types import (
     ChatCompletionMessage,
     Completion,
     CompletionChunk,
+    CompletionUsage,
     CreateCompletionLlamaCpp,
     Embedding,
     LlamaCppGenerateConfig,
@@ -100,6 +102,8 @@ class LlamaCppModel(LLM):
             generate_config = LlamaCppGenerateConfig(
                 **CreateCompletionLlamaCpp(**generate_config).dict()
             )
+        # Currently, llama.cpp does not support lora
+        generate_config.pop("lora_name", None)  # type: ignore
         return generate_config
 
     def _convert_ggml_to_gguf(self, model_path: str) -> str:
@@ -195,16 +199,59 @@
             _generate_config: LlamaCppGenerateConfig,
         ) -> Iterator[CompletionChunk]:
             assert self._llm is not None
-            for _completion_chunk in self._llm(prompt=_prompt, **_generate_config):
+            prompt_token_ids: List[int] = (
+                (
+                    self._llm.tokenize(prompt.encode("utf-8"), special=True)
+                    if prompt != ""
+                    else [self._llm.token_bos()]
+                )
+                if isinstance(prompt, str)
+                else prompt
+            )
+            prompt_tokens = len(prompt_token_ids)
+            completion_tokens, total_tokens = 0, 0
+            request_id = 0
+            for index, _completion_chunk in enumerate(
+                self._llm(prompt=_prompt, **_generate_config)
+            ):
+                request_id = _completion_chunk["id"]
+                choice = _completion_chunk["choices"][0]
+                if choice["finish_reason"] is not None:
+                    completion_tokens = index
+                    total_tokens = prompt_tokens + completion_tokens
+                    _completion_chunk["usage"] = CompletionUsage(
+                        prompt_tokens=total_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=total_tokens,
+                    )
                 yield _completion_chunk
+            if include_usage:
+                chunk = CompletionChunk(
+                    id=request_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[],
+                )
+                chunk["usage"] = CompletionUsage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
+                yield chunk
 
         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )
 
         generate_config = self._sanitize_generate_config(generate_config)
-
         stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
 
         if not stream:
             assert self._llm is not None
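
The llama.cpp path takes a slightly different route: it tokenizes the prompt with llama.cpp's own tokenizer to get prompt_tokens, takes the chunk index at finish time as completion_tokens, and appends a usage-only CompletionChunk when include_usage is set. Below is a simplified, self-contained sketch of that pattern; it is not the xinference implementation, and the function name and chunk layout are illustrative:

import time
import uuid
from typing import Dict, Iterator, List


def stream_with_usage(
    chunks: Iterator[Dict],        # upstream OpenAI-style completion chunks
    prompt_token_ids: List[int],   # prompt already tokenized by the backend
    model: str,
    include_usage: bool,
) -> Iterator[Dict]:
    prompt_tokens = len(prompt_token_ids)
    completion_tokens = 0
    for chunk in chunks:
        completion_tokens += 1     # roughly one streamed chunk per generated token
        yield chunk                # pass upstream chunks through unchanged
    if include_usage:
        # Trailing chunk with empty choices that carries only the token counts.
        yield {
            "id": f"cmpl-{uuid.uuid4()}",
            "object": "text_completion",
            "created": int(time.time()),
            "model": model,
            "choices": [],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }

In the actual hunk, the usage fields are also filled in on the chunk whose finish_reason is set, and the trailing chunk reuses the upstream request id instead of minting a new one.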