xinference 0.7.4.1__py3-none-any.whl → 0.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (31)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +22 -8
  3. xinference/client/oscar/actor_client.py +78 -8
  4. xinference/core/model.py +14 -7
  5. xinference/core/supervisor.py +12 -0
  6. xinference/deploy/cmdline.py +16 -0
  7. xinference/deploy/test/test_cmdline.py +1 -0
  8. xinference/model/embedding/model_spec.json +40 -0
  9. xinference/model/llm/__init__.py +14 -1
  10. xinference/model/llm/llm_family.json +10 -1
  11. xinference/model/llm/llm_family.py +38 -2
  12. xinference/model/llm/llm_family_modelscope.json +10 -1
  13. xinference/model/llm/pytorch/chatglm.py +1 -0
  14. xinference/model/llm/pytorch/core.py +1 -1
  15. xinference/model/llm/pytorch/utils.py +50 -18
  16. xinference/model/llm/utils.py +2 -2
  17. xinference/model/llm/vllm/core.py +13 -4
  18. xinference/web/ui/build/asset-manifest.json +3 -3
  19. xinference/web/ui/build/index.html +1 -1
  20. xinference/web/ui/build/static/js/{main.31d347d8.js → main.236e72e7.js} +3 -3
  21. xinference/web/ui/build/static/js/main.236e72e7.js.map +1 -0
  22. xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +1 -0
  23. {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/METADATA +9 -2
  24. {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/RECORD +29 -29
  25. xinference/web/ui/build/static/js/main.31d347d8.js.map +0 -1
  26. xinference/web/ui/node_modules/.cache/babel-loader/ca8515ecefb4a06c5305417bfd9c04e13cf6b9103f52a47c925921b26c0a9f9d.json +0 -1
  27. /xinference/web/ui/build/static/js/{main.31d347d8.js.LICENSE.txt → main.236e72e7.js.LICENSE.txt} +0 -0
  28. {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/LICENSE +0 -0
  29. {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/WHEEL +0 -0
  30. {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/entry_points.txt +0 -0
  31. {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/utils.py
@@ -527,10 +527,12 @@ def generate_stream_chatglm(
     top_p = float(generate_config.get("top_p", 1.0))
     max_new_tokens = int(generate_config.get("max_tokens", 256))
     echo = generate_config.get("echo", False)
+    stop_str = generate_config.get("stop", None)
+    eos_token_id = generate_config.get("stop_token_ids", [])
+    eos_token_id.append(tokenizer.eos_token_id)
 
     inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
     input_echo_len = len(inputs["input_ids"][0])
-
     gen_kwargs = {
         "max_length": max_new_tokens + input_echo_len,
         "do_sample": True if temperature > 1e-5 else False,
@@ -543,7 +545,9 @@ def generate_stream_chatglm(
 
     total_len = 0
     last_response_length = 0
-    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
+    for total_ids in model.stream_generate(
+        **inputs, eos_token_id=eos_token_id, **gen_kwargs
+    ):
         total_ids = total_ids.tolist()[0]
         total_len = len(total_ids)
         if echo:
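The two hunks above read the new "stop" and "stop_token_ids" options from generate_config and pass the combined eos ids into stream_generate. A hedged usage sketch of how a caller might set them; the client entry point, server address, and model uid are placeholders, and only the generate_config keys come from the hunks:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://127.0.0.1:9997")
    model = client.get_model("my-chatglm3-uid")  # placeholder uid of a running ChatGLM model
    completion = model.generate(
        "Say hello, then stop.",
        generate_config={
            "max_tokens": 128,
            "stop": ["<|user|>"],   # cut generation at this string (new stop_str logic)
            "stop_token_ids": [],   # extra eos ids, merged with tokenizer.eos_token_id
        },
    )
    print(completion["choices"][0]["text"])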
@@ -553,29 +557,57 @@ def generate_stream_chatglm(
         response = tokenizer.decode(output_ids)
         response = process_response(response)
 
+        partially_stopped = False
+        stopped = False
+        if stop_str:
+            if isinstance(stop_str, str):
+                pos = response.rfind(stop_str, 0)
+                if pos != -1:
+                    response = response[:pos]
+                    stopped = True
+                else:
+                    partially_stopped = is_partial_stop(response, stop_str)
+            elif isinstance(stop_str, Iterable):
+                for each_stop in stop_str:
+                    pos = response.rfind(each_stop, 0)
+                    if pos != -1:
+                        response = response[:pos]
+                        stopped = True
+                        break
+                    else:
+                        partially_stopped = is_partial_stop(response, each_stop)
+                        if partially_stopped:
+                            break
+            else:
+                raise ValueError("Invalid stop field type.")
+
         if stream:
             response = response.strip("�")
             tmp_response_length = len(response)
             response = response[last_response_length:]
             last_response_length = tmp_response_length
 
-        completion_choice = CompletionChoice(
-            text=response, index=0, logprobs=None, finish_reason=None
-        )
-        completion_chunk = CompletionChunk(
-            id=str(uuid.uuid1()),
-            object="text_completion",
-            created=int(time.time()),
-            model=model_uid,
-            choices=[completion_choice],
-        )
-        completion_usage = CompletionUsage(
-            prompt_tokens=input_echo_len,
-            completion_tokens=(total_len - input_echo_len),
-            total_tokens=total_len,
-        )
+        if not partially_stopped:
+            completion_choice = CompletionChoice(
+                text=response, index=0, logprobs=None, finish_reason=None
+            )
+            completion_chunk = CompletionChunk(
+                id=str(uuid.uuid1()),
+                object="text_completion",
+                created=int(time.time()),
+                model=model_uid,
+                choices=[completion_choice],
+            )
+            completion_usage = CompletionUsage(
+                prompt_tokens=input_echo_len,
+                completion_tokens=(total_len - input_echo_len),
+                total_tokens=total_len,
+            )
+
+            yield completion_chunk, completion_usage
 
-        yield completion_chunk, completion_usage
+        if stopped:
+            break
 
         if total_len - input_echo_len == max_new_tokens - 1:
             finish_reason = "length"
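The stop handling added above calls is_partial_stop, which is not defined anywhere in this diff. A minimal sketch of what such a helper typically does (following the FastChat-style implementation this code appears to mirror; treat it as an assumption, not the exact body shipped in 0.7.5): it reports whether the tail of the current output could still be the beginning of a stop string, so that chunk is held back instead of emitted.

    def is_partial_stop(output: str, stop_str: str) -> bool:
        # True if some suffix of `output` is a prefix of `stop_str`,
        # i.e. the stop string may still be completed by the next tokens.
        for i in range(0, min(len(output), len(stop_str))):
            if stop_str.startswith(output[-i - 1 :]):
                return True
        return False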
xinference/model/llm/utils.py
@@ -141,7 +141,7 @@ class ChatModelMixin:
             return ret
         elif prompt_style.style_name == "CHATGLM3":
             prompts = (
-                [f"<|system|>\n{prompt_style.system_prompt}"]
+                [f"<|system|>\n {prompt_style.system_prompt}"]
                 if prompt_style.system_prompt
                 else []
             )
@@ -155,7 +155,7 @@ class ChatModelMixin:
                 if content:
                     if role == "tool":
                         role = "observation"
-                    prompts.append(f"<|{role}|>\n{content}")
+                    prompts.append(f"<|{role}|>\n {content}")
                 else:
                     prompts.append(f"<|{role}|>")
             return "\n".join(prompts)
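The only change in these two hunks is a space inserted after the newline that follows each role tag, presumably to match the spacing of the reference ChatGLM3 prompt format. Since the pieces are joined with "\n", a rendered prompt now looks roughly like this (the messages are illustrative, not from the source):

    <|system|>
     You are a helpful assistant.
    <|user|>
     hello
    <|assistant|>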
xinference/model/llm/vllm/core.py
@@ -37,6 +37,7 @@ from ....types import (
     CompletionUsage,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..llm_family import CustomLLMFamilyV1
 from ..utils import ChatModelMixin
 
 logger = logging.getLogger(__name__)
@@ -197,8 +198,12 @@ class VLLMModel(LLM):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
+                return False
         if "generate" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED
@@ -329,8 +334,12 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
         if "chat" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED
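Together with the new CustomLLMFamilyV1 import, these two hunks change how vLLM compatibility is decided for user-registered models: the declared model_family, rather than the custom model_name, is checked against the supported lists, so a custom model based on a supported family can still use the vLLM backend. A condensed sketch of that dispatch, using only names that appear in the hunks (simplified, not the actual match_ method):

    def vllm_supported(llm_family) -> bool:
        # Custom families are matched by the built-in family they declare;
        # built-in families are matched by their own model name.
        key = (
            llm_family.model_family
            if isinstance(llm_family, CustomLLMFamilyV1)
            else llm_family.model_name
        )
        return key in VLLM_SUPPORTED_CHAT_MODELS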
xinference/web/ui/build/asset-manifest.json
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.31d347d8.js",
+    "main.js": "./static/js/main.236e72e7.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.31d347d8.js.map": "./static/js/main.31d347d8.js.map"
+    "main.236e72e7.js.map": "./static/js/main.236e72e7.js.map"
   },
   "entrypoints": [
-    "static/js/main.31d347d8.js"
+    "static/js/main.236e72e7.js"
   ]
 }
xinference/web/ui/build/index.html
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.31d347d8.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.236e72e7.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>