xinference 0.7.4.1__py3-none-any.whl → 0.7.5__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +22 -8
- xinference/client/oscar/actor_client.py +78 -8
- xinference/core/model.py +14 -7
- xinference/core/supervisor.py +12 -0
- xinference/deploy/cmdline.py +16 -0
- xinference/deploy/test/test_cmdline.py +1 -0
- xinference/model/embedding/model_spec.json +40 -0
- xinference/model/llm/__init__.py +14 -1
- xinference/model/llm/llm_family.json +10 -1
- xinference/model/llm/llm_family.py +38 -2
- xinference/model/llm/llm_family_modelscope.json +10 -1
- xinference/model/llm/pytorch/chatglm.py +1 -0
- xinference/model/llm/pytorch/core.py +1 -1
- xinference/model/llm/pytorch/utils.py +50 -18
- xinference/model/llm/utils.py +2 -2
- xinference/model/llm/vllm/core.py +13 -4
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.31d347d8.js → main.236e72e7.js} +3 -3
- xinference/web/ui/build/static/js/main.236e72e7.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/78f2521da2e2a98b075a2666cb782c7e2c019cd3c72199eecd5901c82d8655df.json +1 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/METADATA +9 -2
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/RECORD +29 -29
- xinference/web/ui/build/static/js/main.31d347d8.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ca8515ecefb4a06c5305417bfd9c04e13cf6b9103f52a47c925921b26c0a9f9d.json +0 -1
- /xinference/web/ui/build/static/js/{main.31d347d8.js.LICENSE.txt → main.236e72e7.js.LICENSE.txt} +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/LICENSE +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/WHEEL +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/entry_points.txt +0 -0
- {xinference-0.7.4.1.dist-info → xinference-0.7.5.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/utils.py
CHANGED
@@ -527,10 +527,12 @@ def generate_stream_chatglm(
     top_p = float(generate_config.get("top_p", 1.0))
     max_new_tokens = int(generate_config.get("max_tokens", 256))
     echo = generate_config.get("echo", False)
+    stop_str = generate_config.get("stop", None)
+    eos_token_id = generate_config.get("stop_token_ids", [])
+    eos_token_id.append(tokenizer.eos_token_id)

     inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
     input_echo_len = len(inputs["input_ids"][0])
-
     gen_kwargs = {
         "max_length": max_new_tokens + input_echo_len,
         "do_sample": True if temperature > 1e-5 else False,
@@ -543,7 +545,9 @@ def generate_stream_chatglm(

     total_len = 0
     last_response_length = 0
-    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
+    for total_ids in model.stream_generate(
+        **inputs, eos_token_id=eos_token_id, **gen_kwargs
+    ):
         total_ids = total_ids.tolist()[0]
         total_len = len(total_ids)
         if echo:
@@ -553,29 +557,57 @@ def generate_stream_chatglm(
         response = tokenizer.decode(output_ids)
         response = process_response(response)

+        partially_stopped = False
+        stopped = False
+        if stop_str:
+            if isinstance(stop_str, str):
+                pos = response.rfind(stop_str, 0)
+                if pos != -1:
+                    response = response[:pos]
+                    stopped = True
+                else:
+                    partially_stopped = is_partial_stop(response, stop_str)
+            elif isinstance(stop_str, Iterable):
+                for each_stop in stop_str:
+                    pos = response.rfind(each_stop, 0)
+                    if pos != -1:
+                        response = response[:pos]
+                        stopped = True
+                        break
+                    else:
+                        partially_stopped = is_partial_stop(response, each_stop)
+                        if partially_stopped:
+                            break
+            else:
+                raise ValueError("Invalid stop field type.")
+
         if stream:
             response = response.strip("�")
             tmp_response_length = len(response)
             response = response[last_response_length:]
             last_response_length = tmp_response_length

-            completion_choice = CompletionChoice(
-                text=response, index=0, logprobs=None, finish_reason=None
-            )
-            completion_chunk = CompletionChunk(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
-                prompt_tokens=input_echo_len,
-                completion_tokens=(total_len - input_echo_len),
-                total_tokens=total_len,
-            )
+            if not partially_stopped:
+                completion_choice = CompletionChoice(
+                    text=response, index=0, logprobs=None, finish_reason=None
+                )
+                completion_chunk = CompletionChunk(
+                    id=str(uuid.uuid1()),
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=model_uid,
+                    choices=[completion_choice],
+                )
+                completion_usage = CompletionUsage(
+                    prompt_tokens=input_echo_len,
+                    completion_tokens=(total_len - input_echo_len),
+                    total_tokens=total_len,
+                )
+
+                yield completion_chunk, completion_usage

-            yield completion_chunk, completion_usage
+        if stopped:
+            break

         if total_len - input_echo_len == max_new_tokens - 1:
             finish_reason = "length"
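The new stop handling above holds back a streamed chunk whenever the decoded text could still grow into a stop string, which it detects via is_partial_stop. That helper is imported elsewhere in pytorch/utils.py and is not part of this diff; the sketch below only illustrates the check it performs (some suffix of the current output matching a prefix of the stop string), not the actual implementation.

# Illustrative sketch, not the implementation shipped in xinference:
# a "partial stop" means some suffix of the streamed text is a prefix of the stop string.
def is_partial_stop(output: str, stop_str: str) -> bool:
    for i in range(min(len(output), len(stop_str)), 0, -1):
        if output.endswith(stop_str[:i]):
            return True
    return False

# Example: with stop_str="Observation:", a chunk ending in "Observ" is withheld
# (partially_stopped is True) until later tokens confirm or rule out the stop string.
print(is_partial_stop("Thought: done\nObserv", "Observation:"))  # True
print(is_partial_stop("Thought: done", "Observation:"))          # False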
xinference/model/llm/utils.py
CHANGED
@@ -141,7 +141,7 @@ class ChatModelMixin:
             return ret
         elif prompt_style.style_name == "CHATGLM3":
             prompts = (
-                [f"<|system|>\n{prompt_style.system_prompt}"]
+                [f"<|system|>\n {prompt_style.system_prompt}"]
                 if prompt_style.system_prompt
                 else []
             )
@@ -155,7 +155,7 @@ class ChatModelMixin:
                 if content:
                     if role == "tool":
                         role = "observation"
-                    prompts.append(f"<|{role}|>\n{content}")
+                    prompts.append(f"<|{role}|>\n {content}")
                 else:
                     prompts.append(f"<|{role}|>")
             return "\n".join(prompts)
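Both hunks insert a space between the role tag's newline and the message body, so the CHATGLM3 prompt assembled by ChatModelMixin now puts one leading space before each turn's content. An illustrative rendering (the messages and the trailing assistant tag are made up for the example; only the two f-strings come from this diff):

# Illustrative only: what the changed f-strings produce for one system + one user turn.
system_prompt = "You are a helpful assistant."
content = "What is Xinference?"

prompts = [f"<|system|>\n {system_prompt}"]
prompts.append(f"<|user|>\n {content}")
prompts.append("<|assistant|>")
print("\n".join(prompts))
# <|system|>
#  You are a helpful assistant.
# <|user|>
#  What is Xinference?
# <|assistant|>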
xinference/model/llm/vllm/core.py
CHANGED
@@ -37,6 +37,7 @@ from ....types import (
     CompletionUsage,
 )
 from .. import LLM, LLMFamilyV1, LLMSpecV1
+from ..llm_family import CustomLLMFamilyV1
 from ..utils import ChatModelMixin

 logger = logging.getLogger(__name__)
@@ -197,8 +198,12 @@ class VLLMModel(LLM):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_MODELS:
+                return False
         if "generate" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED
@@ -329,8 +334,12 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         # Currently, only 4-bit weight quantization is supported for GPTQ, but got 8 bits.
         if "4" not in quantization:
             return False
-        if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
-            return False
+        if isinstance(llm_family, CustomLLMFamilyV1):
+            if llm_family.model_family not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
+        else:
+            if llm_family.model_name not in VLLM_SUPPORTED_CHAT_MODELS:
+                return False
         if "chat" not in llm_family.model_ability:
             return False
         return VLLM_INSTALLED
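With this change, a custom model registered through CustomLLMFamilyV1 is matched against the vLLM allow-lists by its model_family (the built-in family it declares), while built-in models are still matched by model_name. A standalone sketch of that rule, using stand-in objects rather than the real pydantic families and an assumed subset of VLLM_SUPPORTED_MODELS:

# Sketch of the matching rule added to VLLMModel.match / VLLMChatModel.match.
# SimpleNamespace stands in for the real LLMFamilyV1 / CustomLLMFamilyV1 models,
# and the supported set below is an assumed example, not the full list in core.py.
from types import SimpleNamespace

VLLM_SUPPORTED_MODELS = {"llama-2", "baichuan-2"}

def matches_vllm(llm_family, is_custom: bool) -> bool:
    key = llm_family.model_family if is_custom else llm_family.model_name
    return key in VLLM_SUPPORTED_MODELS

builtin = SimpleNamespace(model_name="llama-2", model_family=None)
custom = SimpleNamespace(model_name="my-llama-finetune", model_family="llama-2")

print(matches_vllm(builtin, is_custom=False))  # True: built-in matched by model_name
print(matches_vllm(custom, is_custom=True))    # True: custom matched by model_family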
xinference/web/ui/build/asset-manifest.json
CHANGED
@@ -1,11 +1,11 @@
 {
   "files": {
-    "main.js": "./static/js/main.31d347d8.js",
+    "main.js": "./static/js/main.236e72e7.js",
     "static/media/icon.webp": "./static/media/icon.4603d52c63041e5dfbfd.webp",
     "index.html": "./index.html",
-    "main.31d347d8.js.map": "./static/js/main.31d347d8.js.map"
+    "main.236e72e7.js.map": "./static/js/main.236e72e7.js.map"
   },
   "entrypoints": [
-    "static/js/main.31d347d8.js"
+    "static/js/main.236e72e7.js"
   ]
 }
xinference/web/ui/build/index.html
CHANGED
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.31d347d8.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="./logo192.png"/><link rel="manifest" href="./manifest.json"/><title>Xinference</title><script defer="defer" src="./static/js/main.236e72e7.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>