xinference 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic by the registry.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +35 -1
- xinference/client/oscar/actor_client.py +2 -2
- xinference/client/restful/restful_client.py +2 -2
- xinference/conftest.py +5 -1
- xinference/core/metrics.py +83 -0
- xinference/core/model.py +148 -8
- xinference/core/status_guard.py +86 -0
- xinference/core/supervisor.py +57 -7
- xinference/core/worker.py +132 -13
- xinference/deploy/cmdline.py +57 -4
- xinference/deploy/local.py +32 -6
- xinference/deploy/worker.py +33 -5
- xinference/fields.py +4 -1
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/ggml/llamacpp.py +3 -2
- xinference/model/llm/llm_family.json +70 -3
- xinference/model/llm/llm_family.py +11 -1
- xinference/model/llm/llm_family_modelscope.json +72 -3
- xinference/model/llm/pytorch/chatglm.py +70 -28
- xinference/model/llm/pytorch/core.py +11 -30
- xinference/model/llm/pytorch/internlm2.py +155 -0
- xinference/model/llm/pytorch/utils.py +0 -153
- xinference/model/llm/utils.py +37 -8
- xinference/model/llm/vllm/core.py +15 -3
- xinference/model/multimodal/__init__.py +15 -8
- xinference/model/multimodal/model_spec_modelscope.json +45 -0
- xinference/model/utils.py +7 -2
- xinference/types.py +2 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/METADATA +2 -1
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/RECORD +35 -31
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/utils.py CHANGED
@@ -14,7 +14,6 @@
 
 import gc
 import logging
-import re
 import time
 import uuid
 from threading import Thread
@@ -23,7 +22,6 @@ from typing import Iterable, Iterator, Tuple
 import torch
 from transformers import GenerationConfig, TextIteratorStreamer
 from transformers.generation.logits_process import (
-    LogitsProcessor,
     LogitsProcessorList,
     RepetitionPenaltyLogitsProcessor,
     TemperatureLogitsWarper,
@@ -480,154 +478,3 @@ def generate_stream_falcon(
     # clean
     gc.collect()
     torch.cuda.empty_cache()
-
-
-class InvalidScoreLogitsProcessor(LogitsProcessor):
-    def __call__(
-        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
-    ) -> torch.FloatTensor:
-        if torch.isnan(scores).any() or torch.isinf(scores).any():
-            scores.zero_()
-            scores[..., 5] = 5e4
-        return scores
-
-
-invalid_score_processor = InvalidScoreLogitsProcessor()
-
-
-def process_response(response):
-    response = response.strip()
-    response = response.replace("[[训练时间]]", "2023年")
-    punkts = [
-        [",", ","],
-        ["!", "!"],
-        [":", ":"],
-        [";", ";"],
-        ["\\?", "?"],
-    ]
-    for item in punkts:
-        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
-        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
-    return response
-
-
-@torch.inference_mode()
-def generate_stream_chatglm(
-    model_uid,
-    model,
-    tokenizer,
-    prompt,
-    device,
-    generate_config,
-    judge_sent_end=False,
-):
-    stream = generate_config.get("stream", False)
-    temperature = float(generate_config.get("temperature", 1.0))
-    repetition_penalty = float(generate_config.get("repetition_penalty", 1.0))
-    top_p = float(generate_config.get("top_p", 1.0))
-    max_new_tokens = int(generate_config.get("max_tokens", 256))
-    echo = generate_config.get("echo", False)
-    stop_str = generate_config.get("stop", None)
-    eos_token_id = generate_config.get("stop_token_ids", [])
-    eos_token_id.append(tokenizer.eos_token_id)
-
-    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
-    input_echo_len = len(inputs["input_ids"][0])
-    gen_kwargs = {
-        "max_length": max_new_tokens + input_echo_len,
-        "do_sample": True if temperature > 1e-5 else False,
-        "top_p": top_p,
-        "repetition_penalty": repetition_penalty,
-        "logits_processor": [invalid_score_processor],
-    }
-    if temperature > 1e-5:
-        gen_kwargs["temperature"] = temperature
-
-    total_len = 0
-    last_response_length = 0
-    for total_ids in model.stream_generate(
-        **inputs, eos_token_id=eos_token_id, **gen_kwargs
-    ):
-        total_ids = total_ids.tolist()[0]
-        total_len = len(total_ids)
-        if echo:
-            output_ids = total_ids
-        else:
-            output_ids = total_ids[input_echo_len:]
-        response = tokenizer.decode(output_ids)
-        response = process_response(response)
-
-        partially_stopped = False
-        stopped = False
-        if stop_str:
-            if isinstance(stop_str, str):
-                pos = response.rfind(stop_str, 0)
-                if pos != -1:
-                    response = response[:pos]
-                    stopped = True
-                else:
-                    partially_stopped = is_partial_stop(response, stop_str)
-            elif isinstance(stop_str, Iterable):
-                for each_stop in stop_str:
-                    pos = response.rfind(each_stop, 0)
-                    if pos != -1:
-                        response = response[:pos]
-                        stopped = True
-                        break
-                    else:
-                        partially_stopped = is_partial_stop(response, each_stop)
-                        if partially_stopped:
-                            break
-            else:
-                raise ValueError("Invalid stop field type.")
-
-        if stream:
-            response = response.strip("�")
-            tmp_response_length = len(response)
-            response = response[last_response_length:]
-            last_response_length = tmp_response_length
-
-        if not partially_stopped:
-            completion_choice = CompletionChoice(
-                text=response, index=0, logprobs=None, finish_reason=None
-            )
-            completion_chunk = CompletionChunk(
-                id=str(uuid.uuid1()),
-                object="text_completion",
-                created=int(time.time()),
-                model=model_uid,
-                choices=[completion_choice],
-            )
-            completion_usage = CompletionUsage(
-                prompt_tokens=input_echo_len,
-                completion_tokens=(total_len - input_echo_len),
-                total_tokens=total_len,
-            )
-
-            yield completion_chunk, completion_usage
-
-        if stopped:
-            break
-
-    if total_len - input_echo_len == max_new_tokens - 1:
-        finish_reason = "length"
-    else:
-        finish_reason = "stop"
-
-    completion_choice = CompletionChoice(
-        text=response, index=0, logprobs=None, finish_reason=finish_reason
-    )
-    completion_chunk = CompletionChunk(
-        id=str(uuid.uuid1()),
-        object="text_completion",
-        created=int(time.time()),
-        model=model_uid,
-        choices=[completion_choice],
-    )
-    completion_usage = CompletionUsage(
-        prompt_tokens=input_echo_len,
-        completion_tokens=(total_len - input_echo_len),
-        total_tokens=total_len,
-    )
-
-    yield completion_chunk, completion_usage
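For context on what was dropped above: the removed process_response helper normalized ASCII punctuation adjacent to CJK characters into full-width punctuation before each streamed ChatGLM chunk. A minimal standalone sketch of that behavior, kept only for illustration since the helper no longer exists in 0.8.1:

import re

# Standalone re-creation of the removed process_response punctuation pass
# (illustrative only, not part of xinference 0.8.1).
def normalize_cjk_punctuation(response: str) -> str:
    response = response.strip()
    punkts = [
        [",", ","],
        ["!", "!"],
        [":", ":"],
        [";", ";"],
        ["\\?", "?"],
    ]
    for ascii_pat, fullwidth in punkts:
        # Replace ASCII punctuation that directly follows or precedes a CJK character.
        response = re.sub(r"([\u4e00-\u9fff])%s" % ascii_pat, r"\1%s" % fullwidth, response)
        response = re.sub(r"%s([\u4e00-\u9fff])" % ascii_pat, r"%s\1" % fullwidth, response)
    return response

print(normalize_cjk_punctuation("你好,世界!"))  # -> 你好,世界!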
xinference/model/llm/utils.py CHANGED
@@ -16,7 +16,7 @@ import json
 import logging
 import time
 import uuid
-from typing import AsyncGenerator, Dict, Iterator, List, Optional
+from typing import AsyncGenerator, Dict, Iterator, List, Optional, cast
 
 from xinference.model.llm.llm_family import PromptStyleV1
 
@@ -299,6 +299,24 @@ Begin!"""
            )
            ret += chat_history[-1]["role"] + ":"
            return ret
+        elif prompt_style.style_name == "INTERNLM2":
+            ret = (
+                "<s>"
+                if prompt_style.system_prompt == ""
+                else "<s>[UNUSED_TOKEN_146]system\n"
+                + prompt_style.system_prompt
+                + prompt_style.intra_message_sep
+                + "\n"
+            )
+            for message in chat_history:
+                role = message["role"]
+                content = message["content"]
+
+                if content:
+                    ret += role + "\n" + content + prompt_style.intra_message_sep + "\n"
+                else:
+                    ret += role + "\n"
+            return ret
         elif prompt_style.style_name == "ADD_COLON_SINGLE_COT":
             ret = prompt_style.system_prompt + prompt_style.intra_message_sep
             for message in chat_history:
@@ -360,7 +378,7 @@ Begin!"""
 
     @classmethod
     def _to_chat_completion_chunk(cls, chunk: CompletionChunk) -> ChatCompletionChunk:
-
+        chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
             "created": chunk["created"],
@@ -376,12 +394,16 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
+        usage = chunk.get("usage")
+        if usage is not None:
+            chat_chunk["usage"] = usage
+        return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
     def _get_first_chat_completion_chunk(
         cls, chunk: CompletionChunk
     ) -> ChatCompletionChunk:
-
+        chat_chunk = {
             "id": "chat" + chunk["id"],
             "model": chunk["model"],
             "created": chunk["created"],
@@ -397,6 +419,10 @@ Begin!"""
                 for i, choice in enumerate(chunk["choices"])
             ],
         }
+        usage = chunk.get("usage")
+        if usage is not None:
+            chat_chunk["usage"] = usage
+        return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
     def _to_chat_completion_chunks(
@@ -494,16 +520,19 @@ Begin!"""
         return text, None, None
 
     @classmethod
-    def _tool_calls_completion(cls,
+    def _tool_calls_completion(cls, model_family, model_uid, c, tools):
         _id = str(uuid.uuid4())
-
+        family = model_family.model_family or model_family.model_name
+        if "gorilla-openfunctions-v1" == family:
             content, func, args = cls._eval_gorilla_openfunctions_arguments(c, tools)
-        elif
+        elif "chatglm3" == family:
             content, func, args = cls._eval_chatglm3_arguments(c, tools)
-        elif
+        elif "qwen-chat" == family:
             content, func, args = cls._eval_qwen_chat_arguments(c, tools)
         else:
-            raise Exception(
+            raise Exception(
+                f"Model {model_family.model_name} is not support tool calls."
+            )
         logger.debug("Tool call content: %s, func: %s, args: %s", content, func, args)
 
         if content:
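The INTERNLM2 branch added above assembles the prompt from a <s> prefix, an optional [UNUSED_TOKEN_146]system block, and role/content pairs joined by the family's intra_message_sep. A minimal sketch of that string construction follows; the separator value and role tokens used here are illustrative assumptions, since the real values come from the model's prompt_style configuration in llm_family.json:

from typing import Dict, List

def build_internlm2_prompt(
    system_prompt: str,
    chat_history: List[Dict[str, str]],
    intra_message_sep: str = "[UNUSED_TOKEN_145]",  # assumed value; the real one comes from prompt_style
) -> str:
    # Mirrors the INTERNLM2 branch shown in the diff above (illustrative only).
    ret = (
        "<s>"
        if system_prompt == ""
        else "<s>[UNUSED_TOKEN_146]system\n" + system_prompt + intra_message_sep + "\n"
    )
    for message in chat_history:
        role = message["role"]
        content = message["content"]
        if content:
            ret += role + "\n" + content + intra_message_sep + "\n"
        else:
            ret += role + "\n"
    return ret

# Hypothetical role strings; in xinference they are taken from prompt_style.roles.
history = [
    {"role": "[UNUSED_TOKEN_146]user", "content": "Hello"},
    {"role": "[UNUSED_TOKEN_146]assistant", "content": ""},
]
print(build_internlm2_prompt("You are a helpful assistant.", history))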
xinference/model/llm/vllm/core.py CHANGED
@@ -94,6 +94,7 @@ VLLM_SUPPORTED_CHAT_MODELS = [
     "code-llama-python",
     "code-llama-instruct",
     "mistral-instruct-v0.1",
+    "mistral-instruct-v0.2",
     "chatglm3",
 ]
 
@@ -170,7 +171,7 @@ class VLLMModel(LLM):
         )
         sanitized.setdefault("temperature", generate_config.get("temperature", 1.0))
         sanitized.setdefault("top_p", generate_config.get("top_p", 1.0))
-        sanitized.setdefault("max_tokens", generate_config.get("max_tokens",
+        sanitized.setdefault("max_tokens", generate_config.get("max_tokens", 1024))
         sanitized.setdefault("stop", generate_config.get("stop", None))
         sanitized.setdefault(
             "stop_token_ids", generate_config.get("stop_token_ids", None)
@@ -303,6 +304,16 @@ class VLLMModel(LLM):
                 delta = choice["text"][len(previous_texts[i]) :]
                 previous_texts[i] = choice["text"]
                 choice["text"] = delta
+            prompt_tokens = len(_request_output.prompt_token_ids)
+            completion_tokens = sum(
+                len(output.token_ids) for output in _request_output.outputs
+            )
+            total_tokens = prompt_tokens + completion_tokens
+            chunk["usage"] = CompletionUsage(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=total_tokens,
+            )
             yield chunk
 
         if stream:
@@ -379,7 +390,8 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
 
         generate_config = self._sanitize_chat_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
-
+        model_family = self.model_family.model_family or self.model_family.model_name
+        if tools and "qwen-chat" == model_family:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -400,6 +412,6 @@ class VLLMChatModel(VLLMModel, ChatModelMixin):
         assert not isinstance(c, AsyncGenerator)
         if tools:
             return self._tool_calls_completion(
-                self.model_family
+                self.model_family, self.model_uid, c, tools
            )
         return self._to_chat_completion(c)
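The usage accounting added to the vLLM streaming path counts prompt tokens once and sums token ids across all outputs of the request. A standalone sketch of the same arithmetic against a mock object; the mock classes below are hypothetical stand-ins reduced to the attributes the diff reads:

from dataclasses import dataclass, field
from typing import Dict, List

# Hypothetical stand-ins for vLLM's RequestOutput/CompletionOutput.
@dataclass
class MockCompletionOutput:
    token_ids: List[int] = field(default_factory=list)

@dataclass
class MockRequestOutput:
    prompt_token_ids: List[int] = field(default_factory=list)
    outputs: List[MockCompletionOutput] = field(default_factory=list)

def usage_from_request_output(request_output: MockRequestOutput) -> Dict[str, int]:
    # Same arithmetic as the chunk["usage"] block added in the diff above.
    prompt_tokens = len(request_output.prompt_token_ids)
    completion_tokens = sum(len(o.token_ids) for o in request_output.outputs)
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }

demo = MockRequestOutput(prompt_token_ids=[1, 2, 3], outputs=[MockCompletionOutput([4, 5])])
print(usage_from_request_output(demo))  # {'prompt_tokens': 3, 'completion_tokens': 2, 'total_tokens': 5}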
xinference/model/multimodal/__init__.py CHANGED
@@ -30,16 +30,23 @@ MODEL_CLASSES.append(QwenVLChat)
 
 
 def _install():
-
+    json_path_huggingface = os.path.join(
         os.path.dirname(os.path.abspath(__file__)), "model_spec.json"
     )
-
-
-
-
-
-
-
+    json_path_modelscope = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "model_spec_modelscope.json"
+    )
+    for builtin_family, json_path in [
+        (BUILTIN_LVLM_FAMILIES, json_path_huggingface),
+        (BUILTIN_MODELSCOPE_LVLM_FAMILIES, json_path_modelscope),
+    ]:
+        for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
+            model_family = LVLMFamilyV1.parse_obj(json_obj)
+            builtin_family.append(model_family)
+            for model_spec in model_family.model_specs:
+                MODEL_NAME_TO_REVISION[model_family.model_name].append(
+                    model_spec.model_revision
+                )
 
 
 _install()
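The rewritten _install above now walks a (family-registry, json-path) pair per hub instead of loading only the Hugging Face spec. A minimal sketch of that registration pattern, with hypothetical stand-ins for the registries and without the pydantic validation the real code performs via LVLMFamilyV1.parse_obj:

import codecs
import json
import os
from collections import defaultdict

# Hypothetical stand-ins for the real registries.
BUILTIN_LVLM_FAMILIES: list = []
BUILTIN_MODELSCOPE_LVLM_FAMILIES: list = []
MODEL_NAME_TO_REVISION = defaultdict(list)

def install_from_specs(spec_dir: str) -> None:
    json_path_huggingface = os.path.join(spec_dir, "model_spec.json")
    json_path_modelscope = os.path.join(spec_dir, "model_spec_modelscope.json")
    for builtin_family, json_path in [
        (BUILTIN_LVLM_FAMILIES, json_path_huggingface),
        (BUILTIN_MODELSCOPE_LVLM_FAMILIES, json_path_modelscope),
    ]:
        # Each spec file is a JSON list of model-family objects.
        for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")):
            builtin_family.append(json_obj)
            for model_spec in json_obj.get("model_specs", []):
                MODEL_NAME_TO_REVISION[json_obj["model_name"]].append(
                    model_spec.get("model_revision")
                )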
xinference/model/multimodal/model_spec_modelscope.json ADDED
@@ -0,0 +1,45 @@
+[
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "qwen-vl-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "Qwen/Qwen-VL-Chat",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "gptq",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "Qwen/Qwen-VL-Chat-{quantization}",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "QWEN",
+      "system_prompt": "You are a helpful assistant.",
+      "roles": [
+        "user",
+        "assistant"
+      ]
+    }
+  }
+]
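The gptq spec's model_id carries a {quantization} placeholder; the usual convention (an assumption here, shown only to clarify why the placeholder appears) is that it gets substituted with the selected quantization when the hub repo is resolved:

# Hypothetical illustration of how a templated model_id would resolve.
model_id_template = "Qwen/Qwen-VL-Chat-{quantization}"
print(model_id_template.format(quantization="Int4"))  # Qwen/Qwen-VL-Chat-Int4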
xinference/model/utils.py CHANGED
@@ -153,8 +153,13 @@ def is_model_cached(model_spec: Any, name_to_revisions_mapping: Dict):
 
 
 def is_valid_model_name(model_name: str) -> bool:
-
-
+    import re
+
+    if len(model_name) == 0:
+        return False
+
+    # check if contains +/?%#&=\s
+    return re.match(r"^[^+\/?%#&=\s]*$", model_name) is not None
 
 
 def parse_uri(uri: str) -> Tuple[str, str]:
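The new is_valid_model_name rejects empty names and any name containing +, /, ?, %, #, &, = or whitespace. A quick self-contained check of the same regex against a few sample names:

import re

def is_valid_model_name(model_name: str) -> bool:
    # Same logic as the function added in xinference/model/utils.py above.
    if len(model_name) == 0:
        return False
    return re.match(r"^[^+\/?%#&=\s]*$", model_name) is not None

print(is_valid_model_name("my-custom-llama"))  # True
print(is_valid_model_name("bad/name"))         # False (contains '/')
print(is_valid_model_name("bad name"))         # False (contains whitespace)
print(is_valid_model_name(""))                 # False (empty)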
xinference/types.py CHANGED
@@ -110,6 +110,7 @@ class CompletionChunk(TypedDict):
     created: int
     model: str
     choices: List[CompletionChoice]
+    usage: NotRequired[CompletionUsage]
 
 
 class Completion(TypedDict):
@@ -160,6 +161,7 @@ class ChatCompletionChunk(TypedDict):
     object: Literal["chat.completion.chunk"]
     created: int
     choices: List[ChatCompletionChunkChoice]
+    usage: NotRequired[CompletionUsage]
 
 
 class ChatglmCppModelConfig(TypedDict, total=False):
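Because usage is declared NotRequired, existing chunk producers that omit it still type-check, and consumers must treat the key as optional. A small illustration, with the type definitions reduced to what the snippet needs (not the full xinference types):

from typing import List
from typing_extensions import NotRequired, TypedDict

# Reduced stand-ins for the xinference types touched by this change.
class CompletionUsage(TypedDict):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int

class CompletionChunk(TypedDict):
    id: str
    object: str
    created: int
    model: str
    choices: List[dict]
    usage: NotRequired[CompletionUsage]  # new in 0.8.1: may be absent

chunk_without_usage: CompletionChunk = {
    "id": "cmpl-1", "object": "text_completion", "created": 0, "model": "demo", "choices": [],
}
chunk_with_usage: CompletionChunk = {
    "id": "cmpl-2", "object": "text_completion", "created": 0, "model": "demo", "choices": [],
    "usage": {"prompt_tokens": 3, "completion_tokens": 2, "total_tokens": 5},
}
usage = chunk_with_usage.get("usage")  # consumers should handle a missing key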
{xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: xinference
-Version: 0.8.
+Version: 0.8.1
 Summary: Model Serving Made Easy
 Home-page: https://github.com/xorbitsai/inference
 Author: Qin Xuye
@@ -39,6 +39,7 @@ Requires-Dist: sse-starlette >=1.6.5
 Requires-Dist: openai >1
 Requires-Dist: python-jose[cryptography]
 Requires-Dist: passlib[bcrypt]
+Requires-Dist: aioprometheus[starlette]
 Provides-Extra: all
 Requires-Dist: chatglm-cpp >=0.3.0 ; extra == 'all'
 Requires-Dist: ctransformers ; extra == 'all'
{xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/RECORD CHANGED
@@ -1,13 +1,13 @@
 xinference/__init__.py,sha256=jv7PR7ali6n5TpvGjB3hKugwB9Tq-eSTyc_xl2gFnZ0,910
-xinference/_version.py,sha256
-xinference/conftest.py,sha256=
+xinference/_version.py,sha256=6jc3tqkig6J9BPzwIKDs3G9C6KyubOgPgzkt_BI8zeg,497
+xinference/conftest.py,sha256=mGhwGBCmu4SNXx-akGw6rzYO2wfB9gTwjewC--geSSw,9315
 xinference/constants.py,sha256=JfcCKl28iyuDAnfue9FrvK34KEfg0dKyoo1-2hzQTJ4,2343
-xinference/fields.py,sha256=
+xinference/fields.py,sha256=xRpDiZXVORKoC9rG3eqwxT-BFuAojhJlxJTsAQHzJ24,5075
 xinference/isolation.py,sha256=NstVRcO3dG4umHExICXAHlzVKwH8ch8MBwKwE-KFkE0,1826
-xinference/types.py,sha256=
+xinference/types.py,sha256=dhWPyZR-YX6AGaPhPTxxem_Q0fWYN9dHQILeWFvW4yI,11704
 xinference/utils.py,sha256=Z6PPDGmX4EW8OD3OfA2Wa37ZM9OdRTnR00ITMDTu4qE,716
 xinference/api/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/api/restful_api.py,sha256=
+xinference/api/restful_api.py,sha256=gCx4tRD124-x61zNnFHLQMBotDcp8en0y34kYE7T2co,42684
 xinference/api/oauth2/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/api/oauth2/common.py,sha256=YqEUHgyUnsL3SJbTMuS8Prd7l_AI0y9HHhSEdfJ-iSI,613
 xinference/api/oauth2/core.py,sha256=1m8IDdZ9FH5SZvlurSKPKGbrucG47_04Re0PmOJRTg8,3121
@@ -17,22 +17,24 @@ xinference/client/__init__.py,sha256=Gc4HOzAy_1cic5kXlso7hahYgw89CKvZSJDicEU461k
 xinference/client/common.py,sha256=wk-3j1tJPNa60tHO8YZ5z7iBIeI6cBEqomKIatyMQx8,1554
 xinference/client/handlers.py,sha256=nFQQMRHiu_c2nsqVWMA792-QfXXq6wIW3dPR5Q7H-f0,545
 xinference/client/oscar/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/client/oscar/actor_client.py,sha256=
+xinference/client/oscar/actor_client.py,sha256=fWGbqCaJHp33CSgtznPosryTD88KWSjQLQebHkGvQCI,21545
 xinference/client/restful/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/client/restful/restful_client.py,sha256=
+xinference/client/restful/restful_client.py,sha256=jGWMPJNtI1m-rZ3GmYwDDkZqWpyOXMiaSX-PnaF13vI,37703
 xinference/core/__init__.py,sha256=Fe5tYCHDbYJ7PhxJhQ68VbfgKgOsAuslNPr4wPhFMJM,612
 xinference/core/chat_interface.py,sha256=ec9fVV-5m88PoXrfvr4C-m74L_p35nSVHTf-fupW1jc,17114
-xinference/core/
+xinference/core/metrics.py,sha256=ScmTG15Uq3h_ob72ybZSMWdnk8P4sUZFcm60f4ikSXc,2631
+xinference/core/model.py,sha256=vQ_Q65GEFb5y9RuhhqNJNN5fUy7Mou3pO_lpvAwZzD0,16258
 xinference/core/resource.py,sha256=oG44ZpXQL7uz4pbowtHGdpHarNJ52CK6KBLJRMcySpk,1516
-xinference/core/
+xinference/core/status_guard.py,sha256=YtQUU_I_9fn74e4xg2xqaLdmww6F4Jx0N96jHsfjR88,2644
+xinference/core/supervisor.py,sha256=afrGeR0yKUjJZCb4gSBWS1EAcc9DDQp0vwV_IdtZjEU,28970
 xinference/core/utils.py,sha256=oVM_bIbbI0w4n3lFIytQ7wMmlIBrxPstrUu7Neahvyw,3528
-xinference/core/worker.py,sha256=
+xinference/core/worker.py,sha256=SvHPX66YSPo1WrNSpTMMhTfNC-fmXQzLFcQrNriBjdo,23029
 xinference/deploy/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/deploy/cmdline.py,sha256=
-xinference/deploy/local.py,sha256
+xinference/deploy/cmdline.py,sha256=FcxjI24_8-0CzHZlN-lGvFxGsfN76y0lgDDzuGEpF_0,28376
+xinference/deploy/local.py,sha256=99xpzEx9yFj6xPyplzseF64ZCcVOp3oDKZRTBnJF8Io,3933
 xinference/deploy/supervisor.py,sha256=N2EnjN0_lNT92ygZ6qiip6BeodK7dazT2sWWj_wRU_4,2965
 xinference/deploy/utils.py,sha256=wR8dUZud1k7gSmd4M7l6Rq2rtrhTn5qLGmM1XB1IHr4,4941
-xinference/deploy/worker.py,sha256=
+xinference/deploy/worker.py,sha256=IzmENJE-g0bujAReC5I_iZE6epdgRwaobALKPIk7csA,2915
 xinference/deploy/test/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/deploy/test/test_cmdline.py,sha256=bEQ4-y2IG-ntOA6tP3_EP22WQelMAiJ9BQkuBmRzZAs,6736
 xinference/locale/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
@@ -40,7 +42,7 @@ xinference/locale/utils.py,sha256=w-G1DAJGw1UUQVVtq6khOZn7ZjobUmTw6qwHMm2eWIs,13
 xinference/locale/zh_CN.json,sha256=YA55G9s1p05Bt5RBoDo5SV12dd-CMJI0ABap6RpCp4M,1097
 xinference/model/__init__.py,sha256=IRC3ojiqYkVLIK_xsIxYeKypEeeTTdrovnVzK_4L4eg,663
 xinference/model/core.py,sha256=D1EJ9ZJ8VkD0I8NcHcUBA4U9cbm-rcaHkmHL-tq8INY,3041
-xinference/model/utils.py,sha256=
+xinference/model/utils.py,sha256=V2Ei0QKkQS3hW_TpfEmHsl1nAIdBrOZH7sMTTWHp7Mg,9727
 xinference/model/embedding/__init__.py,sha256=uU_fNnMbEpl6LxgzN_FC9U3DisjQwCwIvKv0gfjOFkI,2119
 xinference/model/embedding/core.py,sha256=ANZCSv6rCYpzaPcqcgmzwgSrilinXoBLAAPbzpsyh2A,16138
 xinference/model/embedding/custom.py,sha256=iE3-iWVzxarXdeTdw5e6rxv6HQRXVbPHp65wwhT2IL8,3919
@@ -51,36 +53,38 @@ xinference/model/image/core.py,sha256=ectdISnWjTvxxbj-ty9fgqNzLCBo7uX1q_iA6CbHs1
 xinference/model/image/model_spec.json,sha256=VBo3jTq93UtwD9fB1oqrpIJVaZbyYNiougZuY81pt8g,2965
 xinference/model/image/stable_diffusion/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/model/image/stable_diffusion/core.py,sha256=BdslgN_T7f2nzmxtRz0fT2S0P6tTN9zJ_UE6XO2bLLs,5152
-xinference/model/llm/__init__.py,sha256=
+xinference/model/llm/__init__.py,sha256=x_f2h_D6aUWtGbkmVSZyzvDA-DcKT3mts3EXxFjWlr4,5244
 xinference/model/llm/core.py,sha256=YDUB-MGEu9CtHxtUKFlu9gNitFO-JvLb6xhaqOYdbZM,7761
-xinference/model/llm/llm_family.json,sha256=
-xinference/model/llm/llm_family.py,sha256=
-xinference/model/llm/llm_family_modelscope.json,sha256
-xinference/model/llm/utils.py,sha256=
+xinference/model/llm/llm_family.json,sha256=b4PAz9EbD5RjIbC3CR--FaPmDuupViK_ecJeDDJTpjg,80701
+xinference/model/llm/llm_family.py,sha256=6cqVSQsilvZIITwB0kUYIrkj1h8orH1uV1rO7XSYCf0,30989
+xinference/model/llm/llm_family_modelscope.json,sha256=JkaOILdduYXaAS_knca2lO075mUwx4kaOvYN9XCR250,45570
+xinference/model/llm/utils.py,sha256=QyJLOcnibkCNvJwxZhxbAJPWB_F3T9-90OQaOBDT39I,23028
 xinference/model/llm/ggml/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/model/llm/ggml/chatglm.py,sha256=XJ6lyr4Kl6-u65ZxFtEjPbIcd0tTdudKXYoEWFoNXig,12555
 xinference/model/llm/ggml/ctransformers.py,sha256=n8dTItZe97cF79NkmsVPirqpBcrZiGAQfd2GRpz9-3I,9917
 xinference/model/llm/ggml/ctransformers_util.py,sha256=WozFJgJZlbuEDPQLhy31YmwGp-oJoUYsnd9HjuGraIE,5271
-xinference/model/llm/ggml/llamacpp.py,sha256=
+xinference/model/llm/ggml/llamacpp.py,sha256=aK4EU25ryGgCbY9HRFfAdFusg385MB_UA1OT-p8fFQs,12241
 xinference/model/llm/ggml/tools/__init__.py,sha256=6a6P2VPKE06xKxJ-dTqp4TRO2IEDWvtcVP6gHutAR0M,624
 xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py,sha256=ULvaoAKGH-L6RuRLFOtAOVitKLVPdpd5QyXrLL14gG0,17959
 xinference/model/llm/ggml/tools/gguf.py,sha256=Hv2haR-UN7NdB1N8YId32hFoEPd-JX6_aUNWRJhyJZc,30277
 xinference/model/llm/pytorch/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
 xinference/model/llm/pytorch/baichuan.py,sha256=6d9UQw5Ox1FFdPG-cO3aKPcUmio5bQ4j1X5B0eCg5TU,2703
-xinference/model/llm/pytorch/chatglm.py,sha256=
+xinference/model/llm/pytorch/chatglm.py,sha256=2YCZL86ABFOjjoVXGxBwK9V80j71gSTfHAYEUveskyY,6991
 xinference/model/llm/pytorch/compression.py,sha256=rN8z0yt2xV9ASLv1muYStyBdvZjGCQL-OYaEga4FwIQ,8081
-xinference/model/llm/pytorch/core.py,sha256=
+xinference/model/llm/pytorch/core.py,sha256=fE2Faags4qg3dwuDHAYShPYNeB-DyH6KbHmlDDdrF0A,17920
 xinference/model/llm/pytorch/falcon.py,sha256=wC3_ILUtoimtOWox2_Sr94OcEkMYoB7Mf84R4LYHMGI,4298
+xinference/model/llm/pytorch/internlm2.py,sha256=J9byaXpgPQ28zp8EEqz2hggX2d7bdmY5-e2Mp6SIeTU,5569
 xinference/model/llm/pytorch/llama_2.py,sha256=JQI9R_ZrNhTXg_MbS7el6P-ou49iMKq5vnr9QlQ_o70,3509
 xinference/model/llm/pytorch/spec_decoding_utils.py,sha256=t059oJ0kvcXMA1pKizN1HDzs0LMf5Jpb5MM7aMNKnzQ,18750
 xinference/model/llm/pytorch/spec_model.py,sha256=2rmSYaliu0nwOQzc3rnYFtBLFBayKZ9Xe2hkJUbe7mo,6810
-xinference/model/llm/pytorch/utils.py,sha256=
+xinference/model/llm/pytorch/utils.py,sha256=zKpyeF7JV0sMv4jmwiYQGCxpk6VsbXtgOmnyksHUs8M,16870
 xinference/model/llm/pytorch/vicuna.py,sha256=eJ2HVHg-HbhVHRdA5g4N9I2_cCti4K9y3Y4WjMMzAIs,2217
 xinference/model/llm/vllm/__init__.py,sha256=h_JgzSqV5lP6vQ6XX_17kE4IY4BRnvKta_7VLQAL1ms,581
-xinference/model/llm/vllm/core.py,sha256=
-xinference/model/multimodal/__init__.py,sha256=
+xinference/model/llm/vllm/core.py,sha256=GwQ9eBlEV0NHsrxERx7j4usR3qZEElFkPoAgBIL1VnU,14966
+xinference/model/multimodal/__init__.py,sha256=EjP1dK0s-N5GyePkX4vauris7CfwsoT7Sb2E8OFoysM,1687
 xinference/model/multimodal/core.py,sha256=OupAMjRRc4UjhxWapVaXpymSl7gz1Ss4YTL_lTBO3wI,15429
 xinference/model/multimodal/model_spec.json,sha256=jPDB7Yhn13-e4h2lBm2yF0xQXIgv1cI6Jpto3AjCnto,1071
+xinference/model/multimodal/model_spec_modelscope.json,sha256=yf6naWPy28mVWanW8BMWlwL8gBmzFsMYCVLg2XhzYh8,1073
 xinference/model/multimodal/qwen_vl.py,sha256=DCk_AzDiM5eX7gSRCv1Y3IBRDCwAXEMcM-1jq_HUEEE,5490
 xinference/model/rerank/__init__.py,sha256=FEFQSLCNihIgUb28EAMxutVvpDTPmaf9o7Ey97ryAyA,2135
 xinference/model/rerank/core.py,sha256=M01V1Im2SGb0x8cff8LcB6sbM1mdwcfjxQtGXuHuH6E,11110
@@ -15291,9 +15295,9 @@ xinference/web/ui/node_modules/yargs-parser/package.json,sha256=BSwbOzgetKXMK4u0
 xinference/web/ui/node_modules/yocto-queue/package.json,sha256=6U1XHQPGXJTqsiFvT953ORihUtXTblZy4fXBWP9qxC0,725
 xinference/web/ui/node_modules/yup/package.json,sha256=xRFSROB9NKxqSWHEVFvSTsPs9Ll074uo8OS1zEw0qhA,1206
 xinference/web/ui/node_modules/yup/node_modules/type-fest/package.json,sha256=JTv2zTTVgxQ2H82m1-6qEpdMv08lHjFx4Puf_MsbB_Q,1134
-xinference-0.8.
-xinference-0.8.
-xinference-0.8.
-xinference-0.8.
-xinference-0.8.
-xinference-0.8.
+xinference-0.8.1.dist-info/LICENSE,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+xinference-0.8.1.dist-info/METADATA,sha256=ArHMcySJgMVpoXrhiu5nfdcLvV0uscZcC7kq7c_Ky9c,13420
+xinference-0.8.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+xinference-0.8.1.dist-info/entry_points.txt,sha256=-lDyyzqWMFQF0Rgm7VxBNz0V-bMBMQLRR3pvQ-Y8XTY,226
+xinference-0.8.1.dist-info/top_level.txt,sha256=L1rQt7pl6m8tmKXpWVHzP-GtmzAxp663rXxGE7qnK00,11
+xinference-0.8.1.dist-info/RECORD,,
{xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/LICENSE: file without changes
{xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/WHEEL: file without changes
{xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/entry_points.txt: file without changes
{xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/top_level.txt: file without changes