xinference 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xinference has been flagged as potentially problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +35 -1
- xinference/client/oscar/actor_client.py +2 -2
- xinference/client/restful/restful_client.py +2 -2
- xinference/conftest.py +5 -1
- xinference/core/metrics.py +83 -0
- xinference/core/model.py +148 -8
- xinference/core/status_guard.py +86 -0
- xinference/core/supervisor.py +57 -7
- xinference/core/worker.py +132 -13
- xinference/deploy/cmdline.py +57 -4
- xinference/deploy/local.py +32 -6
- xinference/deploy/worker.py +33 -5
- xinference/fields.py +4 -1
- xinference/model/llm/__init__.py +7 -0
- xinference/model/llm/ggml/llamacpp.py +3 -2
- xinference/model/llm/llm_family.json +70 -3
- xinference/model/llm/llm_family.py +11 -1
- xinference/model/llm/llm_family_modelscope.json +72 -3
- xinference/model/llm/pytorch/chatglm.py +70 -28
- xinference/model/llm/pytorch/core.py +11 -30
- xinference/model/llm/pytorch/internlm2.py +155 -0
- xinference/model/llm/pytorch/utils.py +0 -153
- xinference/model/llm/utils.py +37 -8
- xinference/model/llm/vllm/core.py +15 -3
- xinference/model/multimodal/__init__.py +15 -8
- xinference/model/multimodal/model_spec_modelscope.json +45 -0
- xinference/model/utils.py +7 -2
- xinference/types.py +2 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/METADATA +2 -1
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/RECORD +35 -31
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/LICENSE +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/WHEEL +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/entry_points.txt +0 -0
- {xinference-0.8.0.dist-info → xinference-0.8.1.dist-info}/top_level.txt +0 -0

xinference/model/llm/llm_family.json:

@@ -535,7 +535,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
@@ -609,6 +610,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop":[
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },
@@ -1139,14 +1149,15 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 32768,
     "model_name": "qwen-chat",
     "model_lang": [
       "en",
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.",
     "model_specs": [
@@ -1172,6 +1183,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": "1_8",
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-1_8B-Chat",
@@ -1181,6 +1194,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 7,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-7B-Chat",
@@ -1190,6 +1205,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 14,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-14B-Chat",
@@ -1199,6 +1216,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 72,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "Qwen/Qwen-72B-Chat",
@@ -3144,5 +3163,53 @@
         "model_revision": "70d1740208c8ba39f9ba250b22117ec25311ab33"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "internlm2-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The second generation of the InternLM model, InternLM2.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2-chat-7b",
+        "model_revision": "5797f79825bab7013932d57e2babaac1b8de6b4f"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 20,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "internlm/internlm2-chat-20b",
+        "model_revision": "3ccaf3ae82d5d01c0a95eecf40ee550f9c543635"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "[UNUSED_TOKEN_146]user",
+        "[UNUSED_TOKEN_146]assistant"
+      ],
+      "intra_message_sep": "[UNUSED_TOKEN_145]",
+      "stop_token_ids": [
+        92542
+      ],
+      "stop": [
+        "[UNUSED_TOKEN_145]"
+      ]
+    }
   }
 ]
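
The hunks above register internlm2-chat as a new built-in family, add 4-bit/8-bit quantization options and an explicit 32768 context length for qwen-chat, and declare the new "tools" ability plus stop tokens for chatglm3. For orientation, here is a minimal sketch of launching one of the newly added models through the Python client; the endpoint URL and the launch_model/get_model/chat call pattern are assumptions based on the 0.8-series RESTful client, not something introduced by this diff.

# Sketch only: assumes a local supervisor at the default endpoint and the
# 0.8-series RESTful client API (launch_model / get_model / chat).
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="internlm2-chat",   # built-in family added by this release
    model_format="pytorch",
    model_size_in_billions=7,
    quantization="none",
)
model = client.get_model(model_uid)
print(model.chat("Briefly introduce yourself."))
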

xinference/model/llm/llm_family.py:

@@ -43,6 +43,7 @@ DEFAULT_CONTEXT_LENGTH = 2048
 BUILTIN_LLM_PROMPT_STYLE: Dict[str, "PromptStyleV1"] = {}
 BUILTIN_LLM_MODEL_CHAT_FAMILIES: Set[str] = set()
 BUILTIN_LLM_MODEL_GENERATE_FAMILIES: Set[str] = set()
+BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES: Set[str] = set()
 
 
 class GgmlLLMSpecV1(BaseModel):
@@ -105,7 +106,7 @@ class LLMFamilyV1(BaseModel):
     context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH
     model_name: str
     model_lang: List[str]
-    model_ability: List[Literal["embed", "generate", "chat"]]
+    model_ability: List[Literal["embed", "generate", "chat", "tools"]]
     model_description: Optional[str]
     # reason for not required str here: legacy registration
     model_family: Optional[str]
@@ -155,6 +156,15 @@ class CustomLLMFamilyV1(LLMFamilyV1):
                 f"`model_family` for chat model must be `other` or one of the following values: \n"
                 f"{', '.join(list(BUILTIN_LLM_MODEL_CHAT_FAMILIES))}"
             )
+        if (
+            llm_spec.model_family != "other"
+            and "tool_call" in llm_spec.model_ability
+            and llm_spec.model_family not in BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES
+        ):
+            raise ValueError(
+                f"`model_family` for tool call model must be `other` or one of the following values: \n"
+                f"{', '.join(list(BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES))}"
+            )
         if (
             llm_spec.model_family != "other"
             and "chat" not in llm_spec.model_ability
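
The new "tools" literal and the BUILTIN_LLM_MODEL_TOOL_CALL_FAMILIES check above constrain custom registrations that declare tool-call support. A custom family that opts in would look roughly like the sketch below, mirroring the built-in JSON entries; the name, context length, size, and model_uri are illustrative placeholders, not values from this release.

# Illustrative custom-family payload; placeholder values throughout.
custom_family = {
    "version": 1,
    "context_length": 8192,
    "model_name": "my-chatglm3-finetune",
    "model_lang": ["en", "zh"],
    "model_ability": ["chat", "tools"],  # "tools" is newly accepted by the Literal above
    "model_family": "chatglm3",          # must be "other" or a built-in family per the validator
    "model_specs": [
        {
            "model_format": "pytorch",
            "model_size_in_billions": 6,
            "quantizations": ["none"],
            "model_uri": "file:///path/to/model",
        }
    ],
    # a prompt_style block would normally accompany a chat-capable custom family
}
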

xinference/model/llm/llm_family_modelscope.json:

@@ -297,7 +297,8 @@
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
     ],
     "model_description": "ChatGLM3 is the third generation of ChatGLM, still open-source and trained on Chinese and English data.",
     "model_specs": [
@@ -375,6 +376,15 @@
       "roles": [
         "user",
         "assistant"
+      ],
+      "stop_token_ids": [
+        64795,
+        64797,
+        2
+      ],
+      "stop":[
+        "<|user|>",
+        "<|observation|>"
       ]
     }
   },
@@ -1461,14 +1471,15 @@
   },
   {
     "version": 1,
-    "context_length":
+    "context_length": 32768,
     "model_name": "qwen-chat",
     "model_lang": [
       "en",
       "zh"
     ],
     "model_ability": [
-      "chat"
+      "chat",
+      "tools"
    ],
     "model_description": "Qwen-chat is a fine-tuned version of the Qwen LLM trained with alignment techniques, specializing in chatting.",
     "model_specs": [
@@ -1498,6 +1509,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": "1_8",
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
@@ -1508,6 +1521,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 7,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
@@ -1518,6 +1533,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 72,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_hub": "modelscope",
@@ -1528,6 +1545,8 @@
         "model_format": "pytorch",
         "model_size_in_billions": 14,
         "quantizations": [
+          "4-bit",
+          "8-bit",
           "none"
         ],
         "model_id": "qwen/Qwen-14B-Chat",
@@ -1759,5 +1778,55 @@
         "model_revision": "master"
       }
     ]
+  },
+  {
+    "version": 1,
+    "context_length": 204800,
+    "model_name": "internlm2-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The second generation of the InternLM model, InternLM2.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2-chat-7b",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      },
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 20,
+        "quantizations": [
+          "none"
+        ],
+        "model_id": "Shanghai_AI_Laboratory/internlm2-chat-20b",
+        "model_hub": "modelscope",
+        "model_revision": "master"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "INTERNLM2",
+      "system_prompt": "You are InternLM (书生·浦语), a helpful, honest, and harmless AI assistant developed by Shanghai AI Laboratory (上海人工智能实验室).",
+      "roles": [
+        "[UNUSED_TOKEN_146]user",
+        "[UNUSED_TOKEN_146]assistant"
+      ],
+      "intra_message_sep": "[UNUSED_TOKEN_145]",
+      "stop_token_ids": [
+        92542
+      ],
+      "stop": [
+        "[UNUSED_TOKEN_145]"
+      ]
+    }
   }
 ]

xinference/model/llm/pytorch/chatglm.py:

@@ -11,13 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import time
+import uuid
 from typing import Any, Dict, Iterator, List, Optional, Union
 
 from ....types import (
     SPECIAL_TOOL_PROMPT,
     ChatCompletion,
+    ChatCompletionChoice,
     ChatCompletionChunk,
     ChatCompletionMessage,
+    CompletionChoice,
+    CompletionChunk,
+    CompletionUsage,
     PytorchGenerateConfig,
 )
 from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -106,38 +112,74 @@ class ChatglmPytorchChatModel(PytorchChatModel):
         generate_config: Optional[PytorchGenerateConfig] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         tools = self._handle_tools(generate_config)
+        kwargs: Dict[str, Any] = {}
+        generate_config = generate_config or {}
+        temperature = generate_config.get("temperature")
+        if temperature is not None:
+            kwargs["temperature"] = float(temperature)
+        top_p = generate_config.get("top_p")
+        if top_p is not None:
+            kwargs["top_p"] = float(top_p)
+        max_length = generate_config.get("max_tokens")
+        if max_length is not None:
+            kwargs["max_length"] = int(max_length)
+        # Tool calls only works for non stream, so we call chat directly.
+        if prompt == SPECIAL_TOOL_PROMPT and chat_history:
+            tool_message = chat_history.pop()
+            content = tool_message.get("content")
+            assert content is not None
+            prompt = content
+            kwargs["role"] = "observation"
+            chat_history = [h for h in chat_history if not h.get("tool_calls")]
+            if not chat_history:
+                chat_history = []
         if tools:
-            # Tool calls only works for non stream, so we call chat directly.
-            kwargs: Dict[str, Any] = {}
-            generate_config = generate_config or {}
-            temperature = generate_config.get("temperature")
-            if temperature is not None:
-                kwargs["temperature"] = float(temperature)
-            top_p = generate_config.get("top_p")
-            if top_p is not None:
-                kwargs["top_p"] = float(top_p)
-            max_length = generate_config.get("max_tokens")
-            if max_length is not None:
-                kwargs["max_length"] = int(max_length)
-            if prompt == SPECIAL_TOOL_PROMPT and chat_history:
-                tool_message = chat_history.pop()
-                content = tool_message.get("content")
-                assert content is not None
-                prompt = content
-                kwargs["role"] = "observation"
-                chat_history = [h for h in chat_history if not h.get("tool_calls")]
-                if not chat_history:
-                    chat_history = []
             msg = self._model.chat(
                 self._tokenizer, prompt, [tools] + chat_history, **kwargs
             )
             return self._tool_calls_completion(
-                self.model_family
+                self.model_family, self.model_uid, msg, tools
             )
         else:
-
-
-
-
-
-
+            stream = generate_config.get("stream", False)
+            if stream:
+
+                def _stream_generator():
+                    last_chunk_text_length = 0
+                    for chunk_text, _ in self._model.stream_chat(
+                        self._tokenizer, prompt, chat_history, **kwargs
+                    ):
+                        chunk_text = chunk_text[last_chunk_text_length:]
+                        last_chunk_text_length += len(chunk_text)
+                        completion_choice = CompletionChoice(
+                            text=chunk_text, index=0, logprobs=None, finish_reason=None
+                        )
+                        yield CompletionChunk(
+                            id=str(uuid.uuid1()),
+                            object="text_completion",
+                            created=int(time.time()),
+                            model=self.model_uid,
+                            choices=[completion_choice],
+                        )
+
+                return self._to_chat_completion_chunks(_stream_generator())
+            else:
+                response, _ = self._model.chat(
+                    self._tokenizer, prompt, chat_history, **kwargs
+                )
+                return ChatCompletion(
+                    id="chat" + str(uuid.uuid1()),
+                    object="chat.completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[
+                        ChatCompletionChoice(
+                            index=0,
+                            message={"role": "assistant", "content": response},
+                            finish_reason="stop",
+                        )
+                    ],
+                    usage=CompletionUsage(
+                        prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                    ),
+                )
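
The streaming branch added above converts ChatGLM's stream_chat output, which yields the full response accumulated so far, into incremental chunks by tracking how much text has already been emitted. The same delta logic in isolation (a standalone sketch, not code from the release):

# Standalone sketch of the delta computation used in the streaming branch above.
def iter_deltas(cumulative_texts):
    emitted = 0
    for text in cumulative_texts:
        delta = text[emitted:]    # only the part not yielded yet
        emitted += len(delta)
        yield delta

assert list(iter_deltas(["He", "Hello", "Hello!"])) == ["He", "llo", "!"]
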

xinference/model/llm/pytorch/core.py:

@@ -192,7 +192,8 @@ class PytorchModel(LLM):
     ) -> bool:
         if llm_spec.model_format not in ["pytorch", "gptq"]:
             return False
-
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family in [
             "baichuan-chat",
             "vicuna-v1.3",
             "falcon",
@@ -211,11 +212,7 @@
     def generate(
         self, prompt: str, generate_config: Optional[PytorchGenerateConfig] = None
     ) -> Union[Completion, Iterator[CompletionChunk]]:
-        from .utils import
-            generate_stream,
-            generate_stream_chatglm,
-            generate_stream_falcon,
-        )
+        from .utils import generate_stream, generate_stream_falcon
 
         model_family_name = self.model_family.model_name.lower()
 
@@ -223,17 +220,7 @@
             prompt: str, generate_config: PytorchGenerateConfig
         ) -> Iterator[CompletionChunk]:
             if "falcon" in model_family_name:
-                for completion_chunk,
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    yield completion_chunk
-            elif "chatglm" in model_family_name:
-                for completion_chunk, _ in generate_stream_chatglm(
+                for completion_chunk, completion_usage in generate_stream_falcon(
                     self.model_uid,
                     self._model,
                     self._tokenizer,
@@ -241,9 +228,10 @@
                     self._device,
                     generate_config,
                 ):
+                    completion_chunk["usage"] = completion_usage
                     yield completion_chunk
             else:
-                for completion_chunk,
+                for completion_chunk, completion_usage in generate_stream(
                     self.model_uid,
                     self._model,
                     self._tokenizer,
@@ -251,6 +239,7 @@
                     self._device,
                     generate_config,
                 ):
+                    completion_chunk["usage"] = completion_usage
                     yield completion_chunk
 
         logger.debug(
@@ -274,16 +263,6 @@
                 generate_config,
             ):
                 pass
-            elif "chatglm" in model_family_name:
-                for completion_chunk, completion_usage in generate_stream_chatglm(
-                    self.model_uid,
-                    self._model,
-                    self._tokenizer,
-                    prompt,
-                    self._device,
-                    generate_config,
-                ):
-                    pass
             else:
                 for completion_chunk, completion_usage in generate_stream(
                     self.model_uid,
@@ -442,6 +421,7 @@ class PytorchChatModel(PytorchModel, ChatModelMixin):
             "chatglm2-32k",
             "llama-2",
             "llama-2-chat",
+            "internlm2-chat",
         ]:
             return False
         if "chat" not in llm_family.model_ability:
@@ -465,7 +445,8 @@
 
         generate_config = self._sanitize_generate_config(generate_config)
         # TODO(codingl2k1): qwen hacky to set stop for function call.
-
+        model_family = self.model_family.model_family or self.model_family.model_name
+        if tools and "qwen-chat" == model_family:
             stop = generate_config.get("stop")
             if isinstance(stop, str):
                 generate_config["stop"] = [stop, "Observation:"]
@@ -485,6 +466,6 @@
         assert not isinstance(c, Iterator)
         if tools:
             return self._tool_calls_completion(
-                self.model_family
+                self.model_family, self.model_uid, c, tools
             )
         return self._to_chat_completion(c)
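
Two recurring patterns in the core.py hunks: streamed chunks now carry their CompletionUsage under the "usage" key, and family matching resolves `llm_family.model_family or llm_family.model_name`, so a custom registration that points model_family at a built-in name is matched like that built-in. A tiny sketch of the fallback (class and function names here are illustrative, not the project's own):

# Minimal sketch of the family-resolution fallback used by match()/chat() above.
from dataclasses import dataclass
from typing import Optional


@dataclass
class FamilyInfo:
    model_name: str
    model_family: Optional[str] = None


def resolve_family(info: FamilyInfo) -> str:
    # custom models set model_family to a built-in name; built-ins leave it unset
    return info.model_family or info.model_name


assert resolve_family(FamilyInfo("qwen-chat")) == "qwen-chat"
assert resolve_family(FamilyInfo("my-qwen-finetune", "qwen-chat")) == "qwen-chat"
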

xinference/model/llm/pytorch/internlm2.py (new file):

@@ -0,0 +1,155 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import uuid
+from typing import Any, Dict, Iterator, List, Optional, Union
+
+from ....types import (
+    ChatCompletion,
+    ChatCompletionChoice,
+    ChatCompletionChunk,
+    ChatCompletionMessage,
+    CompletionChoice,
+    CompletionChunk,
+    CompletionUsage,
+    PytorchGenerateConfig,
+)
+from ..llm_family import LLMFamilyV1, LLMSpecV1
+from .core import PytorchChatModel, PytorchModelConfig
+
+
+class Internlm2PytorchChatModel(PytorchChatModel):
+    def __init__(
+        self,
+        model_uid: str,
+        model_family: "LLMFamilyV1",
+        model_spec: "LLMSpecV1",
+        quantization: str,
+        model_path: str,
+        pytorch_model_config: Optional[PytorchModelConfig] = None,
+    ):
+        super().__init__(
+            model_uid,
+            model_family,
+            model_spec,
+            quantization,
+            model_path,
+            pytorch_model_config=pytorch_model_config,
+        )
+
+    def _load_model(self, **kwargs):
+        try:
+            from transformers import AutoModel, AutoTokenizer
+        except ImportError:
+            error_message = "Failed to import module 'transformers'"
+            installation_guide = [
+                "Please make sure 'transformers' is installed. ",
+                "You can install it by `pip install transformers`\n",
+            ]
+
+            raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path,
+            trust_remote_code=kwargs["trust_remote_code"],
+            encode_special_tokens=True,
+            revision=kwargs["revision"],
+        )
+        model = AutoModel.from_pretrained(
+            self.model_path,
+            **kwargs,
+        )
+        return model, tokenizer
+
+    @classmethod
+    def match(
+        cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        if llm_spec.model_format != "pytorch":
+            return False
+        model_family = llm_family.model_family or llm_family.model_name
+        if model_family != "internlm2-chat":
+            return False
+        if "chat" not in llm_family.model_ability:
+            return False
+        return True
+
+    def chat(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[ChatCompletionMessage]] = None,
+        generate_config: Optional[PytorchGenerateConfig] = None,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        kwargs: Dict[str, Any] = {}
+        generate_config = generate_config or {}
+        temperature = generate_config.get("temperature")
+        if temperature is not None:
+            kwargs["temperature"] = float(temperature)
+        top_p = generate_config.get("top_p")
+        if top_p is not None:
+            kwargs["top_p"] = float(top_p)
+        max_new_tokens = generate_config.get("max_tokens")
+        if max_new_tokens is not None:
+            kwargs["max_length"] = int(max_new_tokens)
+
+        stream = generate_config.get("stream", False)
+        if chat_history:
+            input_history = [
+                (chat_history[i]["content"], (chat_history[i + 1]["content"]))
+                for i in range(0, len(chat_history), 2)
+            ]
+        else:
+            input_history = []
+        if stream:
+
+            def _stream_generator():
+                last_chunk_text_length = 0
+                for chunk_text, _ in self._model.stream_chat(
+                    self._tokenizer, prompt, input_history, **kwargs
+                ):
+                    chunk_text = chunk_text[last_chunk_text_length:]
+                    last_chunk_text_length += len(chunk_text)
+                    completion_choice = CompletionChoice(
+                        text=chunk_text, index=0, logprobs=None, finish_reason=None
+                    )
+                    yield CompletionChunk(
+                        id=str(uuid.uuid1()),
+                        object="text_completion",
+                        created=int(time.time()),
+                        model=self.model_uid,
+                        choices=[completion_choice],
+                    )
+
+            return self._to_chat_completion_chunks(_stream_generator())
+        else:
+            response, _ = self._model.chat(
+                self._tokenizer, prompt, input_history, **kwargs
+            )
+            return ChatCompletion(
+                id="chat" + str(uuid.uuid1()),
+                object="chat.completion",
+                created=int(time.time()),
+                model=self.model_uid,
+                choices=[
+                    ChatCompletionChoice(
+                        index=0,
+                        message={"role": "assistant", "content": response},
+                        finish_reason="stop",
+                    )
+                ],
+                usage=CompletionUsage(
+                    prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
+                ),
+            )
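
For clarity, the chat() method of the new Internlm2PytorchChatModel pairs consecutive OpenAI-style history messages into the (user, assistant) tuples that InternLM2's chat/stream_chat interface expects. A standalone sketch of that pairing with illustrative messages:

# Standalone sketch of the history pairing done in Internlm2PytorchChatModel.chat.
chat_history = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello! How can I help?"},
    {"role": "user", "content": "Tell me about InternLM2."},
    {"role": "assistant", "content": "It is the second-generation InternLM model."},
]
input_history = [
    (chat_history[i]["content"], chat_history[i + 1]["content"])
    for i in range(0, len(chat_history), 2)
]
assert input_history == [
    ("Hi", "Hello! How can I help?"),
    ("Tell me about InternLM2.", "It is the second-generation InternLM model."),
]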