xinference 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +30 -0
- xinference/client/restful/restful_client.py +29 -0
- xinference/core/cache_tracker.py +12 -1
- xinference/core/chat_interface.py +10 -4
- xinference/core/model.py +2 -2
- xinference/core/supervisor.py +30 -2
- xinference/core/utils.py +12 -0
- xinference/core/worker.py +4 -1
- xinference/deploy/cmdline.py +126 -0
- xinference/deploy/test/test_cmdline.py +24 -0
- xinference/fields.py +3 -1
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/ggml/chatglm.py +98 -13
- xinference/model/llm/ggml/llamacpp.py +49 -2
- xinference/model/llm/llm_family.json +633 -9
- xinference/model/llm/llm_family.py +84 -10
- xinference/model/llm/llm_family_modelscope.json +337 -10
- xinference/model/llm/memory.py +332 -0
- xinference/model/llm/pytorch/chatglm.py +48 -0
- xinference/model/llm/pytorch/core.py +25 -6
- xinference/model/llm/pytorch/deepseek_vl.py +35 -9
- xinference/model/llm/pytorch/intern_vl.py +387 -0
- xinference/model/llm/pytorch/internlm2.py +32 -1
- xinference/model/llm/pytorch/qwen_vl.py +38 -11
- xinference/model/llm/pytorch/utils.py +38 -1
- xinference/model/llm/pytorch/yi_vl.py +42 -14
- xinference/model/llm/sglang/core.py +31 -9
- xinference/model/llm/utils.py +38 -5
- xinference/model/llm/vllm/core.py +87 -5
- xinference/model/rerank/core.py +23 -1
- xinference/model/utils.py +17 -7
- xinference/thirdparty/deepseek_vl/models/processing_vlm.py +1 -1
- xinference/thirdparty/deepseek_vl/models/siglip_vit.py +2 -2
- xinference/thirdparty/llava/mm_utils.py +3 -2
- xinference/thirdparty/llava/model/llava_arch.py +1 -1
- xinference/thirdparty/omnilmm/chat.py +6 -5
- xinference/types.py +10 -1
- xinference/web/ui/build/asset-manifest.json +3 -3
- xinference/web/ui/build/index.html +1 -1
- xinference/web/ui/build/static/js/{main.8e44da4b.js → main.551aa479.js} +3 -3
- xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
- xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
- {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/METADATA +10 -8
- {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/RECORD +52 -50
- xinference/web/ui/build/static/js/main.8e44da4b.js.map +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
- xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +0 -1
- /xinference/web/ui/build/static/js/{main.8e44da4b.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/LICENSE +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/WHEEL +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.0.dist-info → xinference-0.11.2.dist-info}/top_level.txt +0 -0
xinference/model/llm/ggml/chatglm.py

@@ -108,10 +108,11 @@ class ChatglmCppChatModel(LLM):

     @staticmethod
     def _convert_raw_text_chunks_to_chat(
-        tokens: Iterator[Any], model_name: str
+        tokens: Iterator[Any], model_name: str, include_usage: bool, input_ids
     ) -> Iterator[ChatCompletionChunk]:
+        request_id = str(uuid.uuid4())
         yield {
-            "id": "chat" + f"cmpl-{
+            "id": "chat" + f"cmpl-{request_id}",
             "model": model_name,
             "object": "chat.completion.chunk",
             "created": int(time.time()),
@@ -125,9 +126,13 @@ class ChatglmCppChatModel(LLM):
                 }
             ],
         }
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
         for token in tokens:
+            prompt_tokens = len(input_ids)
+            completion_tokens = completion_tokens + 1
+            total_tokens = prompt_tokens + completion_tokens
             yield {
-                "id": "chat" + f"cmpl-{
+                "id": "chat" + f"cmpl-{request_id}",
                 "model": model_name,
                 "object": "chat.completion.chunk",
                 "created": int(time.time()),
@@ -143,6 +148,35 @@ class ChatglmCppChatModel(LLM):
                     }
                 ],
             }
+        # stop
+        yield {
+            "id": "chat" + f"cmpl-{request_id}",
+            "model": model_name,
+            "object": "chat.completion.chunk",
+            "created": int(time.time()),
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {
+                        "content": "",
+                    },
+                    "finish_reason": "stop",
+                }
+            ],
+        }
+        if include_usage:
+            yield {
+                "id": "chat" + f"cmpl-{request_id}",
+                "model": model_name,
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "choices": [],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                },
+            }

     @classmethod
     def _convert_raw_text_completion_to_chat(
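Note (added for clarity, not part of the diff): with the trailing "stop" and usage-only chunks above, a streaming consumer can no longer assume every chunk carries a populated `choices` list. A minimal sketch of a consumer for the chunk shapes shown, where the `chunks` iterable stands in for whatever `_convert_raw_text_chunks_to_chat` yields:

# Illustrative sketch only (not from the release): consuming the chunk
# stream produced above. The final chunk has an empty "choices" list and
# only a "usage" payload, so guard before indexing choices[0].
from typing import Any, Dict, Iterator, Optional, Tuple


def collect_stream(
    chunks: Iterator[Dict[str, Any]]
) -> Tuple[str, Optional[Dict[str, int]]]:
    pieces = []
    usage = None
    for chunk in chunks:
        if chunk["choices"]:
            delta = chunk["choices"][0].get("delta", {})
            pieces.append(delta.get("content", "") or "")
        else:
            # Usage-only chunk, emitted when include_usage is True.
            usage = chunk.get("usage")
    return "".join(pieces), usage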
@@ -273,7 +307,7 @@ class ChatglmCppChatModel(LLM):

         params = {
             "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
+            "max_context_length": generate_config.get("max_tokens", 1024),
             "top_k": generate_config.get("top_k"),
             "top_p": generate_config.get("top_p"),
             "temperature": generate_config.get("temperature"),
@@ -286,13 +320,27 @@ class ChatglmCppChatModel(LLM):
         assert self._llm is not None
         chat_history_messages = self._to_chatglm_chat_messages(chat_history_list)

-
+        stream = generate_config.get("stream")
+        stream_options = generate_config.get("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+
+        if stream:
             it = self._llm.chat(
                 chat_history_messages,
                 **params,
             )
             assert not isinstance(it, str)
-
+            input_ids = self._llm.tokenizer.encode_messages(
+                chat_history_messages, params["max_context_length"]
+            )
+            return self._convert_raw_text_chunks_to_chat(
+                it, self.model_uid, include_usage, input_ids
+            )
+
         else:
             c = self._llm.chat(
                 chat_history_messages,
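For context (not shown in this hunk): the `stream_options` value read here is expected to arrive with the request; the `restful_api.py` and `restful_client.py` changes listed in this release add that plumbing. A hypothetical client-side call through Xinference's OpenAI-compatible endpoint, with the base URL and model uid as placeholders and assuming an openai client recent enough to accept `stream_options`:

# Hypothetical usage sketch (not from the diff): requesting the trailing
# usage-only chunk via the OpenAI-compatible /v1 endpoint.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-used")
stream = client.chat.completions.create(
    model="chatglm3",  # placeholder model uid
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    stream_options={"include_usage": True},  # ask for the final usage-only chunk
)
for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")
    elif chunk.usage:
        print("\nusage:", chunk.usage)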
@@ -320,11 +368,13 @@ class ChatglmCppChatModel(LLM):

     @staticmethod
     def _convert_str_to_completion_chunk(
-        tokens: Iterator[str], model_name: str
+        tokens: Iterator[str], model_name: str, include_usage: bool, input_ids
     ) -> Iterator[CompletionChunk]:
-
+        request_id = str(uuid.uuid4())
+        prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+        for i, token in enumerate(tokens):
             yield {
-                "id": "generate" + f"-{
+                "id": "generate" + f"-{request_id}",
                 "model": model_name,
                 "object": "text_completion",
                 "created": int(time.time()),
@@ -332,6 +382,32 @@ class ChatglmCppChatModel(LLM):
                     {"index": 0, "text": token, "finish_reason": None, "logprobs": None}
                 ],
             }
+            prompt_tokens = len(input_ids)
+            completion_tokens = i
+            total_tokens = prompt_tokens + completion_tokens
+        # stop
+        yield {
+            "id": "chat" + f"cmpl-{request_id}",
+            "model": model_name,
+            "object": "text_completion",
+            "created": int(time.time()),
+            "choices": [
+                {"index": 0, "text": "", "finish_reason": "stop", "logprobs": None}
+            ],
+        }
+        if include_usage:
+            yield {
+                "id": "chat" + f"cmpl-{request_id}",
+                "model": model_name,
+                "object": "text_completion",
+                "created": int(time.time()),
+                "choices": [],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": total_tokens,
+                },
+            }

     def generate(
         self,
@@ -344,7 +420,7 @@ class ChatglmCppChatModel(LLM):

         params = {
             "max_length": generate_config.get("max_tokens"),
-            "max_context_length": generate_config.get("max_tokens"),
+            "max_context_length": generate_config.get("max_tokens", 1024),
             "top_k": generate_config.get("top_k"),
             "top_p": generate_config.get("top_p"),
             "temperature": generate_config.get("temperature"),
@@ -355,14 +431,23 @@ class ChatglmCppChatModel(LLM):
         params = {k: v for k, v in params.items() if v is not None}

         assert self._llm is not None
-
-
+        stream = generate_config.get("stream")
+        stream_options = generate_config.get("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )
+        if stream:
             it = self._llm.generate(
                 prompt,
                 **params,
             )
             assert not isinstance(it, str)
-
+            input_ids = self._llm.tokenizer.encode(prompt, params["max_context_length"])
+            return self._convert_str_to_completion_chunk(
+                it, self.model_uid, include_usage, input_ids
+            )
         else:
             c = self._llm.generate(
                 prompt,
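The `generate` path mirrors the `chat` changes. A hypothetical sketch of the same request through Xinference's own RESTful client; the endpoint and model uid are placeholders, a model exposing the generate ability is assumed, and passing `stream_options` inside `generate_config` is assumed based on the client changes listed in this release:

# Hypothetical sketch (not from the diff): streaming completion with
# usage reporting via xinference's RESTful client.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")
model = client.get_model("my-model-uid")  # placeholder uid
for chunk in model.generate(
    "Write a haiku about streams.",
    generate_config={
        "stream": True,
        "max_tokens": 128,
        "stream_options": {"include_usage": True},
    },
):
    if chunk["choices"]:
        print(chunk["choices"][0]["text"], end="")
    elif "usage" in chunk:
        # Trailing usage-only chunk.
        print("\n", chunk["usage"])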
xinference/model/llm/ggml/llamacpp.py

@@ -14,6 +14,7 @@
 import datetime
 import logging
 import os
+import time
 from typing import Iterable, Iterator, List, Optional, Union

 from ....types import (
@@ -22,6 +23,7 @@ from ....types import (
     ChatCompletionMessage,
     Completion,
     CompletionChunk,
+    CompletionUsage,
     CreateCompletionLlamaCpp,
     Embedding,
     LlamaCppGenerateConfig,
@@ -100,6 +102,8 @@ class LlamaCppModel(LLM):
         generate_config = LlamaCppGenerateConfig(
             **CreateCompletionLlamaCpp(**generate_config).dict()
         )
+        # Currently, llama.cpp does not support lora
+        generate_config.pop("lora_name", None)  # type: ignore
         return generate_config

     def _convert_ggml_to_gguf(self, model_path: str) -> str:
@@ -195,16 +199,59 @@ class LlamaCppModel(LLM):
             _generate_config: LlamaCppGenerateConfig,
         ) -> Iterator[CompletionChunk]:
             assert self._llm is not None
-
+            prompt_token_ids: List[int] = (
+                (
+                    self._llm.tokenize(prompt.encode("utf-8"), special=True)
+                    if prompt != ""
+                    else [self._llm.token_bos()]
+                )
+                if isinstance(prompt, str)
+                else prompt
+            )
+            prompt_tokens = len(prompt_token_ids)
+            completion_tokens, total_tokens = 0, 0
+            request_id = 0
+            for index, _completion_chunk in enumerate(
+                self._llm(prompt=_prompt, **_generate_config)
+            ):
+                request_id = _completion_chunk["id"]
+                choice = _completion_chunk["choices"][0]
+                if choice["finish_reason"] is not None:
+                    completion_tokens = index
+                total_tokens = prompt_tokens + completion_tokens
+                _completion_chunk["usage"] = CompletionUsage(
+                    prompt_tokens=total_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
                 yield _completion_chunk
+            if include_usage:
+                chunk = CompletionChunk(
+                    id=request_id,
+                    object="text_completion",
+                    created=int(time.time()),
+                    model=self.model_uid,
+                    choices=[],
+                )
+                chunk["usage"] = CompletionUsage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
+                )
+                yield chunk

         logger.debug(
             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
         )

         generate_config = self._sanitize_generate_config(generate_config)
-
         stream = generate_config.get("stream", False)
+        stream_options = generate_config.pop("stream_options", None)
+        include_usage = (
+            stream_options["include_usage"]
+            if isinstance(stream_options, dict)
+            else False
+        )

         if not stream:
             assert self._llm is not None