xinference 0.10.3__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This version of xinference has been flagged as a potentially problematic release.

Files changed (89)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/event.py +1 -1
  8. xinference/core/model.py +15 -4
  9. xinference/core/status_guard.py +1 -1
  10. xinference/core/supervisor.py +58 -72
  11. xinference/core/worker.py +68 -101
  12. xinference/deploy/cmdline.py +166 -1
  13. xinference/deploy/test/test_cmdline.py +2 -0
  14. xinference/deploy/utils.py +1 -1
  15. xinference/device_utils.py +29 -3
  16. xinference/fields.py +5 -1
  17. xinference/model/audio/whisper.py +88 -12
  18. xinference/model/core.py +2 -2
  19. xinference/model/image/__init__.py +29 -0
  20. xinference/model/image/core.py +6 -0
  21. xinference/model/image/custom.py +109 -0
  22. xinference/model/llm/__init__.py +92 -32
  23. xinference/model/llm/core.py +57 -102
  24. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  25. xinference/model/llm/llm_family.json +306 -4
  26. xinference/model/llm/llm_family.py +45 -41
  27. xinference/model/llm/llm_family_modelscope.json +119 -2
  28. xinference/model/llm/pytorch/deepseek_vl.py +89 -33
  29. xinference/model/llm/pytorch/qwen_vl.py +67 -12
  30. xinference/model/llm/pytorch/yi_vl.py +62 -45
  31. xinference/model/llm/utils.py +29 -15
  32. xinference/model/llm/vllm/core.py +19 -4
  33. xinference/thirdparty/omnilmm/chat.py +2 -1
  34. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  35. xinference/types.py +2 -0
  36. xinference/web/ui/build/asset-manifest.json +6 -3
  37. xinference/web/ui/build/index.html +1 -1
  38. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  39. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  40. xinference/web/ui/build/static/js/main.8e44da4b.js +3 -0
  41. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.8e44da4b.js.LICENSE.txt} +7 -0
  42. xinference/web/ui/build/static/js/main.8e44da4b.js.map +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  44. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  45. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  46. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  47. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  48. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  49. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  50. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/ddaec68b88e5eff792df1e39a4b4b8b737bfc832293c015660c3c69334e3cf5c.json +1 -0
  58. xinference/web/ui/node_modules/.package-lock.json +33 -0
  59. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  60. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  61. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  62. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  63. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  64. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  65. xinference/web/ui/node_modules/delegate/package.json +31 -0
  66. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  67. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  68. xinference/web/ui/node_modules/select/bower.json +13 -0
  69. xinference/web/ui/node_modules/select/package.json +29 -0
  70. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  71. xinference/web/ui/package-lock.json +34 -0
  72. xinference/web/ui/package.json +1 -0
  73. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/METADATA +11 -11
  74. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/RECORD +78 -57
  75. xinference/client/oscar/__init__.py +0 -13
  76. xinference/client/oscar/actor_client.py +0 -611
  77. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  78. xinference/model/llm/pytorch/spec_model.py +0 -186
  79. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  80. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  81. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  82. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  83. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  84. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  85. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  86. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/LICENSE +0 -0
  87. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/WHEEL +0 -0
  88. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/entry_points.txt +0 -0
  89. {xinference-0.10.3.dist-info → xinference-0.11.0.dist-info}/top_level.txt +0 -0
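
The most significant change in this release is the removal of the Oscar-based actor client (files 75 and 76 above); the RESTful client is now the only programmatic entry point. For orientation, a hedged migration sketch follows: it assumes the 0.11.0 RESTful Client keeps the launch_model/get_model/chat surface mirrored in the removed actor client's docstrings, and the endpoint and model name are illustrative only, not taken from this diff.

    # Hedged migration sketch (not from this diff): ActorClient -> RESTful Client.
    # Assumes the RESTful Client mirrors the method surface documented in the
    # removed actor_client.py below; endpoint and model name are illustrative.
    from xinference.client import Client

    client = Client("http://127.0.0.1:9997")

    # Same keyword surface as ActorClient.launch_model in the removed file.
    model_uid = client.launch_model(
        model_name="llama-2-chat",
        model_size_in_billions=7,
        quantization="q4_0",
    )

    model = client.get_model(model_uid)
    # ChatModelHandle-style call, as in the removed handle classes.
    completion = model.chat(prompt="What is the largest animal?")
    print(completion["choices"][0]["message"]["content"])

    client.terminate_model(model_uid)

The removed implementation follows.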
xinference/client/oscar/actor_client.py (deleted)
@@ -1,611 +0,0 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import asyncio
- import re
- from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, List, Optional, Union
-
- import orjson
- import xoscar as xo
-
- from ...core.model import ModelActor
- from ...core.supervisor import SupervisorActor
- from ...isolation import Isolation
- from ..restful.restful_client import Client
-
- if TYPE_CHECKING:
-     import PIL
-
-     from ...types import (
-         ChatCompletion,
-         ChatCompletionChunk,
-         ChatCompletionMessage,
-         ChatglmCppGenerateConfig,
-         Completion,
-         CompletionChunk,
-         ImageList,
-         LlamaCppGenerateConfig,
-         PytorchGenerateConfig,
-     )
-
-
- class SSEEvent(object):
-     # https://github.com/btubbs/sseclient/blob/master/sseclient.py
-     sse_line_pattern = re.compile("(?P<name>[^:]*):?( ?(?P<value>.*))?")
-
-     def __init__(self, data="", event="message", id=None, retry=None):
-         self.data = data
-         self.event = event
-         self.id = id
-         self.retry = retry
-
-     @classmethod
-     def parse(cls, raw):
-         """
-         Given a possibly-multiline string representing an SSE message, parse it
-         and return an Event object.
-         """
-         msg = cls()
-         for line in raw.splitlines():
-             m = cls.sse_line_pattern.match(line)
-             if m is None:
-                 # Malformed line. Discard but warn.
-                 continue
-
-             name = m.group("name")
-             if name == "":
-                 # Line began with a ":", so it is a comment. Ignore.
-                 continue
-             value = m.group("value")
-
-             if name == "data":
-                 # If we already have some data, then join to it with a newline.
-                 # Else this is it.
-                 if msg.data:
-                     msg.data = "%s\n%s" % (msg.data, value)
-                 else:
-                     msg.data = value
-             elif name == "event":
-                 msg.event = value
-             elif name == "id":
-                 msg.id = value
-             elif name == "retry":
-                 msg.retry = int(value)
-
-         return msg
-
-
- class ModelHandle:
-     """
-     A sync model interface (for the RPC client) that provides type hints, making
-     it much easier to use xinference programmatically.
-     """
-
-     def __init__(self, model_ref: xo.ActorRefType["ModelActor"], isolation: Isolation):
-         self._model_ref = model_ref
-         self._isolation = isolation
-
-
- class ClientIteratorWrapper(AsyncIterator):
-     def __init__(self, iterator_wrapper):
-         self._iw = iterator_wrapper
-
-     def __aiter__(self):
-         return self
-
-     async def __anext__(self):
-         r = await self._iw.__anext__()
-         text = r.decode("utf-8")
-         return orjson.loads(SSEEvent.parse(text).data)
-
-
- class EmbeddingModelHandle(ModelHandle):
-     def create_embedding(self, input: Union[str, List[str]], **kwargs) -> bytes:
-         """
-         Creates an embedding vector representing the input text.
-
-         Parameters
-         ----------
-         input: Union[str, List[str]]
-             Input text to embed, encoded as a string or array of tokens.
-             To embed multiple inputs in a single request, pass an array of strings or array of token arrays.
-
-         Returns
-         -------
-         bytes
-             JSON bytes of an Embedding. The resulting embedding vector can be easily consumed by
-             machine learning models and algorithms.
-         """
-
-         coro = self._model_ref.create_embedding(input, **kwargs)
-         return orjson.loads(self._isolation.call(coro))
-
-
- class RerankModelHandle(ModelHandle):
-     def rerank(
-         self,
-         documents: List[str],
-         query: str,
-         top_n: Optional[int],
-         max_chunks_per_doc: Optional[int],
-         return_documents: Optional[bool],
-         **kwargs,
-     ):
-         """
-         Returns a list of documents ordered by their relevance to the provided query.
-
-         Parameters
-         ----------
-         query: str
-             The search query
-         documents: List[str]
-             The documents to rerank
-         top_n: int
-             The number of results to return, defaults to returning all results
-         max_chunks_per_doc: int
-             The maximum number of chunks derived from a document
-         return_documents: bool
-             Whether to return the documents in the results
-
-         Returns
-         -------
-         Scores
-             The scores of the documents, ordered by their relevance to the provided query
-         """
-         coro = self._model_ref.rerank(
-             documents, query, top_n, max_chunks_per_doc, return_documents, **kwargs
-         )
-         results = orjson.loads(self._isolation.call(coro))
-         for r in results["results"]:
-             r["document"] = documents[r["index"]]
-         return results
-
-
- class GenerateModelHandle(ModelHandle):
-     def generate(
-         self,
-         prompt: str,
-         generate_config: Optional[
-             Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]
-         ] = None,
-     ) -> Union["Completion", AsyncIterator["CompletionChunk"]]:
-         """
-         Creates a completion for the provided prompt and parameters.
-
-         Parameters
-         ----------
-         prompt: str
-             The user's input.
-         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
-             Additional configurations for completion.
-             "LlamaCppGenerateConfig" -> Configuration for ggml model.
-             "PytorchGenerateConfig" -> Configuration for pytorch model.
-
-         Returns
-         -------
-         Union["Completion", AsyncIterator["CompletionChunk"]]
-             Stream is a parameter in generate_config.
-             When stream is set to True, the function will return AsyncIterator["CompletionChunk"].
-             When stream is set to False, the function will return "Completion".
-         """
-
-         coro = self._model_ref.generate(prompt, generate_config)
-         r = self._isolation.call(coro)
-         if isinstance(r, bytes):
-             return orjson.loads(r)
-         return ClientIteratorWrapper(r)
-
-
- class ChatModelHandle(GenerateModelHandle):
-     def chat(
-         self,
-         prompt: str,
-         system_prompt: Optional[str] = None,
-         chat_history: Optional[List["ChatCompletionMessage"]] = None,
-         generate_config: Optional[
-             Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]
-         ] = None,
-     ) -> Union["ChatCompletion", AsyncIterator["ChatCompletionChunk"]]:
-         """
-         Given a list of messages comprising a conversation, the model will return a response.
-
-         Parameters
-         ----------
-         prompt: str
-             The user's input.
-         system_prompt: Optional[str]
-             The system context provided to the model prior to any chats.
-         chat_history: Optional[List["ChatCompletionMessage"]]
-             A list of messages comprising the conversation so far.
-         generate_config: Optional[Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]]
-             Additional configuration for the chat generation.
-             "LlamaCppGenerateConfig" -> configuration for ggml model
-             "PytorchGenerateConfig" -> configuration for pytorch model
-
-         Returns
-         -------
-         Union["ChatCompletion", AsyncIterator["ChatCompletionChunk"]]
-             Stream is a parameter in generate_config.
-             When stream is set to True, the function will return AsyncIterator["ChatCompletionChunk"].
-             When stream is set to False, the function will return "ChatCompletion".
-         """
-
-         coro = self._model_ref.chat(
-             prompt, system_prompt, chat_history, generate_config
-         )
-         r = self._isolation.call(coro)
-         if isinstance(r, bytes):
-             return orjson.loads(r)
-         return ClientIteratorWrapper(r)
-
-
- class ChatglmCppChatModelHandle(ModelHandle):
-     def chat(
-         self,
-         prompt: str,
-         chat_history: Optional[List["ChatCompletionMessage"]] = None,
-         generate_config: Optional["ChatglmCppGenerateConfig"] = None,
-     ) -> Union["ChatCompletion", AsyncIterator["ChatCompletionChunk"]]:
-         """
-         Given a list of messages comprising a conversation, the ChatGLM model will return a response.
-
-         Parameters
-         ----------
-         prompt: str
-             The user's input
-         chat_history: Optional[List["ChatCompletionMessage"]]
-             A list of messages comprising the conversation so far.
-         generate_config: Optional["ChatglmCppGenerateConfig"]
-             Additional configuration for the ChatGLM model generation.
-
-         Returns
-         -------
-         Union["ChatCompletion", AsyncIterator["ChatCompletionChunk"]]
-             Stream is a parameter in generate_config.
-             When stream is set to True, the function will return AsyncIterator["ChatCompletionChunk"].
-             When stream is set to False, the function will return "ChatCompletion".
-         """
-
-         coro = self._model_ref.chat(prompt, chat_history, generate_config)
-         r = self._isolation.call(coro)
-         if isinstance(r, bytes):
-             return orjson.loads(r)
-         return ClientIteratorWrapper(r)
-
-
- class ImageModelHandle(ModelHandle):
-     def text_to_image(
-         self,
-         prompt: str,
-         n: int = 1,
-         size: str = "1024*1024",
-         response_format: str = "url",
-         **kwargs,
-     ) -> "ImageList":
-         """
-         Creates an image from the input text.
-
-         Parameters
-         ----------
-         prompt (`str` or `List[str]`, *optional*):
-             The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-         n (`int`, *optional*, defaults to 1):
-             The number of images to generate per prompt. Must be between 1 and 10.
-         size (`str`, *optional*, defaults to `1024*1024`):
-             The width*height in pixels of the generated image. Must be one of 256x256, 512x512, or 1024x1024.
-         response_format (`str`, *optional*, defaults to `url`):
-             The format in which the generated images are returned. Must be one of url or b64_json.
-
-         Returns
-         -------
-         ImageList
-             A list of image objects.
-         """
-
-         coro = self._model_ref.text_to_image(prompt, n, size, response_format, **kwargs)
-         return orjson.loads(self._isolation.call(coro))
-
-     def image_to_image(
-         self,
-         image: "PIL.Image",
-         prompt: str,
-         negative_prompt: str,
-         n: int = 1,
-         size: str = "1024*1024",
-         response_format: str = "url",
-         **kwargs,
-     ) -> "ImageList":
-         """
-         Creates an image from the input image and prompt.
-
-         Parameters
-         ----------
-         image (`PIL.Image`):
-             The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
-             specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
-             accepted as an image. The dimensions of the output image default to `image`'s dimensions. If height
-             and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
-             `init`, images must be passed as a list such that each element of the list can be correctly batched for
-             input to a single ControlNet.
-         prompt (`str` or `List[str]`, *optional*):
-             The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-         negative_prompt (`str` or `List[str]`, *optional*):
-             The prompt or prompts not to guide the image generation. If not defined, one has to pass
-             `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-             less than `1`).
-         n (`int`, *optional*, defaults to 1):
-             The number of images to generate per prompt. Must be between 1 and 10.
-         size (`str`, *optional*, defaults to `1024*1024`):
-             The width*height in pixels of the generated image. Must be one of 256x256, 512x512, or 1024x1024.
-         response_format (`str`, *optional*, defaults to `url`):
-             The format in which the generated images are returned. Must be one of url or b64_json.
-
-         Returns
-         -------
-         ImageList
-             A list of image objects.
-         """
-
-         coro = self._model_ref.image_to_image(
-             image, prompt, negative_prompt, n, size, response_format, **kwargs
-         )
-         return orjson.loads(self._isolation.call(coro))
-
-
- class ActorClient:
-     def __init__(self, endpoint: str):
-         restful_client = Client(endpoint)
-         self._supervisor_address = restful_client._get_supervisor_internal_address()
-         self._isolation = Isolation(asyncio.new_event_loop(), threaded=True)
-         self._isolation.start()
-         self._supervisor_ref: xo.ActorRefType["SupervisorActor"] = self._isolation.call(
-             xo.actor_ref(address=self._supervisor_address, uid=SupervisorActor.uid())
-         )
-
-     def register_model(self, model_type: str, model: str, persist: bool):
-         """
-         Register a custom model.
-
-         Parameters
-         ----------
-         model_type: str
-             The type of model.
-         model: str
-             The model definition. (refer to: https://inference.readthedocs.io/en/latest/models/custom.html)
-         persist: bool
-             Whether to persist the registration on the server.
-         """
-         coro = self._supervisor_ref.register_model(model_type, model, persist)
-         self._isolation.call(coro)
-
-     def unregister_model(self, model_type: str, model_name: str):
-         """
-         Unregister a custom model.
-
-         Parameters
-         ----------
-         model_type: str
-             The type of model.
-         model_name: str
-             The name of the model
-         """
-         coro = self._supervisor_ref.unregister_model(model_type, model_name)
-         self._isolation.call(coro)
-
-     def list_model_registrations(self, model_type: str) -> List[Dict[str, Any]]:
-         """
-         List models registered on the server.
-
-         Parameters
-         ----------
-         model_type: str
-             The type of the model.
-
-         Returns
-         -------
-         List[Dict[str, Any]]
-             The collection of registered models on the server.
-         """
-         coro = self._supervisor_ref.list_model_registrations(model_type)
-         return self._isolation.call(coro)
-
-     def get_model_registration(
-         self, model_type: str, model_name: str
-     ) -> Dict[str, Any]:
-         """
-         Get the model with the given model type and model name registered on the server.
-
-         Parameters
-         ----------
-         model_type: str
-             The type of the model.
-         model_name: str
-             The name of the model.
-
-         Returns
-         -------
-         Dict[str, Any]
-             The registered model on the server.
-         """
-         coro = self._supervisor_ref.get_model_registration(model_type, model_name)
-         return self._isolation.call(coro)
-
-     def launch_model(
-         self,
-         model_name: str,
-         model_type: str = "LLM",
-         model_size_in_billions: Optional[int] = None,
-         model_format: Optional[str] = None,
-         quantization: Optional[str] = None,
-         replica: int = 1,
-         n_gpu: Optional[Union[int, str]] = "auto",
-         request_limits: Optional[int] = None,
-         **kwargs,
-     ) -> str:
-         """
-         Launch a model on the server based on the given parameters.
-
-         Parameters
-         ----------
-         model_name: str
-             The name of the model.
-         model_type: str
-             The type of the model.
-         model_size_in_billions: Optional[int]
-             The size (in billions) of the model.
-         model_format: Optional[str]
-             The format of the model.
-         quantization: Optional[str]
-             The quantization of the model.
-         replica: Optional[int]
-             The number of replicas of the model, default is 1.
-         n_gpu: Optional[Union[int, str]]
-             The number of GPUs used by the model, default is "auto".
-             ``n_gpu=None`` means CPU only, ``n_gpu=auto`` lets the system automatically determine the best number of GPUs to use.
-         request_limits: Optional[int]
-             The request limit for this model, default is None.
-             ``request_limits=None`` means no limit is applied to this model.
-         **kwargs:
-             Any other parameters to be passed through.
-
-         Returns
-         -------
-         str
-             The unique model_uid for the launched model.
-         """
-
-         coro = self._supervisor_ref.launch_builtin_model(
-             model_uid=None,
-             model_name=model_name,
-             model_type=model_type,
-             model_size_in_billions=model_size_in_billions,
-             model_format=model_format,
-             quantization=quantization,
-             replica=replica,
-             n_gpu=n_gpu,
-             request_limits=request_limits,
-             **kwargs,
-         )
-
-         return self._isolation.call(coro)
-
-     def terminate_model(self, model_uid: str):
-         """
-         Terminate the specific model running on the server.
-
-         Parameters
-         ----------
-         model_uid: str
-             The unique id that identifies the model.
-         """
-
-         coro = self._supervisor_ref.terminate_model(model_uid)
-         self._isolation.call(coro)
-
-     def list_models(self) -> Dict[str, Dict[str, Any]]:
-         """
-         Retrieve the model specifications from the server.
-
-         Returns
-         -------
-         Dict[str, Dict[str, Any]]
-             The collection of model specifications with their names on the server.
-         """
-
-         coro = self._supervisor_ref.list_models()
-         return self._isolation.call(coro)
-
-     def get_model(self, model_uid: str) -> "ModelHandle":
-         """
-         Get a handle to a model already launched on the server.
-
-         Parameters
-         ----------
-         model_uid: str
-             The unique id that identifies the model.
-
-         Returns
-         -------
-         ModelHandle
-             The corresponding model handle based on the model the uid refers to:
-             "ChatglmCppChatModelHandle" -> handle for a ChatGLM chat model
-             "GenerateModelHandle" -> handle for a generate model, e.g. Baichuan.
-             "ChatModelHandle" -> handle for a chat model, e.g. Baichuan-chat.
-         """
-
-         desc: Dict[str, Any] = self._isolation.call(
-             self._supervisor_ref.describe_model(model_uid)
-         )
-         model_ref = self._isolation.call(self._supervisor_ref.get_model(model_uid))
-         if desc["model_type"] == "LLM":
-             if desc["model_format"] == "ggmlv3" and "chatglm" in desc["model_name"]:
-                 return ChatglmCppChatModelHandle(model_ref, self._isolation)
-             elif "chat" in desc["model_ability"]:
-                 return ChatModelHandle(model_ref, self._isolation)
-             elif "generate" in desc["model_ability"]:
-                 return GenerateModelHandle(model_ref, self._isolation)
-             else:
-                 raise ValueError(f"Unrecognized model ability: {desc['model_ability']}")
-         elif desc["model_type"] == "embedding":
-             return EmbeddingModelHandle(model_ref, self._isolation)
-         elif desc["model_type"] == "image":
-             return ImageModelHandle(model_ref, self._isolation)
-         elif desc["model_type"] == "rerank":
-             return RerankModelHandle(model_ref, self._isolation)
-         else:
-             raise ValueError(f"Unknown model type: {desc['model_type']}")
-
-     def describe_model(self, model_uid: str) -> Dict:
-         """
-         Get model information.
-
-         Parameters
-         ----------
-         model_uid: str
-             The unique id that identifies the model.
-
-         Returns
-         -------
-         dict
-             A dictionary containing the following keys:
-             - "model_type": str
-                the type of the model determined by its function, e.g. "LLM" (Large Language Model)
-             - "model_name": str
-                the name of the specific LLM model family
-             - "model_lang": List[str]
-                the languages supported by the LLM model
-             - "model_ability": List[str]
-                the abilities or capabilities of the LLM model
-             - "model_description": str
-                a detailed description of the LLM model
-             - "model_format": str
-                the format specification of the LLM model
-             - "model_size_in_billions": int
-                the size of the LLM model in billions
-             - "quantization": str
-                the quantization applied to the model
-             - "revision": str
-                the revision number of the LLM model specification
-             - "context_length": int
-                the maximum text length the LLM model can accommodate (including all input & output)
-         """
-
-         return self._isolation.call(self._supervisor_ref.describe_model(model_uid))
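
For reference, the streaming path in the removed file can be seen in miniature: ClientIteratorWrapper.__anext__ decodes each raw chunk, runs it through SSEEvent.parse, and orjson-loads the event's data field. A minimal sketch of that round trip, using the SSEEvent class defined in the deleted file above and a made-up SSE payload:

    # Minimal sketch of the decoding done by the removed ClientIteratorWrapper.
    # SSEEvent is the class from the deleted actor_client.py above; the payload
    # is made up for illustration.
    import orjson

    raw_chunk = b'data: {"id": "chunk-0", "choices": []}\n\n'

    event = SSEEvent.parse(raw_chunk.decode("utf-8"))
    chunk = orjson.loads(event.data)
    print(chunk["id"])  # -> chunk-0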