xinference 0.11.2__py3-none-any.whl → 0.11.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry, and is provided for informational purposes only.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +14 -8
- xinference/constants.py +4 -0
- xinference/core/__init__.py +0 -2
- xinference/core/cache_tracker.py +22 -1
- xinference/core/chat_interface.py +71 -10
- xinference/core/supervisor.py +5 -3
- xinference/core/worker.py +8 -3
- xinference/model/llm/__init__.py +2 -0
- xinference/model/llm/llm_family.json +336 -39
- xinference/model/llm/llm_family_modelscope.json +267 -1
- xinference/model/llm/pytorch/baichuan.py +2 -1
- xinference/model/llm/pytorch/cogvlm2.py +257 -0
- xinference/model/llm/pytorch/core.py +1 -0
- xinference/model/llm/pytorch/intern_vl.py +8 -53
- xinference/model/llm/vllm/core.py +1 -1
- {xinference-0.11.2.dist-info → xinference-0.11.3.dist-info}/METADATA +4 -4
- {xinference-0.11.2.dist-info → xinference-0.11.3.dist-info}/RECORD +22 -21
- {xinference-0.11.2.dist-info → xinference-0.11.3.dist-info}/LICENSE +0 -0
- {xinference-0.11.2.dist-info → xinference-0.11.3.dist-info}/WHEEL +0 -0
- {xinference-0.11.2.dist-info → xinference-0.11.3.dist-info}/entry_points.txt +0 -0
- {xinference-0.11.2.dist-info → xinference-0.11.3.dist-info}/top_level.txt +0 -0
xinference/_version.py
CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-05-
+ "date": "2024-05-31T17:12:13+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.11.
+ "full-revisionid": "69c09cd068a530cd2fdcac07e4e81f03d48f04f9",
+ "version": "0.11.3"
 }
 ''' # END VERSION_JSON
 
xinference/api/restful_api.py
CHANGED
@@ -52,7 +52,7 @@ from xoscar.utils import get_next_port
 
 from .._compat import BaseModel, Field
 from .._version import get_versions
-from ..constants import XINFERENCE_DEFAULT_ENDPOINT_PORT
+from ..constants import XINFERENCE_DEFAULT_ENDPOINT_PORT, XINFERENCE_DISABLE_METRICS
 from ..core.event import Event, EventCollectorActor, EventType
 from ..core.supervisor import SupervisorActor
 from ..core.utils import json_dumps
@@ -504,13 +504,19 @@ class RESTfulAPI:
             ),
         )
 
-
-
-
-
-
-
-
+        if XINFERENCE_DISABLE_METRICS:
+            logger.info(
+                "Supervisor metrics is disabled due to the environment XINFERENCE_DISABLE_METRICS=1"
+            )
+            self._app.include_router(self._router)
+        else:
+            # Clear the global Registry for the MetricsMiddleware, or
+            # the MetricsMiddleware will register duplicated metrics if the port
+            # conflict (This serve method run more than once).
+            REGISTRY.clear()
+            self._app.add_middleware(MetricsMiddleware)
+            self._app.include_router(self._router)
+            self._app.add_route("/metrics", metrics)
 
         # Check all the routes returns Response.
         # This is to avoid `jsonable_encoder` performance issue:
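
The comment in the new branch points at a real hazard: Prometheus-style registries refuse to register two collectors with the same name, so running serve() twice in one process would crash when the middleware re-registers its metrics. A minimal sketch of that failure mode, illustrated with prometheus_client rather than xinference's actual metrics stack (the metric name is illustrative):

    from prometheus_client import CollectorRegistry, Counter

    registry = CollectorRegistry()
    Counter("requests_total", "Total requests", registry=registry)
    try:
        # A second collector with the same name is rejected, which is why
        # restful_api.py clears the global registry before re-registering.
        Counter("requests_total", "Total requests", registry=registry)
    except ValueError as exc:
        print(f"Duplicate registration rejected: {exc}")
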
xinference/constants.py
CHANGED
@@ -26,6 +26,7 @@ XINFERENCE_ENV_HEALTH_CHECK_TIMEOUT = "XINFERENCE_HEALTH_CHECK_TIMEOUT"
 XINFERENCE_ENV_DISABLE_HEALTH_CHECK = "XINFERENCE_DISABLE_HEALTH_CHECK"
 XINFERENCE_ENV_DISABLE_VLLM = "XINFERENCE_DISABLE_VLLM"
 XINFERENCE_ENV_ENABLE_SGLANG = "XINFERENCE_ENABLE_SGLANG"
+XINFERENCE_ENV_DISABLE_METRICS = "XINFERENCE_DISABLE_METRICS"
 
 
 def get_xinference_home() -> str:
@@ -66,3 +67,6 @@ XINFERENCE_DISABLE_HEALTH_CHECK = bool(
 )
 XINFERENCE_DISABLE_VLLM = bool(int(os.environ.get(XINFERENCE_ENV_DISABLE_VLLM, 0)))
 XINFERENCE_ENABLE_SGLANG = bool(int(os.environ.get(XINFERENCE_ENV_ENABLE_SGLANG, 0)))
+XINFERENCE_DISABLE_METRICS = bool(
+    int(os.environ.get(XINFERENCE_ENV_DISABLE_METRICS, 0))
+)
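
The int() wrapper in the new constant does real work: environment variables are strings, and bool() on the non-empty string "0" is True. A short sketch of the difference:

    import os

    os.environ["XINFERENCE_DISABLE_METRICS"] = "0"

    # bool() alone treats the non-empty string "0" as truthy -- wrong here.
    naive = bool(os.environ.get("XINFERENCE_DISABLE_METRICS"))
    # Parsing through int() first yields the intended False.
    parsed = bool(int(os.environ.get("XINFERENCE_DISABLE_METRICS", 0)))

    print(naive, parsed)  # True False
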
xinference/core/__init__.py
CHANGED
xinference/core/cache_tracker.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from logging import getLogger
 from typing import Any, Dict, List, Optional
 
@@ -105,9 +106,29 @@ class CacheTrackerActor(xo.Actor):
         cached_models = []
         for model_name, model_versions in self._model_name_to_version_info.items():
             for version_info in model_versions:
-
+                cache_status = version_info.get("cache_status", None)
+                if cache_status == True:
                     ret = version_info.copy()
                     ret["model_name"] = model_name
+
+                    re_dict = version_info.get("model_file_location", None)
+                    if re_dict is not None and isinstance(re_dict, dict):
+                        if re_dict:
+                            actor_ip_address, path = next(iter(re_dict.items()))
+                        else:
+                            raise ValueError("The dictionary is empty.")
+                    else:
+                        raise ValueError("re_dict must be a non-empty dictionary.")
+
+                    ret["actor_ip_address"] = actor_ip_address
+                    ret["path"] = path
+                    if os.path.isdir(path):
+                        files = os.listdir(path)
+                        resolved_file = os.path.realpath(os.path.join(path, files[0]))
+                        if resolved_file:
+                            ret["real_path"] = os.path.dirname(resolved_file)
+                    else:
+                        ret["real_path"] = os.path.realpath(path)
                     cached_models.append(ret)
         cached_models = sorted(cached_models, key=lambda x: x["model_name"])
         return cached_models
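
The realpath logic above exists because model caches of this kind are often directories of symlinks into a separate blob store, and callers want the directory the links actually resolve to. A self-contained sketch of the same resolution; the layout below is hypothetical, loosely mimicking a huggingface-style cache:

    import os
    import tempfile

    with tempfile.TemporaryDirectory() as root:
        blobs = os.path.join(root, "blobs")   # where file contents really live
        cache = os.path.join(root, "cache")   # directory of symlinks
        os.makedirs(blobs)
        os.makedirs(cache)
        target = os.path.join(blobs, "model.bin")
        open(target, "w").close()
        os.symlink(target, os.path.join(cache, "model.bin"))

        # Same steps as list_cached_models above: pick a file, resolve it,
        # and report the directory it points into as the "real" path.
        first = os.listdir(cache)[0]
        resolved = os.path.realpath(os.path.join(cache, first))
        print(os.path.dirname(resolved))  # ends with /blobs, not /cache
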
xinference/core/chat_interface.py
CHANGED

@@ -186,8 +186,7 @@
     def build_chat_vl_interface(
         self,
     ) -> "gr.Blocks":
-        def predict(history, bot):
-            logger.debug("Predict model: %s, history: %s", self.model_uid, history)
+        def predict(history, bot, max_tokens, temperature, stream):
             from ..client import RESTfulClient
 
             client = RESTfulClient(self.endpoint)
@@ -199,10 +198,46 @@
             assert prompt["role"] == "user"
             prompt = prompt["content"]
             # multimodal chat does not support stream.
-
-
-
-
+            if stream:
+                response_content = ""
+                for chunk in model.chat(
+                    prompt=prompt,
+                    chat_history=history[:-1],
+                    generate_config={
+                        "max_tokens": max_tokens,
+                        "temperature": temperature,
+                        "stream": stream,
+                    },
+                ):
+                    assert isinstance(chunk, dict)
+                    delta = chunk["choices"][0]["delta"]
+                    if "content" not in delta:
+                        continue
+                    else:
+                        response_content += delta["content"]
+                        bot[-1][1] = response_content
+                        yield history, bot
+                history.append(
+                    {
+                        "content": response_content,
+                        "role": "assistant",
+                    }
+                )
+                bot[-1][1] = response_content
+                yield history, bot
+            else:
+                response = model.chat(
+                    prompt=prompt,
+                    chat_history=history[:-1],
+                    generate_config={
+                        "max_tokens": max_tokens,
+                        "temperature": temperature,
+                        "stream": stream,
+                    },
+                )
+                history.append(response["choices"][0]["message"])
+                bot[-1][1] = history[-1]["content"]
+                yield history, bot
 
         def add_text(history, bot, text, image):
             logger.debug("Add text, text: %s, image: %s", text, image)
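
For reference, the same streaming pattern from the client side. This is a sketch, assuming a server already running at localhost:9997 with a launched vision-language model whose uid is "my-vl-model" (both assumptions, not values from the diff):

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://localhost:9997")
    model = client.get_model("my-vl-model")  # hypothetical model uid

    response_content = ""
    for chunk in model.chat(
        prompt="Describe this image.",
        chat_history=[],
        generate_config={"max_tokens": 256, "temperature": 0.7, "stream": True},
    ):
        delta = chunk["choices"][0]["delta"]
        # Some chunks (the role header, the finish chunk) carry no content.
        if "content" in delta:
            response_content += delta["content"]
    print(response_content)
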
@@ -217,14 +252,19 @@
                 "role": "user",
                 "content": [
                     {"type": "text", "text": text},
-                    {
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{img_b64_str}"
+                        },
+                    },
                 ],
             }
         else:
             display_content = text
             message = {"role": "user", "content": text}
         history = history + [message]
-        bot = bot + [
+        bot = bot + [[display_content, None]]
         return history, bot, "", None
 
         def clear_history():
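
The added message literal follows the OpenAI-style multimodal schema: image bytes are inlined as a base64 data URL next to the text part. A sketch of building one from a local file (the file name is hypothetical):

    import base64

    with open("cat.png", "rb") as f:
        img_b64_str = base64.b64encode(f.read()).decode("utf-8")

    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this picture?"},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img_b64_str}"},
            },
        ],
    }
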
@@ -286,6 +326,19 @@
             )
             clear_btn = gr.Button(value="Clear")
 
+            with gr.Accordion("Additional Inputs", open=False):
+                max_tokens = gr.Slider(
+                    minimum=1,
+                    maximum=self.context_length,
+                    value=512,
+                    step=1,
+                    label="Max Tokens",
+                )
+                temperature = gr.Slider(
+                    minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
+                )
+                stream = gr.Checkbox(label="Stream", value=False)
+
             textbox.change(update_button, [textbox], [submit_btn], queue=False)
 
             textbox.submit(
@@ -293,14 +346,22 @@
                 [state, chatbot, textbox, imagebox],
                 [state, chatbot, textbox, imagebox],
                 queue=False,
-            ).then(
+            ).then(
+                predict,
+                [state, chatbot, max_tokens, temperature, stream],
+                [state, chatbot],
+            )
 
             submit_btn.click(
                 add_text,
                 [state, chatbot, textbox, imagebox],
                 [state, chatbot, textbox, imagebox],
                 queue=False,
-            ).then(
+            ).then(
+                predict,
+                [state, chatbot, max_tokens, temperature, stream],
+                [state, chatbot],
+            )
 
             clear_btn.click(
                 clear_history, None, [state, chatbot, textbox, imagebox], queue=False
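
The wiring above is Gradio's two-step event chain: a fast, non-queued handler records the user turn, then .then() runs the (possibly streaming) predict with the new slider and checkbox values as extra inputs. A stripped-down sketch of the same pattern with stand-in handlers, not xinference's:

    import gradio as gr

    def add_text(history, text):
        # Append the user turn; clear the textbox.
        return history + [[text, None]], ""

    def respond(history, max_tokens):
        # Stand-in for predict(): fill in the assistant side of the last turn.
        history[-1][1] = f"(echo, max_tokens={max_tokens}) {history[-1][0]}"
        return history

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        textbox = gr.Textbox()
        with gr.Accordion("Additional Inputs", open=False):
            max_tokens = gr.Slider(
                minimum=1, maximum=4096, value=512, step=1, label="Max Tokens"
            )
        textbox.submit(
            add_text, [chatbot, textbox], [chatbot, textbox], queue=False
        ).then(respond, [chatbot, max_tokens], [chatbot])

    demo.launch()
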
xinference/core/supervisor.py
CHANGED
@@ -28,7 +28,7 @@ from ..constants import (
 XINFERENCE_HEALTH_CHECK_INTERVAL,
 XINFERENCE_HEALTH_CHECK_TIMEOUT,
 )
-from ..core import ModelActor
+from ..core.model import ModelActor
 from ..core.status_guard import InstanceInfo, LaunchStatus
 from ..types import PeftModelConfig
 from .metrics import record_metrics
@@ -993,8 +993,9 @@ class SupervisorActor(xo.StatelessActor):
                 "model_size_in_billions", None
             )
             quantizations = model_version.get("quantization", None)
-
-
+            actor_ip_address = model_version.get("actor_ip_address", None)
+            path = model_version.get("path", None)
+            real_path = model_version.get("real_path", None)
 
             cache_entry = {
                 "model_name": model_name,
@@ -1003,6 +1004,7 @@
                 "quantizations": quantizations,
                 "path": path,
                 "Actor IP Address": actor_ip_address,
+                "real_path": real_path,
             }
 
             cached_models.append(cache_entry)
xinference/core/worker.py
CHANGED
@@ -30,9 +30,10 @@ from xoscar import MainActorPoolType
 from ..constants import (
     XINFERENCE_CACHE_DIR,
     XINFERENCE_DISABLE_HEALTH_CHECK,
+    XINFERENCE_DISABLE_METRICS,
     XINFERENCE_HEALTH_CHECK_INTERVAL,
 )
-from ..core import ModelActor
+from ..core.model import ModelActor
 from ..core.status_guard import LaunchStatus
 from ..device_utils import get_available_device_env_name, gpu_count
 from ..model.core import ModelDescription, create_model_instance
@@ -83,8 +84,12 @@ class WorkerActor(xo.StatelessActor):
         self._model_uid_to_recover_count: Dict[str, Optional[int]] = {}
         self._model_uid_to_launch_args: Dict[str, Dict] = {}
 
-
-
+        if XINFERENCE_DISABLE_METRICS:
+            logger.info(
+                "Worker metrics is disabled due to the environment XINFERENCE_DISABLE_METRICS=1"
+            )
+        elif metrics_exporter_host is not None or metrics_exporter_port is not None:
+            # metrics export server.
             logger.info(
                 f"Starting metrics export server at {metrics_exporter_host}:{metrics_exporter_port}"
             )
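
Restated as a pure function, the visible gating is: the disable flag wins unconditionally, and otherwise the exporter starts only when a host or port was explicitly given. The worker does this inline; the helper below is just a paraphrase for clarity:

    from typing import Optional

    def should_start_metrics_exporter(
        disable_metrics: bool,
        host: Optional[str],
        port: Optional[int],
    ) -> bool:
        if disable_metrics:
            # XINFERENCE_DISABLE_METRICS=1 overrides an explicit host/port.
            return False
        return host is not None or port is not None

    assert should_start_metrics_exporter(True, "0.0.0.0", 9100) is False
    assert should_start_metrics_exporter(False, None, 9100) is True
    assert should_start_metrics_exporter(False, None, None) is False
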
xinference/model/llm/__init__.py
CHANGED
@@ -113,6 +113,7 @@ def _install():
     from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
     from .pytorch.baichuan import BaichuanPytorchChatModel
     from .pytorch.chatglm import ChatglmPytorchChatModel
+    from .pytorch.cogvlm2 import CogVLM2Model
     from .pytorch.core import PytorchChatModel, PytorchModel
     from .pytorch.deepseek_vl import DeepSeekVLChatModel
     from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
@@ -159,6 +160,7 @@ def _install():
             DeepSeekVLChatModel,
             InternVLChatModel,
             PytorchModel,
+            CogVLM2Model,
         ]
     )
     if OmniLMMModel:  # type: ignore
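
Both hunks follow the same registration pattern: _install() imports model classes for their side effects and appends them to a module-level list that the model-matching logic consults later. A toy sketch of the shape of that pattern; the names are illustrative stand-ins, not xinference's internals:

    from typing import List, Type

    class PytorchModel:
        pass

    class CogVLM2Model(PytorchModel):
        pass

    MODEL_CLASSES: List[Type[PytorchModel]] = []

    def _install() -> None:
        # Import-time registration: later lookups iterate MODEL_CLASSES to
        # find a class that claims the requested model family.
        MODEL_CLASSES.extend([PytorchModel, CogVLM2Model])

    _install()
    print([cls.__name__ for cls in MODEL_CLASSES])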