xinference 0.11.2.post1__py3-none-any.whl → 0.11.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.

xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json
 
 version_json = '''
 {
- "date": "2024-05-24T19:39:58+0800",
+ "date": "2024-05-31T17:12:13+0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "ac8f33439c25e6fb05eba79e7932cbbadd068174",
- "version": "0.11.2.post1"
+ "full-revisionid": "69c09cd068a530cd2fdcac07e4e81f03d48f04f9",
+ "version": "0.11.3"
 }
 ''' # END VERSION_JSON
 
@@ -52,7 +52,7 @@ from xoscar.utils import get_next_port
 
 from .._compat import BaseModel, Field
 from .._version import get_versions
-from ..constants import XINFERENCE_DEFAULT_ENDPOINT_PORT
+from ..constants import XINFERENCE_DEFAULT_ENDPOINT_PORT, XINFERENCE_DISABLE_METRICS
 from ..core.event import Event, EventCollectorActor, EventType
 from ..core.supervisor import SupervisorActor
 from ..core.utils import json_dumps
@@ -504,13 +504,19 @@ class RESTfulAPI:
             ),
         )
 
-        # Clear the global Registry for the MetricsMiddleware, or
-        # the MetricsMiddleware will register duplicated metrics if the port
-        # conflict (This serve method run more than once).
-        REGISTRY.clear()
-        self._app.add_middleware(MetricsMiddleware)
-        self._app.include_router(self._router)
-        self._app.add_route("/metrics", metrics)
+        if XINFERENCE_DISABLE_METRICS:
+            logger.info(
+                "Supervisor metrics is disabled due to the environment XINFERENCE_DISABLE_METRICS=1"
+            )
+            self._app.include_router(self._router)
+        else:
+            # Clear the global Registry for the MetricsMiddleware, or
+            # the MetricsMiddleware will register duplicated metrics if the port
+            # conflict (This serve method run more than once).
+            REGISTRY.clear()
+            self._app.add_middleware(MetricsMiddleware)
+            self._app.include_router(self._router)
+            self._app.add_route("/metrics", metrics)
 
         # Check all the routes returns Response.
         # This is to avoid `jsonable_encoder` performance issue:
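
The hunks above (in what appears to be the RESTful API setup of the supervisor) gate the Prometheus metrics middleware and the /metrics route behind the new XINFERENCE_DISABLE_METRICS flag, which is defined in xinference/constants.py just below. A minimal sketch of how a deployment might opt out, assuming the flag is set before the xinference processes start (endpoint and port values are illustrative, not taken from this diff):

    # Sketch: disable metrics collection for supervisor and worker.
    import os

    # Must be set before xinference.constants is imported, since the
    # module reads the environment at import time.
    os.environ["XINFERENCE_DISABLE_METRICS"] = "1"

    # Roughly equivalent from a shell, assuming the local entrypoint:
    #   XINFERENCE_DISABLE_METRICS=1 xinference-local --host 0.0.0.0 --port 9997
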
xinference/constants.py CHANGED
@@ -26,6 +26,7 @@ XINFERENCE_ENV_HEALTH_CHECK_TIMEOUT = "XINFERENCE_HEALTH_CHECK_TIMEOUT"
 XINFERENCE_ENV_DISABLE_HEALTH_CHECK = "XINFERENCE_DISABLE_HEALTH_CHECK"
 XINFERENCE_ENV_DISABLE_VLLM = "XINFERENCE_DISABLE_VLLM"
 XINFERENCE_ENV_ENABLE_SGLANG = "XINFERENCE_ENABLE_SGLANG"
+XINFERENCE_ENV_DISABLE_METRICS = "XINFERENCE_DISABLE_METRICS"
 
 
 def get_xinference_home() -> str:
@@ -66,3 +67,6 @@ XINFERENCE_DISABLE_HEALTH_CHECK = bool(
 )
 XINFERENCE_DISABLE_VLLM = bool(int(os.environ.get(XINFERENCE_ENV_DISABLE_VLLM, 0)))
 XINFERENCE_ENABLE_SGLANG = bool(int(os.environ.get(XINFERENCE_ENV_ENABLE_SGLANG, 0)))
+XINFERENCE_DISABLE_METRICS = bool(
+    int(os.environ.get(XINFERENCE_ENV_DISABLE_METRICS, 0))
+)
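
Note that, like the other flags in this file, the value is parsed with bool(int(...)): unset or "0" means disabled, "1" means enabled, and a non-integer string such as "true" would raise ValueError at import time. A standalone illustration of that parsing behaviour (the helper name is hypothetical, not part of the package):

    import os

    def _env_flag(name: str) -> bool:
        # Mirrors the constants.py pattern: unset or "0" -> False, "1" -> True.
        return bool(int(os.environ.get(name, 0)))

    os.environ["XINFERENCE_DISABLE_METRICS"] = "1"
    assert _env_flag("XINFERENCE_DISABLE_METRICS") is True
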
@@ -11,5 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .model import ModelActor
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 
 from logging import getLogger
 from typing import Any, Dict, List, Optional
 
@@ -105,9 +106,29 @@ class CacheTrackerActor(xo.Actor):
         cached_models = []
         for model_name, model_versions in self._model_name_to_version_info.items():
             for version_info in model_versions:
-                if version_info["cache_status"]:
+                cache_status = version_info.get("cache_status", None)
+                if cache_status == True:
                     ret = version_info.copy()
                     ret["model_name"] = model_name
+
+                    re_dict = version_info.get("model_file_location", None)
+                    if re_dict is not None and isinstance(re_dict, dict):
+                        if re_dict:
+                            actor_ip_address, path = next(iter(re_dict.items()))
+                        else:
+                            raise ValueError("The dictionary is empty.")
+                    else:
+                        raise ValueError("re_dict must be a non-empty dictionary.")
+
+                    ret["actor_ip_address"] = actor_ip_address
+                    ret["path"] = path
+                    if os.path.isdir(path):
+                        files = os.listdir(path)
+                        resolved_file = os.path.realpath(os.path.join(path, files[0]))
+                        if resolved_file:
+                            ret["real_path"] = os.path.dirname(resolved_file)
+                    else:
+                        ret["real_path"] = os.path.realpath(path)
                     cached_models.append(ret)
         cached_models = sorted(cached_models, key=lambda x: x["model_name"])
         return cached_models
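
The new bookkeeping in the cache tracker records where a cached model actually lives: path is the cache location reported by the worker, while real_path follows the symlinked files back to their physical location (e.g. a shared Hugging Face cache). A standalone sketch of the same resolution logic, with a hypothetical path:

    import os

    def resolve_real_path(path: str) -> str:
        # Same idea as the hunk above: for a directory of symlinked files,
        # report the directory the first entry really points to; for a
        # single file, resolve the file itself.
        if os.path.isdir(path):
            files = os.listdir(path)
            resolved_file = os.path.realpath(os.path.join(path, files[0]))
            return os.path.dirname(resolved_file)
        return os.path.realpath(path)

    # e.g. resolve_real_path("/root/.xinference/cache/qwen-chat")  # hypothetical path
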
@@ -186,8 +186,7 @@ class GradioInterface:
     def build_chat_vl_interface(
         self,
     ) -> "gr.Blocks":
-        def predict(history, bot):
-            logger.debug("Predict model: %s, history: %s", self.model_uid, history)
+        def predict(history, bot, max_tokens, temperature, stream):
             from ..client import RESTfulClient
 
             client = RESTfulClient(self.endpoint)
@@ -199,10 +198,46 @@ class GradioInterface:
             assert prompt["role"] == "user"
             prompt = prompt["content"]
             # multimodal chat does not support stream.
-            response = model.chat(prompt=prompt, chat_history=history[:-1])
-            history.append(response["choices"][0]["message"])
-            bot[-1][1] = history[-1]["content"]
-            return history, bot
+            if stream:
+                response_content = ""
+                for chunk in model.chat(
+                    prompt=prompt,
+                    chat_history=history[:-1],
+                    generate_config={
+                        "max_tokens": max_tokens,
+                        "temperature": temperature,
+                        "stream": stream,
+                    },
+                ):
+                    assert isinstance(chunk, dict)
+                    delta = chunk["choices"][0]["delta"]
+                    if "content" not in delta:
+                        continue
+                    else:
+                        response_content += delta["content"]
+                        bot[-1][1] = response_content
+                        yield history, bot
+                history.append(
+                    {
+                        "content": response_content,
+                        "role": "assistant",
+                    }
+                )
+                bot[-1][1] = response_content
+                yield history, bot
+            else:
+                response = model.chat(
+                    prompt=prompt,
+                    chat_history=history[:-1],
+                    generate_config={
+                        "max_tokens": max_tokens,
+                        "temperature": temperature,
+                        "stream": stream,
+                    },
+                )
+                history.append(response["choices"][0]["message"])
+                bot[-1][1] = history[-1]["content"]
+                yield history, bot
 
         def add_text(history, bot, text, image):
             logger.debug("Add text, text: %s, image: %s", text, image)
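
The rewritten predict turns the Gradio callback into a generator so partial output can be streamed into the chatbot as it arrives. Outside of Gradio, the same delta-accumulation pattern applies to any streaming chat call made through the RESTful client; a minimal sketch, assuming a locally running server at a placeholder endpoint and an already-launched model UID:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://localhost:9997")  # placeholder endpoint
    model = client.get_model("my-vl-model-uid")      # hypothetical model uid

    content = ""
    for chunk in model.chat(
        prompt="Describe this image.",
        chat_history=[],
        generate_config={"max_tokens": 512, "temperature": 1.0, "stream": True},
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            content += delta["content"]  # accumulate partial tokens
    print(content)
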
@@ -217,14 +252,19 @@ class GradioInterface:
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text},
-                        {"type": "image_url", "image_url": {"url": image}},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{img_b64_str}"
+                            },
+                        },
                    ],
                }
            else:
                display_content = text
                message = {"role": "user", "content": text}
            history = history + [message]
-            bot = bot + [(display_content, None)]
+            bot = bot + [[display_content, None]]
            return history, bot, "", None
 
        def clear_history():
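
The vision-language message now embeds the uploaded image as a base64 data URI instead of passing a raw file reference. The img_b64_str used above is produced elsewhere in the same function and is not shown in this hunk; a sketch of how such a value could be built, with a hypothetical file name:

    import base64

    def image_to_data_uri(image_path: str) -> str:
        # Encode the image bytes and wrap them in the data URI form that the
        # OpenAI-style image_url content block expects.
        with open(image_path, "rb") as f:
            img_b64_str = base64.b64encode(f.read()).decode("utf-8")
        return f"data:image/png;base64,{img_b64_str}"

    # e.g. image_to_data_uri("example.png")  # hypothetical file
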
@@ -286,6 +326,19 @@ class GradioInterface:
                )
                clear_btn = gr.Button(value="Clear")
 
+            with gr.Accordion("Additional Inputs", open=False):
+                max_tokens = gr.Slider(
+                    minimum=1,
+                    maximum=self.context_length,
+                    value=512,
+                    step=1,
+                    label="Max Tokens",
+                )
+                temperature = gr.Slider(
+                    minimum=0, maximum=2, value=1, step=0.01, label="Temperature"
+                )
+                stream = gr.Checkbox(label="Stream", value=False)
+
            textbox.change(update_button, [textbox], [submit_btn], queue=False)
 
            textbox.submit(
@@ -293,14 +346,22 @@ class GradioInterface:
                [state, chatbot, textbox, imagebox],
                [state, chatbot, textbox, imagebox],
                queue=False,
-            ).then(predict, [state, chatbot], [state, chatbot])
+            ).then(
+                predict,
+                [state, chatbot, max_tokens, temperature, stream],
+                [state, chatbot],
+            )
 
            submit_btn.click(
                add_text,
                [state, chatbot, textbox, imagebox],
                [state, chatbot, textbox, imagebox],
                queue=False,
-            ).then(predict, [state, chatbot], [state, chatbot])
+            ).then(
+                predict,
+                [state, chatbot, max_tokens, temperature, stream],
+                [state, chatbot],
+            )
 
            clear_btn.click(
                clear_history, None, [state, chatbot, textbox, imagebox], queue=False
@@ -28,7 +28,7 @@ from ..constants import (
    XINFERENCE_HEALTH_CHECK_INTERVAL,
    XINFERENCE_HEALTH_CHECK_TIMEOUT,
 )
-from ..core import ModelActor
+from ..core.model import ModelActor
 from ..core.status_guard import InstanceInfo, LaunchStatus
 from ..types import PeftModelConfig
 from .metrics import record_metrics
@@ -993,8 +993,9 @@ class SupervisorActor(xo.StatelessActor):
                "model_size_in_billions", None
            )
            quantizations = model_version.get("quantization", None)
-            re_dict = model_version.get("model_file_location", None)
-            actor_ip_address, path = next(iter(re_dict.items()))
+            actor_ip_address = model_version.get("actor_ip_address", None)
+            path = model_version.get("path", None)
+            real_path = model_version.get("real_path", None)
 
            cache_entry = {
                "model_name": model_name,
@@ -1003,6 +1004,7 @@ class SupervisorActor(xo.StatelessActor):
                "quantizations": quantizations,
                "path": path,
                "Actor IP Address": actor_ip_address,
+                "real_path": real_path,
            }
 
            cached_models.append(cache_entry)
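
With the cache tracker now attaching actor_ip_address, path and real_path to each version entry, the supervisor only copies them into the listing instead of unpacking model_file_location itself. The resulting entry has roughly this shape (values are made up for illustration; fields outside this hunk's context are omitted):

    cache_entry = {
        "model_name": "qwen-chat",
        "quantizations": "4-bit",
        "path": "/root/.xinference/cache/qwen-chat",       # worker cache directory
        "Actor IP Address": "10.0.0.5",
        "real_path": "/data/huggingface-cache/qwen-chat",  # where the symlinks resolve
    }
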
xinference/core/worker.py CHANGED
@@ -30,9 +30,10 @@ from xoscar import MainActorPoolType
 from ..constants import (
    XINFERENCE_CACHE_DIR,
    XINFERENCE_DISABLE_HEALTH_CHECK,
+    XINFERENCE_DISABLE_METRICS,
    XINFERENCE_HEALTH_CHECK_INTERVAL,
 )
-from ..core import ModelActor
+from ..core.model import ModelActor
 from ..core.status_guard import LaunchStatus
 from ..device_utils import get_available_device_env_name, gpu_count
 from ..model.core import ModelDescription, create_model_instance
@@ -83,8 +84,12 @@ class WorkerActor(xo.StatelessActor):
        self._model_uid_to_recover_count: Dict[str, Optional[int]] = {}
        self._model_uid_to_launch_args: Dict[str, Dict] = {}
 
-        # metrics export server.
-        if metrics_exporter_host is not None or metrics_exporter_port is not None:
+        if XINFERENCE_DISABLE_METRICS:
+            logger.info(
+                "Worker metrics is disabled due to the environment XINFERENCE_DISABLE_METRICS=1"
+            )
+        elif metrics_exporter_host is not None or metrics_exporter_port is not None:
+            # metrics export server.
            logger.info(
                f"Starting metrics export server at {metrics_exporter_host}:{metrics_exporter_port}"
            )
@@ -113,6 +113,7 @@ def _install():
    from .ggml.llamacpp import LlamaCppChatModel, LlamaCppModel
    from .pytorch.baichuan import BaichuanPytorchChatModel
    from .pytorch.chatglm import ChatglmPytorchChatModel
+    from .pytorch.cogvlm2 import CogVLM2Model
    from .pytorch.core import PytorchChatModel, PytorchModel
    from .pytorch.deepseek_vl import DeepSeekVLChatModel
    from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
@@ -159,6 +160,7 @@ def _install():
            DeepSeekVLChatModel,
            InternVLChatModel,
            PytorchModel,
+            CogVLM2Model,
        ]
    )
    if OmniLMMModel: # type: ignore
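
These last two hunks register the new CogVLM2Model PyTorch implementation with the model installer. Launching it would follow the usual client flow; a sketch, assuming the built-in model name is "cogvlm2" and the engine label is "Transformers" (neither is confirmed by this diff), against a local server at a placeholder endpoint:

    from xinference.client import RESTfulClient

    client = RESTfulClient("http://localhost:9997")  # placeholder endpoint
    model_uid = client.launch_model(
        model_name="cogvlm2",          # assumed built-in name for the new model
        model_type="LLM",
        model_engine="Transformers",   # assumed engine label
    )
    model = client.get_model(model_uid)
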