xinference 0.14.4.post1__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (149)
  1. xinference/_compat.py +51 -0
  2. xinference/_version.py +3 -3
  3. xinference/api/restful_api.py +5 -39
  4. xinference/client/restful/restful_client.py +3 -24
  5. xinference/conftest.py +1 -1
  6. xinference/constants.py +5 -0
  7. xinference/core/cache_tracker.py +1 -1
  8. xinference/core/chat_interface.py +8 -14
  9. xinference/core/event.py +1 -1
  10. xinference/core/model.py +82 -31
  11. xinference/core/scheduler.py +37 -37
  12. xinference/core/status_guard.py +1 -1
  13. xinference/core/supervisor.py +11 -10
  14. xinference/core/utils.py +80 -22
  15. xinference/core/worker.py +17 -16
  16. xinference/deploy/cmdline.py +8 -16
  17. xinference/deploy/local.py +1 -1
  18. xinference/deploy/supervisor.py +1 -1
  19. xinference/deploy/utils.py +1 -1
  20. xinference/deploy/worker.py +1 -1
  21. xinference/model/audio/cosyvoice.py +86 -41
  22. xinference/model/embedding/core.py +52 -31
  23. xinference/model/image/stable_diffusion/core.py +18 -1
  24. xinference/model/llm/__init__.py +21 -11
  25. xinference/model/llm/llama_cpp/core.py +16 -33
  26. xinference/model/llm/llm_family.json +619 -1297
  27. xinference/model/llm/llm_family.py +31 -52
  28. xinference/model/llm/llm_family_csghub.json +18 -35
  29. xinference/model/llm/llm_family_modelscope.json +573 -1119
  30. xinference/model/llm/lmdeploy/core.py +56 -88
  31. xinference/model/llm/mlx/core.py +46 -69
  32. xinference/model/llm/sglang/core.py +33 -18
  33. xinference/model/llm/transformers/chatglm.py +167 -305
  34. xinference/model/llm/transformers/cogvlm2.py +36 -63
  35. xinference/model/llm/transformers/cogvlm2_video.py +33 -223
  36. xinference/model/llm/transformers/core.py +49 -50
  37. xinference/model/llm/transformers/deepseek_vl.py +53 -96
  38. xinference/model/llm/transformers/glm4v.py +55 -111
  39. xinference/model/llm/transformers/intern_vl.py +39 -70
  40. xinference/model/llm/transformers/internlm2.py +32 -54
  41. xinference/model/llm/transformers/minicpmv25.py +22 -55
  42. xinference/model/llm/transformers/minicpmv26.py +158 -68
  43. xinference/model/llm/transformers/omnilmm.py +5 -28
  44. xinference/model/llm/transformers/qwen2_vl.py +208 -0
  45. xinference/model/llm/transformers/qwen_vl.py +34 -86
  46. xinference/model/llm/transformers/utils.py +32 -38
  47. xinference/model/llm/transformers/yi_vl.py +32 -72
  48. xinference/model/llm/utils.py +195 -489
  49. xinference/model/llm/vllm/core.py +153 -100
  50. xinference/model/rerank/core.py +41 -8
  51. xinference/model/rerank/model_spec.json +7 -0
  52. xinference/model/rerank/model_spec_modelscope.json +7 -1
  53. xinference/model/utils.py +1 -31
  54. xinference/thirdparty/cosyvoice/bin/export_jit.py +64 -0
  55. xinference/thirdparty/cosyvoice/bin/export_trt.py +8 -0
  56. xinference/thirdparty/cosyvoice/bin/inference.py +5 -2
  57. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +38 -22
  58. xinference/thirdparty/cosyvoice/cli/model.py +139 -26
  59. xinference/thirdparty/cosyvoice/flow/flow.py +15 -9
  60. xinference/thirdparty/cosyvoice/flow/length_regulator.py +20 -1
  61. xinference/thirdparty/cosyvoice/hifigan/generator.py +8 -4
  62. xinference/thirdparty/cosyvoice/llm/llm.py +14 -13
  63. xinference/thirdparty/cosyvoice/transformer/attention.py +7 -3
  64. xinference/thirdparty/cosyvoice/transformer/decoder.py +1 -1
  65. xinference/thirdparty/cosyvoice/transformer/embedding.py +4 -3
  66. xinference/thirdparty/cosyvoice/transformer/encoder.py +4 -2
  67. xinference/thirdparty/cosyvoice/utils/common.py +36 -0
  68. xinference/thirdparty/cosyvoice/utils/file_utils.py +16 -0
  69. xinference/thirdparty/deepseek_vl/serve/assets/Kelpy-Codos.js +100 -0
  70. xinference/thirdparty/deepseek_vl/serve/assets/avatar.png +0 -0
  71. xinference/thirdparty/deepseek_vl/serve/assets/custom.css +355 -0
  72. xinference/thirdparty/deepseek_vl/serve/assets/custom.js +22 -0
  73. xinference/thirdparty/deepseek_vl/serve/assets/favicon.ico +0 -0
  74. xinference/thirdparty/deepseek_vl/serve/examples/app.png +0 -0
  75. xinference/thirdparty/deepseek_vl/serve/examples/chart.png +0 -0
  76. xinference/thirdparty/deepseek_vl/serve/examples/mirror.png +0 -0
  77. xinference/thirdparty/deepseek_vl/serve/examples/pipeline.png +0 -0
  78. xinference/thirdparty/deepseek_vl/serve/examples/puzzle.png +0 -0
  79. xinference/thirdparty/deepseek_vl/serve/examples/rap.jpeg +0 -0
  80. xinference/thirdparty/fish_speech/fish_speech/configs/base.yaml +87 -0
  81. xinference/thirdparty/fish_speech/fish_speech/configs/firefly_gan_vq.yaml +34 -0
  82. xinference/thirdparty/fish_speech/fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  83. xinference/thirdparty/fish_speech/fish_speech/configs/text2semantic_finetune.yaml +83 -0
  84. xinference/thirdparty/fish_speech/fish_speech/datasets/protos/text-data.proto +24 -0
  85. xinference/thirdparty/fish_speech/fish_speech/i18n/README.md +27 -0
  86. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/.gitignore +114 -0
  87. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/README.md +36 -0
  88. xinference/thirdparty/fish_speech/fish_speech/webui/css/style.css +161 -0
  89. xinference/thirdparty/fish_speech/fish_speech/webui/html/footer.html +11 -0
  90. xinference/thirdparty/fish_speech/fish_speech/webui/js/animate.js +69 -0
  91. xinference/thirdparty/fish_speech/tools/sensevoice/README.md +59 -0
  92. xinference/thirdparty/matcha/VERSION +1 -0
  93. xinference/thirdparty/matcha/hifigan/LICENSE +21 -0
  94. xinference/thirdparty/matcha/hifigan/README.md +101 -0
  95. xinference/thirdparty/omnilmm/LICENSE +201 -0
  96. xinference/thirdparty/whisper/__init__.py +156 -0
  97. xinference/thirdparty/whisper/__main__.py +3 -0
  98. xinference/thirdparty/whisper/assets/gpt2.tiktoken +50256 -0
  99. xinference/thirdparty/whisper/assets/mel_filters.npz +0 -0
  100. xinference/thirdparty/whisper/assets/multilingual.tiktoken +50257 -0
  101. xinference/thirdparty/whisper/audio.py +157 -0
  102. xinference/thirdparty/whisper/decoding.py +826 -0
  103. xinference/thirdparty/whisper/model.py +314 -0
  104. xinference/thirdparty/whisper/normalizers/__init__.py +2 -0
  105. xinference/thirdparty/whisper/normalizers/basic.py +76 -0
  106. xinference/thirdparty/whisper/normalizers/english.json +1741 -0
  107. xinference/thirdparty/whisper/normalizers/english.py +550 -0
  108. xinference/thirdparty/whisper/timing.py +386 -0
  109. xinference/thirdparty/whisper/tokenizer.py +395 -0
  110. xinference/thirdparty/whisper/transcribe.py +605 -0
  111. xinference/thirdparty/whisper/triton_ops.py +109 -0
  112. xinference/thirdparty/whisper/utils.py +316 -0
  113. xinference/thirdparty/whisper/version.py +1 -0
  114. xinference/types.py +7 -49
  115. xinference/web/ui/build/asset-manifest.json +6 -6
  116. xinference/web/ui/build/index.html +1 -1
  117. xinference/web/ui/build/static/css/{main.4bafd904.css → main.632e9148.css} +2 -2
  118. xinference/web/ui/build/static/css/main.632e9148.css.map +1 -0
  119. xinference/web/ui/build/static/js/main.9cfafbd6.js +3 -0
  120. xinference/web/ui/build/static/js/{main.eb13fe95.js.LICENSE.txt → main.9cfafbd6.js.LICENSE.txt} +2 -0
  121. xinference/web/ui/build/static/js/main.9cfafbd6.js.map +1 -0
  122. xinference/web/ui/node_modules/.cache/babel-loader/01d6d198156bacbd436c51435edbd4b2cacd47a79db929105eba30f74b67d48d.json +1 -0
  123. xinference/web/ui/node_modules/.cache/babel-loader/10c69dc7a296779fcffedeff9393d832dfcb0013c36824adf623d3c518b801ff.json +1 -0
  124. xinference/web/ui/node_modules/.cache/babel-loader/59eb25f514afcc4fefd1b309d192b2455f1e0aec68a9de598ca4b2333fe2c774.json +1 -0
  125. xinference/web/ui/node_modules/.cache/babel-loader/68bede6d95bb5ef0b35bbb3ec5b8c937eaf6862c6cdbddb5ef222a7776aaf336.json +1 -0
  126. xinference/web/ui/node_modules/.cache/babel-loader/77d50223f3e734d4485cca538cb098a8c3a7a0a1a9f01f58cdda3af42fe1adf5.json +1 -0
  127. xinference/web/ui/node_modules/.cache/babel-loader/a56d5a642409a84988891089c98ca28ad0546432dfbae8aaa51bc5a280e1cdd2.json +1 -0
  128. xinference/web/ui/node_modules/.cache/babel-loader/d9ff696a3e3471f01b46c63d18af32e491eb5dc0e43cb30202c96871466df57f.json +1 -0
  129. xinference/web/ui/node_modules/.cache/babel-loader/f5039ddbeb815c51491a1989532006b96fc3ae49c6c60e3c097f875b4ae915ae.json +1 -0
  130. xinference/web/ui/node_modules/.package-lock.json +37 -0
  131. xinference/web/ui/node_modules/a-sync-waterfall/package.json +21 -0
  132. xinference/web/ui/node_modules/nunjucks/node_modules/commander/package.json +48 -0
  133. xinference/web/ui/node_modules/nunjucks/package.json +112 -0
  134. xinference/web/ui/package-lock.json +38 -0
  135. xinference/web/ui/package.json +1 -0
  136. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/METADATA +8 -8
  137. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/RECORD +141 -87
  138. xinference/model/llm/transformers/llama_2.py +0 -108
  139. xinference/web/ui/build/static/css/main.4bafd904.css.map +0 -1
  140. xinference/web/ui/build/static/js/main.eb13fe95.js +0 -3
  141. xinference/web/ui/build/static/js/main.eb13fe95.js.map +0 -1
  142. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +0 -1
  143. xinference/web/ui/node_modules/.cache/babel-loader/213b5913e164773c2b0567455377765715f5f07225fbac77ad8e1e9dc9648a47.json +0 -1
  144. xinference/web/ui/node_modules/.cache/babel-loader/5c26a23b5eacf5b752a08531577ae3840bb247745ef9a39583dc2d05ba93a82a.json +0 -1
  145. xinference/web/ui/node_modules/.cache/babel-loader/978b57d1a04a701bc3fcfebc511f5f274eed6ed7eade67f6fb76c27d5fd9ecc8.json +0 -1
  146. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/LICENSE +0 -0
  147. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/WHEEL +0 -0
  148. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/entry_points.txt +0 -0
  149. {xinference-0.14.4.post1.dist-info → xinference-0.15.0.dist-info}/top_level.txt +0 -0
xinference/_compat.py CHANGED
@@ -11,6 +11,8 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ from typing import Dict, Iterable, List, Literal, Optional, Union
+
  from pydantic.version import VERSION as PYDANTIC_VERSION

  PYDANTIC_V2 = PYDANTIC_VERSION.startswith("2.")
@@ -50,3 +52,52 @@ else:
      from pydantic.parse import load_str_bytes  # noqa: F401
      from pydantic.types import StrBytes  # noqa: F401
      from pydantic.utils import ROOT_KEY  # noqa: F401
+
+ from openai.types.chat.chat_completion_named_tool_choice_param import (
+     ChatCompletionNamedToolChoiceParam,
+ )
+ from openai.types.chat.chat_completion_stream_options_param import (
+     ChatCompletionStreamOptionsParam,
+ )
+ from openai.types.chat.chat_completion_tool_param import ChatCompletionToolParam
+
+ OpenAIChatCompletionStreamOptionsParam = create_model_from_typeddict(
+     ChatCompletionStreamOptionsParam
+ )
+ OpenAIChatCompletionToolParam = create_model_from_typeddict(ChatCompletionToolParam)
+ OpenAIChatCompletionNamedToolChoiceParam = create_model_from_typeddict(
+     ChatCompletionNamedToolChoiceParam
+ )
+
+
+ class CreateChatCompletionOpenAI(BaseModel):
+     """
+     Comes from source code: https://github.com/openai/openai-python/blob/main/src/openai/types/chat/completion_create_params.py
+     """
+
+     messages: List[Dict]
+     model: str
+     frequency_penalty: Optional[float]
+     logit_bias: Optional[Dict[str, int]]
+     logprobs: Optional[bool]
+     max_tokens: Optional[int]
+     n: Optional[int]
+     parallel_tool_calls: Optional[bool]
+     presence_penalty: Optional[float]
+     # we do not support this
+     # response_format: ResponseFormat
+     seed: Optional[int]
+     service_tier: Optional[Literal["auto", "default"]]
+     stop: Union[Optional[str], List[str]]
+     stream_options: Optional[OpenAIChatCompletionStreamOptionsParam]  # type: ignore
+     temperature: Optional[float]
+     tool_choice: Optional[  # type: ignore
+         Union[
+             Literal["none", "auto", "required"],
+             OpenAIChatCompletionNamedToolChoiceParam,
+         ]
+     ]
+     tools: Optional[Iterable[OpenAIChatCompletionToolParam]]  # type: ignore
+     top_logprobs: Optional[int]
+     top_p: Optional[float]
+     user: Optional[str]
xinference/_version.py CHANGED
@@ -8,11 +8,11 @@ import json

  version_json = '''
  {
-  "date": "2024-09-03T15:42:58+0800",
+  "date": "2024-09-06T16:29:42+0800",
   "dirty": false,
   "error": null,
-  "full-revisionid": "b1b7c44e6f0ad934eb8366d531c87f29cfa239a7",
-  "version": "0.14.4.post1"
+  "full-revisionid": "e2618be96293f112709c9ceed639a3443455a0e7",
+  "version": "0.15.0"
  }
  '''  # END VERSION_JSON

xinference/api/restful_api.py CHANGED
@@ -57,9 +57,7 @@ from ..core.event import Event, EventCollectorActor, EventType
  from ..core.supervisor import SupervisorActor
  from ..core.utils import json_dumps
  from ..types import (
-     SPECIAL_TOOL_PROMPT,
      ChatCompletion,
-     ChatCompletionMessage,
      Completion,
      CreateChatCompletion,
      CreateCompletion,
@@ -199,14 +197,14 @@ class RESTfulAPI:
      async def _get_supervisor_ref(self) -> xo.ActorRefType[SupervisorActor]:
          if self._supervisor_ref is None:
              self._supervisor_ref = await xo.actor_ref(
-                 address=self._supervisor_address, uid=SupervisorActor.uid()
+                 address=self._supervisor_address, uid=SupervisorActor.default_uid()
              )
          return self._supervisor_ref

      async def _get_event_collector_ref(self) -> xo.ActorRefType[EventCollectorActor]:
          if self._event_collector_ref is None:
              self._event_collector_ref = await xo.actor_ref(
-                 address=self._supervisor_address, uid=EventCollectorActor.uid()
+                 address=self._supervisor_address, uid=EventCollectorActor.default_uid()
              )
          return self._event_collector_ref
@@ -1627,33 +1625,7 @@
              status_code=400, detail="Invalid input. Please specify the prompt."
          )

-         system_messages: List["ChatCompletionMessage"] = []
-         system_messages_contents = []
-         non_system_messages = []
-         for msg in messages:
-             assert (
-                 msg.get("content") != SPECIAL_TOOL_PROMPT
-             ), f"Invalid message content {SPECIAL_TOOL_PROMPT}"
-             if msg["role"] == "system":
-                 system_messages_contents.append(msg["content"])
-             else:
-                 non_system_messages.append(msg)
-         system_messages.append(
-             {"role": "system", "content": ". ".join(system_messages_contents)}
-         )
-
          has_tool_message = messages[-1].get("role") == "tool"
-         if has_tool_message:
-             prompt = SPECIAL_TOOL_PROMPT
-             system_prompt = system_messages[0]["content"] if system_messages else None
-             chat_history = non_system_messages  # exclude the prompt
-         else:
-             prompt = None
-             if non_system_messages:
-                 prompt = non_system_messages[-1]["content"]
-             system_prompt = system_messages[0]["content"] if system_messages else None
-             chat_history = non_system_messages[:-1]  # exclude the prompt
-
          model_uid = body.model

          try:
@@ -1681,9 +1653,7 @@
          from ..model.llm.utils import GLM4_TOOL_CALL_FAMILY, QWEN_TOOL_CALL_FAMILY

          model_family = desc.get("model_family", "")
-         function_call_models = (
-             ["gorilla-openfunctions-v1"] + QWEN_TOOL_CALL_FAMILY + GLM4_TOOL_CALL_FAMILY
-         )
+         function_call_models = QWEN_TOOL_CALL_FAMILY + GLM4_TOOL_CALL_FAMILY

          if model_family not in function_call_models:
              if body.tools:
@@ -1716,9 +1686,7 @@
          try:
              try:
                  iterator = await model.chat(
-                     prompt,
-                     system_prompt,
-                     chat_history,
+                     messages,
                      kwargs,
                      raw_params=raw_kwargs,
                  )
@@ -1750,9 +1718,7 @@
          else:
              try:
                  data = await model.chat(
-                     prompt,
-                     system_prompt,
-                     chat_history,
+                     messages,
                      kwargs,
                      raw_params=raw_kwargs,
                  )
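
With the handler change above, /v1/chat/completions no longer splits the conversation into prompt, system_prompt and chat_history; the OpenAI-style messages list from the request body is forwarded to model.chat() as-is. A minimal sketch of exercising that endpoint over plain HTTP; the server address and model uid below are placeholders, not values taken from this diff.

import requests

# Assumes a local Xinference server and an already-launched chat model
# registered under the placeholder uid "my-llm".
resp = requests.post(
    "http://localhost:9997/v1/chat/completions",
    json={
        "model": "my-llm",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Summarize this release in one sentence."},
        ],
        "max_tokens": 128,
    },
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
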
xinference/client/restful/restful_client.py CHANGED
@@ -13,7 +13,6 @@
  # limitations under the License.
  import json
  import typing
- import warnings
  from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union

  import requests
@@ -470,9 +469,7 @@ class RESTfulGenerateModelHandle(RESTfulModelHandle):
  class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
      def chat(
          self,
-         prompt: str,
-         system_prompt: Optional[str] = None,
-         chat_history: Optional[List["ChatCompletionMessage"]] = None,
+         messages: List[Dict],
          tools: Optional[List[Dict]] = None,
          generate_config: Optional[
              Union["LlamaCppGenerateConfig", "PytorchGenerateConfig"]
@@ -483,11 +480,7 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):

          Parameters
          ----------
-         prompt: str
-             The user's input.
-         system_prompt: Optional[str]
-             The system context provide to Model prior to any chats.
-         chat_history: Optional[List["ChatCompletionMessage"]]
+         messages: List[Dict]
              A list of messages comprising the conversation so far.
          tools: Optional[List[Dict]]
              A tool list.
@@ -509,25 +502,11 @@ class RESTfulChatModelHandle(RESTfulGenerateModelHandle):
              Report the failure to generate the chat from the server. Detailed information provided in error message.

          """
-         warnings.warn(
-             "The parameters `prompt`, `system_prompt` and `chat_history` will be deprecated in version v0.15.0, "
-             "and will be replaced by the parameter `messages`, "
-             "similar to the OpenAI API: https://platform.openai.com/docs/guides/chat-completions/getting-started",
-             category=DeprecationWarning,
-             stacklevel=2,
-         )
-
          url = f"{self._base_url}/v1/chat/completions"

-         if chat_history is None:
-             chat_history = []
-
-         chat_history = handle_system_prompts(chat_history, system_prompt)
-         chat_history.append({"role": "user", "content": prompt})  # type: ignore
-
          request_body: Dict[str, Any] = {
              "model": self._model_uid,
-             "messages": chat_history,
+             "messages": messages,
          }
          if tools is not None:
              request_body["tools"] = tools
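
With the client change above, RESTfulChatModelHandle.chat() takes a single messages list and posts it unchanged. A minimal usage sketch, assuming a server at localhost:9997 and a chat model already launched under the placeholder uid "my-llm"; the response shape is the usual OpenAI-style chat completion dict.

from xinference.client import RESTfulClient

client = RESTfulClient("http://localhost:9997")
model = client.get_model("my-llm")  # RESTfulChatModelHandle for chat models

# 0.15.0 style: one OpenAI-compatible message list instead of
# prompt / system_prompt / chat_history.
completion = model.chat(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What changed in this release?"},
    ],
    generate_config={"max_tokens": 256, "temperature": 0.7},
)
print(completion["choices"][0]["message"]["content"])

Callers still holding a separate system prompt and history must build the message list themselves, as the Gradio interface below now does.
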
xinference/conftest.py CHANGED
@@ -144,7 +144,7 @@ async def _start_test_cluster(
          address=f"test://{address}", logging_conf=logging_conf
      )
      await xo.create_actor(
-         SupervisorActor, address=address, uid=SupervisorActor.uid()
+         SupervisorActor, address=address, uid=SupervisorActor.default_uid()
      )
      await start_worker_components(
          address=address,
xinference/constants.py CHANGED
@@ -38,6 +38,10 @@ def get_xinference_home() -> str:
      # if user has already set `XINFERENCE_HOME` env, change huggingface and modelscope default download path
      os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(home_path, "huggingface")
      os.environ["MODELSCOPE_CACHE"] = os.path.join(home_path, "modelscope")
+     # In multi-tenant mode,
+     # gradio's temporary files are stored in their respective home directories,
+     # to prevent insufficient permissions
+     os.environ["GRADIO_TEMP_DIR"] = os.path.join(home_path, "tmp", "gradio")
      return home_path


@@ -59,6 +63,7 @@ XINFERENCE_DEFAULT_ENDPOINT_PORT = 9997
  XINFERENCE_DEFAULT_LOG_FILE_NAME = "xinference.log"
  XINFERENCE_LOG_MAX_BYTES = 100 * 1024 * 1024
  XINFERENCE_LOG_BACKUP_COUNT = 30
+ XINFERENCE_LOG_ARG_MAX_LENGTH = 100
  XINFERENCE_HEALTH_CHECK_FAILURE_THRESHOLD = int(
      os.environ.get(XINFERENCE_ENV_HEALTH_CHECK_FAILURE_THRESHOLD, 5)
  )
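
A small illustration of the new default: when XINFERENCE_HOME is set, Gradio's temporary files are redirected under the per-user home directory alongside the huggingface and modelscope caches. The path below is a placeholder, and the helper merely mirrors what get_xinference_home() now does.

import os

def _apply_home(home_path: str) -> None:
    # mirrors the environment setup performed by get_xinference_home()
    os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.join(home_path, "huggingface")
    os.environ["MODELSCOPE_CACHE"] = os.path.join(home_path, "modelscope")
    os.environ["GRADIO_TEMP_DIR"] = os.path.join(home_path, "tmp", "gradio")

_apply_home("/data/xinference")  # placeholder XINFERENCE_HOME
print(os.environ["GRADIO_TEMP_DIR"])  # /data/xinference/tmp/gradio
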
xinference/core/cache_tracker.py CHANGED
@@ -25,7 +25,7 @@ class CacheTrackerActor(xo.Actor):
          self._model_name_to_version_info: Dict[str, List[Dict]] = {}  # type: ignore

      @classmethod
-     def uid(cls) -> str:
+     def default_uid(cls) -> str:
          return "cache_tracker"

      @staticmethod
xinference/core/chat_interface.py CHANGED
@@ -16,7 +16,7 @@ import base64
  import logging
  import os
  from io import BytesIO
- from typing import Generator, List, Optional
+ from typing import Dict, Generator, List, Optional

  import gradio as gr
  import PIL.Image
@@ -27,7 +27,6 @@ from ..client.restful.restful_client import (
      RESTfulChatModelHandle,
      RESTfulGenerateModelHandle,
  )
- from ..types import ChatCompletionMessage

  logger = logging.getLogger(__name__)

@@ -96,11 +95,11 @@ class GradioInterface:
                  flat_list += row
              return flat_list

-         def to_chat(lst: List[str]) -> List[ChatCompletionMessage]:
+         def to_chat(lst: List[str]) -> List[Dict]:
              res = []
              for i in range(len(lst)):
                  role = "assistant" if i % 2 == 1 else "user"
-                 res.append(ChatCompletionMessage(role=role, content=lst[i]))
+                 res.append(dict(role=role, content=lst[i]))
              return res

          def generate_wrapper(
@@ -116,11 +115,12 @@ class GradioInterface:
              client._set_token(self._access_token)
              model = client.get_model(self.model_uid)
              assert isinstance(model, RESTfulChatModelHandle)
+             messages = to_chat(flatten(history))
+             messages.append(dict(role="user", content=message))

              response_content = ""
              for chunk in model.chat(
-                 prompt=message,
-                 chat_history=to_chat(flatten(history)),
+                 messages,
                  generate_config={
                      "max_tokens": int(max_tokens),
                      "temperature": temperature,
@@ -191,15 +191,10 @@ class GradioInterface:
              model = client.get_model(self.model_uid)
              assert isinstance(model, RESTfulChatModelHandle)

-             prompt = history[-1]
-             assert prompt["role"] == "user"
-             prompt = prompt["content"]
-             # multimodal chat does not support stream.
              if stream:
                  response_content = ""
                  for chunk in model.chat(
-                     prompt=prompt,
-                     chat_history=history[:-1],
+                     messages=history,
                      generate_config={
                          "max_tokens": max_tokens,
                          "temperature": temperature,
@@ -224,8 +219,7 @@
                  yield history, bot
              else:
                  response = model.chat(
-                     prompt=prompt,
-                     chat_history=history[:-1],
+                     messages=history,
                      generate_config={
                          "max_tokens": max_tokens,
                          "temperature": temperature,
xinference/core/event.py CHANGED
@@ -41,7 +41,7 @@ class EventCollectorActor(xo.StatelessActor):
      )

      @classmethod
-     def uid(cls) -> str:
+     def default_uid(cls) -> str:
          return "event_collector"

      def get_model_events(self, model_uid: str) -> List[Dict]:
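
The uid() to default_uid() rename seen here also covers the supervisor, worker and cache tracker actors; callers resolve the well-known singleton actors through the new classmethod. A minimal sketch, assuming an xoscar actor pool is already running at the given address.

import xoscar as xo

from xinference.core.supervisor import SupervisorActor

async def get_supervisor_ref(address: str):
    # 0.15.0: well-known actors are looked up by their default uid
    return await xo.actor_ref(address=address, uid=SupervisorActor.default_uid())
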
xinference/core/model.py CHANGED
@@ -19,6 +19,7 @@ import json
  import os
  import time
  import types
+ import uuid
  import weakref
  from asyncio.queues import Queue
  from asyncio.tasks import wait_for
@@ -65,7 +66,12 @@ except ImportError:
      OutOfMemoryError = _OutOfMemoryError


- XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = ["qwen-vl-chat", "cogvlm2", "glm-4v"]
+ XINFERENCE_BATCHING_ALLOWED_VISION_MODELS = [
+     "qwen-vl-chat",
+     "cogvlm2",
+     "glm-4v",
+     "MiniCPM-V-2.6",
+ ]


  def request_limit(fn):
@@ -265,7 +271,7 @@ class ModelActor(xo.StatelessActor):

          if self._worker_ref is None:
              self._worker_ref = await xo.actor_ref(
-                 address=self._worker_address, uid=WorkerActor.uid()
+                 address=self._worker_address, uid=WorkerActor.default_uid()
              )
          return self._worker_ref

@@ -434,23 +440,35 @@
          assert output_type == "binary", f"Unknown output type '{output_type}'"
          return ret

-     @log_async(logger=logger)
      @request_limit
      @xo.generator
+     @log_async(logger=logger)
      async def generate(self, prompt: str, *args, **kwargs):
          if self.allow_batching():
+             # not support request_id
+             kwargs.pop("request_id", None)
              return await self.handle_batching_request(
                  prompt, "generate", *args, **kwargs
              )
          else:
              kwargs.pop("raw_params", None)
              if hasattr(self._model, "generate"):
+                 # not support request_id
+                 kwargs.pop("request_id", None)
                  return await self._call_wrapper_json(
                      self._model.generate, prompt, *args, **kwargs
                  )
              if hasattr(self._model, "async_generate"):
+                 if "request_id" not in kwargs:
+                     kwargs["request_id"] = str(uuid.uuid1())
+                 else:
+                     # model only accept string
+                     kwargs["request_id"] = str(kwargs["request_id"])
                  return await self._call_wrapper_json(
-                     self._model.async_generate, prompt, *args, **kwargs
+                     self._model.async_generate,
+                     prompt,
+                     *args,
+                     **kwargs,
                  )
              raise AttributeError(f"Model {self._model.model_spec} is not for generate.")

@@ -481,22 +499,27 @@
              yield res

      @staticmethod
-     def _get_stream_from_args(ability: str, *args) -> bool:
-         if ability == "chat":
-             assert args[2] is None or isinstance(args[2], dict)
-             return False if args[2] is None else args[2].get("stream", False)
-         else:
-             assert args[0] is None or isinstance(args[0], dict)
-             return False if args[0] is None else args[0].get("stream", False)
+     def _get_stream_from_args(*args) -> bool:
+         assert args[0] is None or isinstance(args[0], dict)
+         return False if args[0] is None else args[0].get("stream", False)

-     async def handle_batching_request(self, prompt: str, ability: str, *args, **kwargs):
-         stream = self._get_stream_from_args(ability, *args)
+     async def handle_batching_request(
+         self, prompt_or_messages: Union[str, List[Dict]], call_ability, *args, **kwargs
+     ):
+         """
+         The input parameter `prompt_or_messages`:
+         - when the model_ability is `generate`, it's `prompt`, which is str type.
+         - when the model_ability is `chat`, it's `messages`, which is List[Dict] type.
+         """
+         stream = self._get_stream_from_args(*args)
          assert self._scheduler_ref is not None
          if stream:
              assert self._scheduler_ref is not None
              queue: Queue[Any] = Queue()
              ret = self._queue_consumer(queue)
-             await self._scheduler_ref.add_request(prompt, queue, *args, **kwargs)
+             await self._scheduler_ref.add_request(
+                 prompt_or_messages, queue, call_ability, *args, **kwargs
+             )
              gen = self._to_async_gen("json", ret)
              self._current_generator = weakref.ref(gen)
              return gen
@@ -505,7 +528,9 @@

          assert self._loop is not None
          future = ConcurrentFuture()
-         await self._scheduler_ref.add_request(prompt, future, *args, **kwargs)
+         await self._scheduler_ref.add_request(
+             prompt_or_messages, future, call_ability, *args, **kwargs
+         )
          fut = asyncio.wrap_future(future, loop=self._loop)
          result = await fut
          if result == XINFERENCE_NON_STREAMING_ABORT_FLAG:
@@ -514,27 +539,36 @@
              )
          return await asyncio.to_thread(json_dumps, result)

-     @log_async(logger=logger)
      @request_limit
      @xo.generator
-     async def chat(self, prompt: str, *args, **kwargs):
+     @log_async(logger=logger)
+     async def chat(self, messages: List[Dict], *args, **kwargs):
          start_time = time.time()
          response = None
          try:
              if self.allow_batching():
+                 # not support request_id
+                 kwargs.pop("request_id", None)
                  return await self.handle_batching_request(
-                     prompt, "chat", *args, **kwargs
+                     messages, "chat", *args, **kwargs
                  )
              else:
                  kwargs.pop("raw_params", None)
                  if hasattr(self._model, "chat"):
+                     # not support request_id
+                     kwargs.pop("request_id", None)
                      response = await self._call_wrapper_json(
-                         self._model.chat, prompt, *args, **kwargs
+                         self._model.chat, messages, *args, **kwargs
                      )
                      return response
                  if hasattr(self._model, "async_chat"):
+                     if "request_id" not in kwargs:
+                         kwargs["request_id"] = str(uuid.uuid1())
+                     else:
+                         # model only accept string
+                         kwargs["request_id"] = str(kwargs["request_id"])
                      response = await self._call_wrapper_json(
-                         self._model.async_chat, prompt, *args, **kwargs
+                         self._model.async_chat, messages, *args, **kwargs
                      )
                      return response
                  raise AttributeError(f"Model {self._model.model_spec} is not for chat.")
@@ -565,9 +599,10 @@
              return await self._scheduler_ref.abort_request(request_id)
          return AbortRequestMessage.NO_OP.name

-     @log_async(logger=logger)
      @request_limit
+     @log_async(logger=logger)
      async def create_embedding(self, input: Union[str, List[str]], *args, **kwargs):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "create_embedding"):
              return await self._call_wrapper_json(
                  self._model.create_embedding, input, *args, **kwargs
@@ -577,8 +612,8 @@
              f"Model {self._model.model_spec} is not for creating embedding."
          )

-     @log_async(logger=logger)
      @request_limit
+     @log_async(logger=logger)
      async def rerank(
          self,
          documents: List[str],
@@ -590,6 +625,7 @@
          *args,
          **kwargs,
      ):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "rerank"):
              return await self._call_wrapper_json(
                  self._model.rerank,
@@ -604,8 +640,8 @@
          )
          raise AttributeError(f"Model {self._model.model_spec} is not for reranking.")

-     @log_async(logger=logger, args_formatter=lambda _, kwargs: kwargs.pop("audio"))
      @request_limit
+     @log_async(logger=logger, ignore_kwargs=["audio"])
      async def transcriptions(
          self,
          audio: bytes,
@@ -614,7 +650,9 @@
          response_format: str = "json",
          temperature: float = 0,
          timestamp_granularities: Optional[List[str]] = None,
+         **kwargs,
      ):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "transcriptions"):
              return await self._call_wrapper_json(
                  self._model.transcriptions,
@@ -629,8 +667,8 @@
              f"Model {self._model.model_spec} is not for creating transcriptions."
          )

-     @log_async(logger=logger, args_formatter=lambda _, kwargs: kwargs.pop("audio"))
      @request_limit
+     @log_async(logger=logger, ignore_kwargs=["audio"])
      async def translations(
          self,
          audio: bytes,
@@ -639,7 +677,9 @@
          response_format: str = "json",
          temperature: float = 0,
          timestamp_granularities: Optional[List[str]] = None,
+         **kwargs,
      ):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "translations"):
              return await self._call_wrapper_json(
                  self._model.translations,
@@ -654,12 +694,9 @@
              f"Model {self._model.model_spec} is not for creating translations."
          )

-     @log_async(
-         logger=logger,
-         args_formatter=lambda _, kwargs: kwargs.pop("prompt_speech", None),
-     )
      @request_limit
      @xo.generator
+     @log_async(logger=logger, ignore_kwargs=["prompt_speech"])
      async def speech(
          self,
          input: str,
@@ -669,6 +706,7 @@
          stream: bool = False,
          **kwargs,
      ):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "speech"):
              return await self._call_wrapper_binary(
                  self._model.speech,
@@ -683,8 +721,8 @@
              f"Model {self._model.model_spec} is not for creating speech."
          )

-     @log_async(logger=logger)
      @request_limit
+     @log_async(logger=logger)
      async def text_to_image(
          self,
          prompt: str,
@@ -694,6 +732,7 @@
          *args,
          **kwargs,
      ):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "text_to_image"):
              return await self._call_wrapper_json(
                  self._model.text_to_image,
@@ -708,6 +747,10 @@
              f"Model {self._model.model_spec} is not for creating image."
          )

+     @log_async(
+         logger=logger,
+         ignore_kwargs=["image"],
+     )
      async def image_to_image(
          self,
          image: "PIL.Image",
@@ -719,6 +762,7 @@
          *args,
          **kwargs,
      ):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "image_to_image"):
              return await self._call_wrapper_json(
                  self._model.image_to_image,
@@ -735,6 +779,10 @@
              f"Model {self._model.model_spec} is not for creating image."
          )

+     @log_async(
+         logger=logger,
+         ignore_kwargs=["image"],
+     )
      async def inpainting(
          self,
          image: "PIL.Image",
@@ -747,6 +795,7 @@
          *args,
          **kwargs,
      ):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "inpainting"):
              return await self._call_wrapper_json(
                  self._model.inpainting,
@@ -764,12 +813,13 @@
              f"Model {self._model.model_spec} is not for creating image."
          )

-     @log_async(logger=logger)
      @request_limit
+     @log_async(logger=logger, ignore_kwargs=["image"])
      async def infer(
          self,
          **kwargs,
      ):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "infer"):
              return await self._call_wrapper_json(
                  self._model.infer,
@@ -779,8 +829,8 @@
              f"Model {self._model.model_spec} is not for flexible infer."
          )

-     @log_async(logger=logger)
      @request_limit
+     @log_async(logger=logger)
      async def text_to_video(
          self,
          prompt: str,
@@ -788,6 +838,7 @@
          *args,
          **kwargs,
      ):
+         kwargs.pop("request_id", None)
          if hasattr(self._model, "text_to_video"):
              return await self._call_wrapper_json(
                  self._model.text_to_video,