xinference 1.8.1rc1__py3-none-any.whl → 1.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (108)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +2 -1
  3. xinference/core/model.py +8 -4
  4. xinference/core/supervisor.py +2 -3
  5. xinference/core/worker.py +7 -5
  6. xinference/deploy/cmdline.py +2 -0
  7. xinference/deploy/local.py +5 -0
  8. xinference/deploy/test/test_cmdline.py +1 -1
  9. xinference/deploy/worker.py +6 -0
  10. xinference/model/audio/cosyvoice.py +0 -1
  11. xinference/model/audio/model_spec.json +44 -20
  12. xinference/model/core.py +3 -0
  13. xinference/model/embedding/flag/core.py +5 -0
  14. xinference/model/embedding/llama_cpp/core.py +22 -19
  15. xinference/model/embedding/sentence_transformers/core.py +18 -4
  16. xinference/model/embedding/vllm/core.py +36 -9
  17. xinference/model/image/cache_manager.py +56 -0
  18. xinference/model/image/core.py +9 -0
  19. xinference/model/image/model_spec.json +178 -1
  20. xinference/model/image/stable_diffusion/core.py +155 -23
  21. xinference/model/llm/cache_manager.py +17 -3
  22. xinference/model/llm/harmony.py +245 -0
  23. xinference/model/llm/llama_cpp/core.py +41 -40
  24. xinference/model/llm/llm_family.json +688 -11
  25. xinference/model/llm/llm_family.py +1 -1
  26. xinference/model/llm/sglang/core.py +108 -5
  27. xinference/model/llm/transformers/core.py +20 -18
  28. xinference/model/llm/transformers/gemma3.py +1 -1
  29. xinference/model/llm/transformers/gpt_oss.py +91 -0
  30. xinference/model/llm/transformers/multimodal/core.py +1 -1
  31. xinference/model/llm/transformers/multimodal/gemma3.py +1 -1
  32. xinference/model/llm/transformers/multimodal/glm4_1v.py +2 -2
  33. xinference/model/llm/transformers/multimodal/ovis2.py +1 -1
  34. xinference/model/llm/transformers/multimodal/qwen-omni.py +7 -8
  35. xinference/model/llm/transformers/multimodal/qwen2_vl.py +9 -6
  36. xinference/model/llm/transformers/utils.py +1 -33
  37. xinference/model/llm/utils.py +61 -7
  38. xinference/model/llm/vllm/core.py +44 -8
  39. xinference/model/rerank/__init__.py +66 -23
  40. xinference/model/rerank/cache_manager.py +35 -0
  41. xinference/model/rerank/core.py +87 -339
  42. xinference/model/rerank/custom.py +33 -8
  43. xinference/model/rerank/model_spec.json +251 -212
  44. xinference/model/rerank/rerank_family.py +137 -0
  45. xinference/model/rerank/sentence_transformers/__init__.py +13 -0
  46. xinference/model/rerank/sentence_transformers/core.py +337 -0
  47. xinference/model/rerank/vllm/__init__.py +13 -0
  48. xinference/model/rerank/vllm/core.py +156 -0
  49. xinference/model/utils.py +108 -0
  50. xinference/model/video/model_spec.json +95 -1
  51. xinference/thirdparty/cosyvoice/bin/export_jit.py +3 -4
  52. xinference/thirdparty/cosyvoice/bin/export_onnx.py +49 -126
  53. xinference/thirdparty/cosyvoice/bin/{inference.py → inference_deprecated.py} +1 -0
  54. xinference/thirdparty/cosyvoice/bin/train.py +23 -3
  55. xinference/thirdparty/cosyvoice/cli/cosyvoice.py +8 -4
  56. xinference/thirdparty/cosyvoice/cli/frontend.py +4 -4
  57. xinference/thirdparty/cosyvoice/cli/model.py +53 -75
  58. xinference/thirdparty/cosyvoice/dataset/dataset.py +5 -18
  59. xinference/thirdparty/cosyvoice/dataset/processor.py +24 -25
  60. xinference/thirdparty/cosyvoice/flow/decoder.py +24 -433
  61. xinference/thirdparty/cosyvoice/flow/flow.py +6 -14
  62. xinference/thirdparty/cosyvoice/flow/flow_matching.py +33 -145
  63. xinference/thirdparty/cosyvoice/hifigan/generator.py +169 -1
  64. xinference/thirdparty/cosyvoice/llm/llm.py +108 -17
  65. xinference/thirdparty/cosyvoice/transformer/upsample_encoder.py +14 -115
  66. xinference/thirdparty/cosyvoice/utils/common.py +20 -0
  67. xinference/thirdparty/cosyvoice/utils/executor.py +8 -4
  68. xinference/thirdparty/cosyvoice/utils/file_utils.py +45 -1
  69. xinference/thirdparty/cosyvoice/utils/losses.py +37 -0
  70. xinference/thirdparty/cosyvoice/utils/mask.py +35 -1
  71. xinference/thirdparty/cosyvoice/utils/train_utils.py +24 -6
  72. xinference/thirdparty/cosyvoice/vllm/cosyvoice2.py +103 -0
  73. xinference/types.py +2 -0
  74. xinference/ui/gradio/chat_interface.py +2 -0
  75. xinference/ui/gradio/media_interface.py +353 -7
  76. xinference/ui/web/ui/build/asset-manifest.json +3 -3
  77. xinference/ui/web/ui/build/index.html +1 -1
  78. xinference/ui/web/ui/build/static/js/main.1086c759.js +3 -0
  79. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +1 -0
  80. xinference/ui/web/ui/node_modules/.cache/babel-loader/28012da921a51f1082549956d3ae82acd769a754b22afda9acddd98a4daf9ea4.json +1 -0
  81. xinference/ui/web/ui/node_modules/.cache/babel-loader/3c5758bd12fa334294b1de0ff6b1a4bac8d963c45472eab9dc3e530d82aa6b3f.json +1 -0
  82. xinference/ui/web/ui/node_modules/.cache/babel-loader/475936ebe725eca62a6f52ce182c06a19b2cef4df9545a05ed0591ee0c539d43.json +1 -0
  83. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +1 -0
  84. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +1 -0
  85. xinference/ui/web/ui/node_modules/.cache/babel-loader/aee5aaba26f2b1e816a3ea9efa68bad8b95695a3d80adcfd8dd57a7bb17ac71a.json +1 -0
  86. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +1 -0
  87. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +1 -0
  88. xinference/ui/web/ui/src/locales/en.json +2 -0
  89. xinference/ui/web/ui/src/locales/ja.json +2 -0
  90. xinference/ui/web/ui/src/locales/ko.json +2 -0
  91. xinference/ui/web/ui/src/locales/zh.json +2 -0
  92. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/METADATA +15 -10
  93. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/RECORD +98 -89
  94. xinference/ui/web/ui/build/static/js/main.b969199a.js +0 -3
  95. xinference/ui/web/ui/build/static/js/main.b969199a.js.map +0 -1
  96. xinference/ui/web/ui/node_modules/.cache/babel-loader/1409a96b9f9f9f5de99a89ab0f738f6da62b449521b0a8d3e4efcf7f5c23534d.json +0 -1
  97. xinference/ui/web/ui/node_modules/.cache/babel-loader/3d2a89f0eccc1f90fc5036c9a1d587c2120e6a6b128aae31d1db7d6bad52722b.json +0 -1
  98. xinference/ui/web/ui/node_modules/.cache/babel-loader/43b889c3a8e2634092ade463d52481c7c5581c72ded8f23bc5f012ea0ef8cea5.json +0 -1
  99. xinference/ui/web/ui/node_modules/.cache/babel-loader/5d47532fb42128280d87f57c8a0b02bc1930f7ef764aa7e90579247df18bba83.json +0 -1
  100. xinference/ui/web/ui/node_modules/.cache/babel-loader/830882bb275468a969614824a9ab8983f874b4581f2eb625e9c66426cdc65e5b.json +0 -1
  101. xinference/ui/web/ui/node_modules/.cache/babel-loader/8e5cb82c2ff3299c6a44563fe6b1c5515c9750613c51bb63abee0b1d70fc5019.json +0 -1
  102. xinference/ui/web/ui/node_modules/.cache/babel-loader/9df08abcb5a7c1e48a4eb25c5d5f5d7253ea6854a4397e6d74d1fd75a14acda1.json +0 -1
  103. xinference/ui/web/ui/node_modules/.cache/babel-loader/b99034986a06445701accc7a4914bb9320947435e8d4e15793392ca4f679316c.json +0 -1
  104. /xinference/ui/web/ui/build/static/js/{main.b969199a.js.LICENSE.txt → main.1086c759.js.LICENSE.txt} +0 -0
  105. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/WHEEL +0 -0
  106. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/entry_points.txt +0 -0
  107. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/licenses/LICENSE +0 -0
  108. {xinference-1.8.1rc1.dist-info → xinference-1.9.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/harmony.py (new file)
@@ -0,0 +1,245 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from copy import deepcopy
+from typing import TYPE_CHECKING, AsyncGenerator, Dict, Union
+
+if TYPE_CHECKING:
+    from ...types import ChatCompletion, ChatCompletionChunk
+
+
+class HarmonyStreamParser:
+    def __init__(self):
+        # Current channel: either 'analysis', 'final', or None if not started yet
+        self.current_channel = None
+        # Buffer for accumulating text when looking for 'assistantfinal' marker
+        self.buffer = ""
+
+    def feed(self, text):
+        """
+        Feed a chunk of text into the parser and return parsed segments.
+
+        Each segment is a dict:
+            {
+                "channel": "analysis" | "final",
+                "content": <string>
+            }
+
+        The parser detects 'assistantfinal' markers inside reasoning text,
+        splits the reasoning and final content correctly, and switches the channel.
+        """
+        segments = []
+
+        # If we are currently in 'analysis' mode
+        if self.current_channel == "analysis":
+            # Add text to buffer and check for 'assistantfinal' marker
+            self.buffer += text
+            if "assistantfinal" in self.buffer:
+                # Split reasoning and final content
+                before, after = self.buffer.split("assistantfinal", 1)
+                if before:
+                    segments.append({"channel": "analysis", "content": before})
+                # Switch to final channel
+                self.current_channel = "final"
+                self.buffer = ""
+                if after:
+                    segments.append({"channel": "final", "content": after})
+                return segments
+            else:
+                # Check if buffer ends with partial 'assistantfinal'
+                if any(
+                    self.buffer.endswith("assistantfinal"[:i])
+                    for i in range(1, len("assistantfinal") + 1)
+                ):
+                    # Don't emit anything yet, wait for more text
+                    return segments
+                else:
+                    # Emit what we have so far and keep buffer for next time
+                    if self.buffer:
+                        segments.append({"channel": "analysis", "content": self.buffer})
+                        self.buffer = ""
+                    return segments
+
+        # If we are currently in 'final' mode
+        if self.current_channel == "final":
+            # Check if this is actually a new message starting with 'analysis'
+            if text.startswith("analysis"):
+                # Reset parser state for new message
+                self.current_channel = None
+                self.buffer = ""
+                # Re-process this text with the new state
+                return self.feed(text)
+            else:
+                segments.append({"channel": "final", "content": text})
+                return segments
+
+        # If no channel has been started yet
+        if text.startswith("analysis"):
+            self.current_channel = "analysis"
+            rest = text[len("analysis") :]
+            if "assistantfinal" in rest:
+                # Split immediately if marker is found in the first chunk
+                before, after = rest.split("assistantfinal", 1)
+                if before:
+                    segments.append({"channel": "analysis", "content": before})
+                self.current_channel = "final"
+                if after:
+                    segments.append({"channel": "final", "content": after})
+            else:
+                # Start buffering for potential 'assistantfinal' marker
+                self.buffer = rest
+                # Check if buffer ends with partial 'assistantfinal'
+                if any(
+                    self.buffer.endswith("assistantfinal"[:i])
+                    for i in range(1, len("assistantfinal") + 1)
+                ):
+                    # Don't emit anything yet, wait for more text
+                    pass
+                else:
+                    # Emit what we have so far
+                    if self.buffer:
+                        segments.append({"channel": "analysis", "content": self.buffer})
+                        self.buffer = ""
+        elif text.startswith("assistantfinal"):
+            self.current_channel = "final"
+            rest = text[len("assistantfinal") :]
+            if rest:
+                segments.append({"channel": "final", "content": rest})
+
+        return segments
+
+
+async def async_stream_harmony_chat_completion(
+    chunks: Union[
+        "ChatCompletion",
+        AsyncGenerator["ChatCompletionChunk", None],
+    ],
+) -> AsyncGenerator["ChatCompletion", None]:
+    """
+    Parse Harmony-formatted content from either a full ChatCompletion (non-streaming)
+    or an async stream of ChatCompletionChunk (streaming), using the HarmonyStreamParser defined in this file.
+
+    Yields parsed objects incrementally.
+    """
+
+    # --- Non-streaming: ChatCompletion ---
+    if isinstance(chunks, dict) and chunks.get("object") == "chat.completion":
+        out_data = deepcopy(chunks)
+
+        for choice in out_data["choices"]:
+            parser = HarmonyStreamParser()
+            msg = choice["message"]
+
+            # Backup original content & reasoning
+            original_content = msg.get("content") or ""
+            original_reasoning = msg.get("reasoning_content") or ""
+
+            # Reset fields before parsing
+            msg["content"] = ""
+            msg["reasoning_content"] = ""
+            msg.setdefault("tool_calls", [])
+
+            # Feed original content
+            for seg in parser.feed(original_content):
+                ch, c = seg["channel"], seg["content"]
+                if ch == "final":
+                    msg["content"] += c
+                elif ch == "analysis":
+                    msg["reasoning_content"] += c
+                elif ch == "tool":
+                    msg["tool_calls"].append(c)
+
+            # Feed original reasoning_content
+            for seg in parser.feed(original_reasoning):
+                if seg["channel"] == "analysis":
+                    msg["reasoning_content"] += seg["content"]
+                elif seg["channel"] == "tool":
+                    msg["tool_calls"].append(seg["content"])
+
+            # Clean up reasoning_content: set to None if no reasoning content was parsed
+            if not msg["reasoning_content"] and not original_reasoning:
+                msg["reasoning_content"] = None  # type: ignore
+
+        yield out_data
+
+    else:
+        # Streaming: handle async generator
+        parsers_per_choice = {}
+
+        async for chunk in chunks:  # type: ignore
+            out_chunk = {  # type: ignore
+                "id": chunk["id"],
+                "model": chunk["model"],
+                "object": chunk["object"],
+                "created": chunk["created"],
+                "choices": [],
+            }
+
+            for i, choice in enumerate(chunk["choices"]):
+                delta = choice.get("delta", {})
+                text = delta.get("content") or ""  # type: ignore
+
+                if i not in parsers_per_choice:
+                    parsers_per_choice[i] = HarmonyStreamParser()
+
+                # Feed text to parser and collect current delta only
+                curr_delta: Dict[str, object] = {
+                    "content": "",
+                    "reasoning_content": "",
+                    "tool_calls": [],
+                }
+
+                for seg in parsers_per_choice[i].feed(text):
+                    ch = seg["channel"]
+                    c = seg["content"]
+                    if ch == "final":
+                        curr_delta["content"] += c  # type: ignore
+                    elif ch == "analysis":
+                        curr_delta["reasoning_content"] += c  # type: ignore
+                    elif ch == "tool":
+                        curr_delta["tool_calls"].append(c)  # type: ignore
+
+                if curr_delta["reasoning_content"]:
+                    if not curr_delta["content"]:
+                        curr_delta["content"] = None
+
+                elif curr_delta["content"]:
+                    if not curr_delta["reasoning_content"]:
+                        curr_delta["reasoning_content"] = None
+
+                elif (
+                    choice.get("finish_reason") is not None
+                    and not curr_delta["reasoning_content"]
+                ):
+                    # For the final chunk, if there's no new reasoning content,
+                    # don't include empty reasoning_content to avoid clearing existing state
+                    curr_delta["reasoning_content"] = None
+
+                out_chunk["choices"].append(  # type: ignore
+                    {
+                        "index": i,
+                        "delta": curr_delta,
+                        "finish_reason": choice.get("finish_reason"),
+                    }
+                )
+
+            # Only yield if we have either content or reasoning_content
+            has_content = any(
+                choice["delta"].get("content")  # type: ignore
+                or choice["delta"].get("reasoning_content")  # type: ignore
+                or choice.get("finish_reason") is not None  # type: ignore
+                for choice in out_chunk["choices"]  # type: ignore
+            )
+            if has_content:
+                yield out_chunk  # type: ignore
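Note (not part of the diff): a minimal usage sketch of the HarmonyStreamParser added above, assuming xinference 1.9.1 is installed so that xinference.model.llm.harmony is importable. The sample deltas and the printed output are illustrative; only the class and its feed() method come from the file shown above.

from xinference.model.llm.harmony import HarmonyStreamParser

parser = HarmonyStreamParser()
# Simulated streaming deltas in gpt-oss/Harmony style: reasoning arrives on the
# "analysis" channel, then the "assistantfinal" marker switches to the answer.
deltas = [
    "analysisThe user greets us. ",
    "We should reply politely.assistant",    # marker split across two deltas
    "finalHello! How can I help you today?",
]
for delta in deltas:
    for seg in parser.feed(delta):
        print(seg["channel"], "->", repr(seg["content"]))

# The second delta emits nothing: it ends with a partial "assistantfinal"
# marker, so the parser buffers it until the marker completes. Output:
#   analysis -> 'The user greets us. '
#   analysis -> 'We should reply politely.'
#   final -> 'Hello! How can I help you today?'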
xinference/model/llm/llama_cpp/core.py
@@ -19,11 +19,11 @@ import pprint
 import queue
 from typing import Iterator, List, Optional, Union
 
-import orjson
+from packaging import version
 
 from ....constants import XINFERENCE_MAX_TOKENS
 from ....types import ChatCompletion, ChatCompletionChunk, Completion, CompletionChunk
-from ..core import LLM
+from ..core import LLM, chat_context_var
 from ..llm_family import LLMFamilyV2, LLMSpecV1
 from ..utils import ChatModelMixin
 
@@ -98,10 +98,19 @@ class XllamaCppModel(LLM, ChatModelMixin):
             from xllamacpp import (
                 CommonParams,
                 Server,
+                __version__,
                 estimate_gpu_layers,
                 get_device_info,
                 ggml_backend_dev_type,
             )
+
+            try:
+                if version.parse(__version__) < version.parse("0.2.0"):
+                    raise RuntimeError(
+                        "Please update xllamacpp to >= 0.2.0 by `pip install -U xllamacpp`"
+                    )
+            except version.InvalidVersion:
+                pass  # If the version parse failed, we just skip the version check.
         except ImportError:
             error_message = "Failed to import module 'xllamacpp'"
             installation_guide = ["Please make sure 'xllamacpp' is installed. "]
@@ -160,6 +169,7 @@ class XllamaCppModel(LLM, ChatModelMixin):
             params.mmproj.path = mmproj
         if self.model_family.chat_template:
             params.chat_template = self.model_family.chat_template
+            params.use_jinja = True
         # This is the default value, could be overwritten by _llamacpp_model_config
         params.n_parallel = min(8, os.cpu_count() or 1)
         for k, v in self._llamacpp_model_config.items():
@@ -208,7 +218,8 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 )
                 logger.info("Estimate num gpu layers: %s", estimate)
                 if estimate.tensor_split:
-                    params.tensor_split = estimate.tensor_split
+                    for i in range(len(estimate.tensor_split)):
+                        params.tensor_split[i] = estimate.tensor_split[i]
                 else:
                     params.n_gpu_layers = estimate.layers
         except Exception as e:
@@ -242,28 +253,18 @@ class XllamaCppModel(LLM, ChatModelMixin):
                 {
                     "prompt": prompt,
                     "stream": stream,
+                    "model": self.model_uid,
                 }
             )
-            prompt_json = orjson.dumps(data)
-
-            def _error_callback(err):
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:
 
-            def _ok_callback(ok):
-                try:
-                    res = orjson.loads(ok)
-                    res["model"] = self.model_uid
-                    q.put(res)
-                except Exception as e:
-                    logger.exception("handle_completions callback failed: %s", e)
-                    q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)
 
-            try:
-                self._llm.handle_completions(prompt_json, _error_callback, _ok_callback)
+                self._llm.handle_completions(data, _callback)
             except Exception as ex:
                 logger.exception("handle_completions failed: %s", ex)
                 q.put(_Error(str(ex)))
@@ -296,6 +297,15 @@ class XllamaCppModel(LLM, ChatModelMixin):
         if not generate_config.get("max_tokens") and XINFERENCE_MAX_TOKENS:
             generate_config["max_tokens"] = XINFERENCE_MAX_TOKENS
         stream = generate_config.get("stream", False)
+
+        chat_template_kwargs = (
+            self._get_chat_template_kwargs_from_generate_config(
+                generate_config, self.reasoning_parser
+            )
+            or {}
+        )
+        chat_context_var.set(chat_template_kwargs)
+
         tools = generate_config.pop("tools", []) if generate_config else None
         q: queue.Queue = queue.Queue()
 
@@ -310,30 +320,21 @@ class XllamaCppModel(LLM, ChatModelMixin):
                     "messages": messages,
                     "stream": stream,
                    "tools": tools,
+                    "model": self.model_uid,
                 }
             )
-            prompt_json = orjson.dumps(data)
+            if chat_template_kwargs:
+                data["chat_template_kwargs"] = chat_template_kwargs
 
-            def _error_callback(err):
-                try:
-                    msg = orjson.loads(err)
-                    q.put(_Error(msg))
-                except Exception as e:
-                    q.put(_Error(str(e)))
+            try:
 
-            def _ok_callback(ok):
-                try:
-                    res = orjson.loads(ok)
-                    res["model"] = self.model_uid
-                    q.put(res)
-                except Exception as e:
-                    logger.exception("handle_chat_completions callback failed: %s", e)
-                    q.put(_Error(str(e)))
+                def _callback(res):
+                    if res.get("code"):
+                        q.put(_Error(res))
+                    else:
+                        q.put(res)
 
-            try:
-                self._llm.handle_chat_completions(
-                    prompt_json, _error_callback, _ok_callback
-                )
+                self._llm.handle_chat_completions(data, _callback)
             except Exception as ex:
                 logger.exception("handle_chat_completions failed: %s", ex)
                 q.put(_Error(str(ex)))
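Note (not part of the diff): the hunks above replace the old orjson-serialized prompt plus (_error_callback, _ok_callback) pair with a single dict payload and one callback whose error results carry a "code" field, bridged back to the caller through a queue.Queue. Below is a minimal, self-contained sketch of that callback-to-queue pattern; fake_handle_completions is a stand-in invented purely for illustration, and only the handle_completions(data, _callback) call shape and the "code" check are taken from the diff.

import queue
import threading


def fake_handle_completions(data, callback):
    """Illustrative stand-in for a callback-driven API such as
    xllamacpp's Server.handle_completions(data, callback)."""

    def _run():
        for i in range(3):
            callback({"model": data["model"], "choices": [{"text": f"chunk {i} "}]})
        callback({"done": True})  # sentinel marking the end of the stream

    threading.Thread(target=_run, daemon=True).start()


def stream_results(data):
    """Bridge the callback into a blocking generator via a queue,
    mirroring the pattern used in XllamaCppModel.generate()/chat()."""
    q: queue.Queue = queue.Queue()

    def _callback(res):
        # In the diff, results carrying a "code" field are wrapped in _Error;
        # here we simply forward everything to the consumer.
        q.put(res)

    fake_handle_completions(data, _callback)
    while True:
        res = q.get()
        if res.get("done"):
            break
        yield res


for res in stream_results({"prompt": "hi", "stream": True, "model": "demo"}):
    print(res)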