xinference 0.10.3__py3-none-any.whl → 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of xinference might be problematic.

Files changed (101)
  1. xinference/_version.py +3 -3
  2. xinference/api/oauth2/auth_service.py +1 -1
  3. xinference/api/restful_api.py +53 -61
  4. xinference/client/restful/restful_client.py +52 -57
  5. xinference/conftest.py +1 -1
  6. xinference/core/cache_tracker.py +1 -1
  7. xinference/core/chat_interface.py +10 -4
  8. xinference/core/event.py +1 -1
  9. xinference/core/model.py +17 -6
  10. xinference/core/status_guard.py +1 -1
  11. xinference/core/supervisor.py +58 -72
  12. xinference/core/worker.py +68 -101
  13. xinference/deploy/cmdline.py +166 -1
  14. xinference/deploy/test/test_cmdline.py +2 -0
  15. xinference/deploy/utils.py +1 -1
  16. xinference/device_utils.py +29 -3
  17. xinference/fields.py +7 -1
  18. xinference/model/audio/whisper.py +88 -12
  19. xinference/model/core.py +2 -2
  20. xinference/model/image/__init__.py +29 -0
  21. xinference/model/image/core.py +6 -0
  22. xinference/model/image/custom.py +109 -0
  23. xinference/model/llm/__init__.py +92 -32
  24. xinference/model/llm/core.py +57 -102
  25. xinference/model/llm/ggml/chatglm.py +98 -13
  26. xinference/model/llm/ggml/llamacpp.py +49 -2
  27. xinference/model/llm/ggml/tools/convert_ggml_to_gguf.py +2 -2
  28. xinference/model/llm/llm_family.json +438 -7
  29. xinference/model/llm/llm_family.py +45 -41
  30. xinference/model/llm/llm_family_modelscope.json +258 -5
  31. xinference/model/llm/pytorch/chatglm.py +48 -0
  32. xinference/model/llm/pytorch/core.py +23 -6
  33. xinference/model/llm/pytorch/deepseek_vl.py +115 -33
  34. xinference/model/llm/pytorch/internlm2.py +32 -1
  35. xinference/model/llm/pytorch/qwen_vl.py +94 -12
  36. xinference/model/llm/pytorch/utils.py +38 -1
  37. xinference/model/llm/pytorch/yi_vl.py +96 -51
  38. xinference/model/llm/sglang/core.py +31 -9
  39. xinference/model/llm/utils.py +54 -20
  40. xinference/model/llm/vllm/core.py +101 -7
  41. xinference/thirdparty/omnilmm/chat.py +2 -1
  42. xinference/thirdparty/omnilmm/model/omnilmm.py +2 -1
  43. xinference/types.py +11 -0
  44. xinference/web/ui/build/asset-manifest.json +6 -3
  45. xinference/web/ui/build/index.html +1 -1
  46. xinference/web/ui/build/static/css/main.54bca460.css +2 -0
  47. xinference/web/ui/build/static/css/main.54bca460.css.map +1 -0
  48. xinference/web/ui/build/static/js/main.551aa479.js +3 -0
  49. xinference/web/ui/build/static/js/{main.26fdbfbe.js.LICENSE.txt → main.551aa479.js.LICENSE.txt} +7 -0
  50. xinference/web/ui/build/static/js/main.551aa479.js.map +1 -0
  51. xinference/web/ui/node_modules/.cache/babel-loader/0b11a5339468c13b2d31ac085e7effe4303259b2071abd46a0a8eb8529233a5e.json +1 -0
  52. xinference/web/ui/node_modules/.cache/babel-loader/1fa824d82b2af519de7700c594e50bde4bbca60d13bd3fabff576802e4070304.json +1 -0
  53. xinference/web/ui/node_modules/.cache/babel-loader/23caf6f1e52c43e983ca3bfd4189f41dbd645fa78f2dfdcd7f6b69bc41678665.json +1 -0
  54. xinference/web/ui/node_modules/.cache/babel-loader/29dda700ab913cf7f2cfabe450ddabfb283e96adfa3ec9d315b2fa6c63cd375c.json +1 -0
  55. xinference/web/ui/node_modules/.cache/babel-loader/2c63e940b945fd5817157e08a42b889b30d668ea4c91332f48ef2b1b9d26f520.json +1 -0
  56. xinference/web/ui/node_modules/.cache/babel-loader/4135fe8745434cbce6438d1ebfa47422e0c77d884db4edc75c8bf32ea1d50621.json +1 -0
  57. xinference/web/ui/node_modules/.cache/babel-loader/46b6dd1f6d1109cd0e2455a0ea0be3e9bda1097cd4ebec9c4040070372671cfc.json +1 -0
  58. xinference/web/ui/node_modules/.cache/babel-loader/4de0a71074f9cbe1e7862750dcdd08cbc1bae7d9d9849a78b1783ca670017b3c.json +1 -0
  59. xinference/web/ui/node_modules/.cache/babel-loader/53f6c0c0afb51265cd8fb940daeb65523501879ac2a8c03a1ead22b9793c5041.json +1 -0
  60. xinference/web/ui/node_modules/.cache/babel-loader/8ccbb839002bc5bc03e0a0e7612362bf92f6ae64f87e094f8682d6a6fe4619bb.json +1 -0
  61. xinference/web/ui/node_modules/.cache/babel-loader/97ed30d6e22cf76f0733651e2c18364689a01665d0b5fe811c1b7ca3eb713c82.json +1 -0
  62. xinference/web/ui/node_modules/.cache/babel-loader/9c0c70f1838913aaa792a0d2260f17f90fd177b95698ed46b7bc3050eb712c1c.json +1 -0
  63. xinference/web/ui/node_modules/.cache/babel-loader/9cfd33238ca43e5bf9fc7e442690e8cc6027c73553db36de87e3597ed524ee4b.json +1 -0
  64. xinference/web/ui/node_modules/.cache/babel-loader/a6da6bc3d0d2191adebee87fb58ecebe82d071087bd2f7f3a9c7fdd2ada130f2.json +1 -0
  65. xinference/web/ui/node_modules/.cache/babel-loader/ada71518a429f821a9b1dea38bc951447f03c8db509887e0980b893acac938f3.json +1 -0
  66. xinference/web/ui/node_modules/.cache/babel-loader/b6c9558d28b5972bb8b2691c5a76a2c8814a815eb3443126da9f49f7d6a0c118.json +1 -0
  67. xinference/web/ui/node_modules/.cache/babel-loader/bb0f721c084a4d85c09201c984f02ee8437d3b6c5c38a57cb4a101f653daef1b.json +1 -0
  68. xinference/web/ui/node_modules/.package-lock.json +33 -0
  69. xinference/web/ui/node_modules/clipboard/.babelrc.json +11 -0
  70. xinference/web/ui/node_modules/clipboard/.eslintrc.json +24 -0
  71. xinference/web/ui/node_modules/clipboard/.prettierrc.json +9 -0
  72. xinference/web/ui/node_modules/clipboard/bower.json +18 -0
  73. xinference/web/ui/node_modules/clipboard/composer.json +25 -0
  74. xinference/web/ui/node_modules/clipboard/package.json +63 -0
  75. xinference/web/ui/node_modules/delegate/package.json +31 -0
  76. xinference/web/ui/node_modules/good-listener/bower.json +11 -0
  77. xinference/web/ui/node_modules/good-listener/package.json +35 -0
  78. xinference/web/ui/node_modules/select/bower.json +13 -0
  79. xinference/web/ui/node_modules/select/package.json +29 -0
  80. xinference/web/ui/node_modules/tiny-emitter/package.json +53 -0
  81. xinference/web/ui/package-lock.json +34 -0
  82. xinference/web/ui/package.json +1 -0
  83. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/METADATA +13 -12
  84. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/RECORD +88 -67
  85. xinference/client/oscar/__init__.py +0 -13
  86. xinference/client/oscar/actor_client.py +0 -611
  87. xinference/model/llm/pytorch/spec_decoding_utils.py +0 -531
  88. xinference/model/llm/pytorch/spec_model.py +0 -186
  89. xinference/web/ui/build/static/js/main.26fdbfbe.js +0 -3
  90. xinference/web/ui/build/static/js/main.26fdbfbe.js.map +0 -1
  91. xinference/web/ui/node_modules/.cache/babel-loader/1870cd6f7054d04e049e363c0a85526584fe25519378609d2838e28d7492bbf1.json +0 -1
  92. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +0 -1
  93. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +0 -1
  94. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +0 -1
  95. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +0 -1
  96. xinference/web/ui/node_modules/.cache/babel-loader/f4d5d1a41892a754c1ee0237450d804b20612d1b657945b59e564161ea47aa7a.json +0 -1
  97. xinference/web/ui/node_modules/.cache/babel-loader/fad4cd70de36ef6e6d5f8fd74a10ded58d964a8a91ef7681693fbb8376552da7.json +0 -1
  98. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/LICENSE +0 -0
  99. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/WHEEL +0 -0
  100. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/entry_points.txt +0 -0
  101. {xinference-0.10.3.dist-info → xinference-0.11.1.dist-info}/top_level.txt +0 -0
xinference/model/llm/pytorch/internlm2.py

@@ -108,6 +108,12 @@ class Internlm2PytorchChatModel(PytorchChatModel):
  kwargs["max_length"] = int(max_new_tokens)

  stream = generate_config.get("stream", False)
+ stream_options = generate_config.pop("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
  if chat_history:
  input_history = [
  (chat_history[i]["content"], (chat_history[i + 1]["content"]))
@@ -122,9 +128,15 @@ class Internlm2PytorchChatModel(PytorchChatModel):
  def _stream_generator():
  last_chunk_text_length = 0
  chunk_id = "chat-" + str(uuid.uuid1())
+ prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+ inputs = self._tokenizer([prompt], return_tensors="pt")
+ inputs = inputs.to(self._model.device)
+ prompt_tokens = len(inputs["input_ids"][0])
  for chunk_text, _ in self._model.stream_chat(
- self._tokenizer, prompt, input_history, **kwargs
+ self._tokenizer, prompt, chat_history, **kwargs
  ):
+ completion_tokens = completion_tokens + 1
+ total_tokens = prompt_tokens + completion_tokens
  chunk_text = chunk_text[last_chunk_text_length:]
  last_chunk_text_length += len(chunk_text)
  completion_choice = CompletionChoice(
@@ -136,7 +148,26 @@ class Internlm2PytorchChatModel(PytorchChatModel):
  created=int(time.time()),
  model=self.model_uid,
  choices=[completion_choice],
+ usage=CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ ),
+ )
+ if include_usage:
+ chunk = CompletionChunk(
+ id=chunk_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[],
+ )
+ chunk["usage"] = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
  )
+ yield chunk

  return self._to_chat_completion_chunks(_stream_generator())
  else:
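
The hunks above are one instance of a pattern repeated throughout this release: `stream_options` is popped from the generate config, `include_usage` is read from it only when the value is a dict, and, when requested, a trailing chunk with an empty `choices` list and a `usage` payload is emitted after the regular chunks. Below is a minimal, self-contained sketch of that pattern using plain dicts instead of the library's CompletionChunk/CompletionUsage types; the `stream_with_usage` name and the dict shapes are illustrative, not xinference API.

import time
import uuid
from typing import Any, Dict, Iterator, Optional


def stream_with_usage(
    texts: Iterator[str],
    prompt_tokens: int,
    model_uid: str,
    stream_options: Optional[Dict[str, Any]] = None,
) -> Iterator[Dict[str, Any]]:
    # Only a dict-shaped stream_options with include_usage set enables the
    # trailing usage-only chunk, matching the checks added in the diff.
    include_usage = (
        bool(stream_options.get("include_usage"))
        if isinstance(stream_options, dict)
        else False
    )
    chunk_id = "chat-" + str(uuid.uuid1())
    completion_tokens = 0
    for text in texts:
        completion_tokens += 1  # the diff counts one token per yielded chunk
        yield {
            "id": chunk_id,
            "object": "text_completion",
            "created": int(time.time()),
            "model": model_uid,
            "choices": [{"index": 0, "text": text, "finish_reason": None}],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }
    if include_usage:
        # Terminal chunk: empty choices, usage only.
        yield {
            "id": chunk_id,
            "object": "text_completion",
            "created": int(time.time()),
            "model": model_uid,
            "choices": [],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }

Clients that do not ask for usage never see a chunk with an empty `choices` list, which keeps the default streaming behaviour unchanged.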
xinference/model/llm/pytorch/qwen_vl.py

@@ -22,9 +22,11 @@ from typing import Dict, Iterator, List, Optional, Union
  from ....model.utils import select_device
  from ....types import (
  ChatCompletion,
- ChatCompletionChoice,
  ChatCompletionChunk,
  ChatCompletionMessage,
+ Completion,
+ CompletionChoice,
+ CompletionChunk,
  CompletionUsage,
  )
  from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -116,10 +118,6 @@ class QwenVLChatModel(PytorchChatModel):
  chat_history: Optional[List[ChatCompletionMessage]] = None,
  generate_config: Optional[PytorchGenerateConfig] = None,
  ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
- if generate_config and generate_config.get("stream"):
- raise Exception(
- f"Chat with model {self.model_family.model_name} does not support stream."
- )
  prompt = self._message_content_to_qwen(prompt)
  # Convert openai history to qwen vl history
  qwen_history = []
@@ -134,22 +132,106 @@ class QwenVLChatModel(PytorchChatModel):
  if len(query_to_response) == 2:
  qwen_history.append(query_to_response)
  query_to_response = []
+
+ stream = generate_config.get("stream", False) if generate_config else False
+ stream_options = (
+ generate_config.pop("stream_options", None) if generate_config else None
+ )
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
+ if stream:
+ it = self._generate_stream(prompt, qwen_history, include_usage)
+ return self._to_chat_completion_chunks(it)
+ else:
+ c = self._generate(prompt, qwen_history)
+ return self._to_chat_completion(c)
+
+ def _generate(self, prompt: str, qwen_history: List) -> Completion:
  response, history = self._model.chat(
  self._tokenizer, query=prompt, history=qwen_history
  )
- return ChatCompletion(
- id="chat" + str(uuid.uuid1()),
- object="chat.completion",
+ c = Completion(
+ id=str(uuid.uuid1()),
+ object="text_completion",
  created=int(time.time()),
  model=self.model_uid,
  choices=[
- ChatCompletionChoice(
- index=0,
- message={"role": "assistant", "content": response},
- finish_reason="stop",
+ CompletionChoice(
+ index=0, text=response, finish_reason="stop", logprobs=None
  )
  ],
  usage=CompletionUsage(
  prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
  ),
  )
+ return c
+
+ def _generate_stream(
+ self, prompt: str, qwen_history: List, include_usage
+ ) -> Iterator[CompletionChunk]:
+ # response, history = model.chat(tokenizer, message, history=history)
+ response_generator = self._model.chat_stream(
+ self._tokenizer, query=prompt, history=qwen_history
+ )
+ completion_id = str(uuid.uuid1())
+ prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+ input_ids = self._tokenizer(prompt, allowed_special="all").input_ids
+ prompt_tokens = len(input_ids)
+ full_response = ""
+ for response in response_generator:
+ inc_content = response[len(full_response) :]
+ full_response = response
+ completion_choice = CompletionChoice(
+ text=inc_content, index=0, logprobs=None, finish_reason=None
+ )
+ completion_chunk = CompletionChunk(
+ id=completion_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[completion_choice],
+ )
+ completion_tokens = completion_tokens + 1
+ total_tokens = prompt_tokens + completion_tokens
+ completion_usage = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ completion_chunk["usage"] = completion_usage
+ yield completion_chunk
+
+ completion_choice = CompletionChoice(
+ text="", index=0, logprobs=None, finish_reason="stop"
+ )
+ completion_chunk = CompletionChunk(
+ id=completion_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[completion_choice],
+ )
+ completion_usage = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ completion_chunk["usage"] = completion_usage
+ yield completion_chunk
+ if include_usage:
+ chunk = CompletionChunk(
+ id=completion_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[],
+ )
+ chunk["usage"] = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ yield chunk
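
Qwen-VL's `chat_stream` yields the cumulative response text rather than deltas, so the new `_generate_stream` slices off the part already seen (`inc_content = response[len(full_response):]`). A tiny standalone sketch of that prefix-diff step (the `cumulative_to_deltas` name is made up for illustration):

from typing import Iterable, Iterator


def cumulative_to_deltas(cumulative: Iterable[str]) -> Iterator[str]:
    # Each item is the full response so far; emit only the newly added suffix.
    seen = ""
    for full in cumulative:
        delta = full[len(seen):]
        seen = full
        if delta:
            yield delta


# ["He", "Hello", "Hello!"] -> ["He", "llo", "!"]
assert list(cumulative_to_deltas(["He", "Hello", "Hello!"])) == ["He", "llo", "!"]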
xinference/model/llm/pytorch/utils.py

@@ -106,6 +106,10 @@ def generate_stream(
  context_len = get_context_length(model.config)
  stream_interval = generate_config.get("stream_interval", 2)
  stream = generate_config.get("stream", False)
+ stream_options = generate_config.pop("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"] if isinstance(stream_options, dict) else False
+ )

  len_prompt = len(prompt)

@@ -333,6 +337,21 @@ def generate_stream(

  yield completion_chunk, completion_usage

+ if include_usage:
+ completion_chunk = CompletionChunk(
+ id=str(uuid.uuid1()),
+ object="text_completion",
+ created=int(time.time()),
+ model=model_uid,
+ choices=[],
+ )
+ completion_usage = CompletionUsage(
+ prompt_tokens=input_echo_len,
+ completion_tokens=i,
+ total_tokens=(input_echo_len + i),
+ )
+ yield completion_chunk, completion_usage
+
  # clean
  del past_key_values, out
  gc.collect()
@@ -352,7 +371,10 @@ def generate_stream_falcon(
  context_len = get_context_length(model.config)
  stream_interval = generate_config.get("stream_interval", 2)
  stream = generate_config.get("stream", False)
-
+ stream_options = generate_config.pop("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"] if isinstance(stream_options, dict) else False
+ )
  len_prompt = len(prompt)

  temperature = float(generate_config.get("temperature", 1.0))
@@ -488,6 +510,21 @@ def generate_stream_falcon(

  yield completion_chunk, completion_usage

+ if include_usage:
+ completion_chunk = CompletionChunk(
+ id=str(uuid.uuid1()),
+ object="text_completion",
+ created=int(time.time()),
+ model=model_uid,
+ choices=[],
+ )
+ completion_usage = CompletionUsage(
+ prompt_tokens=input_echo_len,
+ completion_tokens=i,
+ total_tokens=(input_echo_len + i),
+ )
+ yield completion_chunk, completion_usage
+
  # clean
  gc.collect()
  empty_cache()
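
From a client's point of view, these backend changes surface through the OpenAI-compatible REST API: a streaming request that sets `stream_options={"include_usage": True}` should end with one extra chunk whose `choices` list is empty and whose `usage` field carries the token counts. A hedged sketch with the `openai` Python client; the base URL, port, and model UID are placeholders for a local xinference deployment, and a client version that supports `stream_options` is assumed:

from openai import OpenAI

# Placeholder endpoint and model UID for a locally running xinference server.
client = OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")

stream = client.chat.completions.create(
    model="my-model-uid",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")
    elif chunk.usage is not None:
        # The final usage-only chunk added in this release.
        print(
            "\nprompt/completion/total:",
            chunk.usage.prompt_tokens,
            chunk.usage.completion_tokens,
            chunk.usage.total_tokens,
        )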
xinference/model/llm/pytorch/yi_vl.py

@@ -27,9 +27,11 @@ from PIL import Image
  from ....model.utils import select_device
  from ....types import (
  ChatCompletion,
- ChatCompletionChoice,
  ChatCompletionChunk,
  ChatCompletionMessage,
+ Completion,
+ CompletionChoice,
+ CompletionChunk,
  CompletionUsage,
  )
  from ..llm_family import LLMFamilyV1, LLMSpecV1
@@ -122,38 +124,6 @@ class YiVLChatModel(PytorchChatModel):
  raise RuntimeError("Only one image per message is supported by Yi VL.")
  return content

- @staticmethod
- def _parse_text(text):
- lines = text.split("\n")
- lines = [line for line in lines if line != ""]
- count = 0
- for i, line in enumerate(lines):
- if "```" in line:
- count += 1
- items = line.split("`")
- if count % 2 == 1:
- lines[i] = f'<pre><code class="language-{items[-1]}">'
- else:
- lines[i] = f"<br></code></pre>"
- else:
- if i > 0:
- if count % 2 == 1:
- line = line.replace("`", r"\`")
- line = line.replace("<", "&lt;")
- line = line.replace(">", "&gt;")
- line = line.replace(" ", "&nbsp;")
- line = line.replace("*", "&ast;")
- line = line.replace("_", "&lowbar;")
- line = line.replace("-", "&#45;")
- line = line.replace(".", "&#46;")
- line = line.replace("!", "&#33;")
- line = line.replace("(", "&#40;")
- line = line.replace(")", "&#41;")
- line = line.replace("$", "&#36;")
- lines[i] = "<br>" + line
- text = "".join(lines)
- return text
-
  def chat(
  self,
  prompt: Union[str, List[Dict]],
@@ -164,12 +134,18 @@ class YiVLChatModel(PytorchChatModel):
  from transformers import TextIteratorStreamer

  # TODO(codingl2k1): implement stream mode.
- if generate_config and generate_config.get("stream"):
- raise Exception(
- f"Chat with model {self.model_family.model_name} does not support stream."
- )
+
  if not generate_config:
  generate_config = {}
+
+ stream = generate_config.get("stream", False)
+ stream_options = generate_config.pop("stream_options", None)
+ include_usage = (
+ stream_options["include_usage"]
+ if isinstance(stream_options, dict)
+ else False
+ )
+
  from ....thirdparty.llava.conversation import conv_templates
  from ....thirdparty.llava.mm_utils import (
  KeywordsStoppingCriteria,
@@ -196,11 +172,11 @@ class YiVLChatModel(PytorchChatModel):
  )

  images = state.get_images(return_pil=True)
- image = images[0]
-
- image_tensor = self._image_processor.preprocess(image, return_tensors="pt")[
- "pixel_values"
- ][0]
+ if images:
+ image = images[0]
+ image_tensor = self._image_processor.preprocess(image, return_tensors="pt")[
+ "pixel_values"
+ ][0]

  stop_str = state.sep
  keywords = [stop_str]
@@ -217,7 +193,9 @@ class YiVLChatModel(PytorchChatModel):
  "input_ids": input_ids,
  "images": image_tensor.unsqueeze(0)
  .to(dtype=torch.bfloat16)
- .to(self._model.device),
+ .to(self._model.device)
+ if images
+ else None,
  "streamer": streamer,
  "do_sample": True,
  "top_p": float(top_p),
@@ -229,25 +207,92 @@ class YiVLChatModel(PytorchChatModel):
  t = Thread(target=self._model.generate, kwargs=generate_kwargs)
  t.start()

+ if stream:
+ it = self._generate_stream(streamer, stop_str, input_ids, include_usage)
+ return self._to_chat_completion_chunks(it)
+ else:
+ c = self._generate(streamer, stop_str)
+ return self._to_chat_completion(c)
+
+ def _generate(self, streamer, stop_str) -> Completion:
  generated_text = ""
  for new_text in streamer:
  generated_text += new_text
  if generated_text.endswith(stop_str):
  generated_text = generated_text[: -len(stop_str)]
- r = self._parse_text(generated_text)
- return ChatCompletion(
- id="chat" + str(uuid.uuid1()),
- object="chat.completion",
+
+ c = Completion(
+ id=str(uuid.uuid1()),
+ object="text_completion",
  created=int(time.time()),
  model=self.model_uid,
  choices=[
- ChatCompletionChoice(
- index=0,
- message={"role": "assistant", "content": r},
- finish_reason="stop",
+ CompletionChoice(
+ index=0, text=generated_text, finish_reason="stop", logprobs=None
  )
  ],
  usage=CompletionUsage(
  prompt_tokens=-1, completion_tokens=-1, total_tokens=-1
  ),
  )
+ return c
+
+ def _generate_stream(
+ self, streamer, stop_str, input_ids, include_usage
+ ) -> Iterator[CompletionChunk]:
+ completion_id = str(uuid.uuid1())
+ prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
+ prompt_tokens = len(input_ids[0])
+ for i, new_text in enumerate(streamer):
+ if not new_text.endswith(stop_str):
+ completion_choice = CompletionChoice(
+ text=new_text, index=0, logprobs=None, finish_reason=None
+ )
+ chunk = CompletionChunk(
+ id=completion_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[completion_choice],
+ )
+ completion_tokens = i
+ total_tokens = prompt_tokens + completion_tokens
+ completion_usage = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ chunk["usage"] = completion_usage
+ yield chunk
+
+ completion_choice = CompletionChoice(
+ text="", index=0, logprobs=None, finish_reason="stop"
+ )
+ chunk = CompletionChunk(
+ id=completion_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[completion_choice],
+ )
+ completion_usage = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ chunk["usage"] = completion_usage
+ yield chunk
+ if include_usage:
+ chunk = CompletionChunk(
+ id=completion_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[],
+ )
+ chunk["usage"] = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ yield chunk
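
The Yi-VL path keeps the existing pattern of running `generate` in a background `Thread` and reading decoded text from transformers' `TextIteratorStreamer`; the new `_generate_stream` simply consumes that iterator chunk by chunk instead of joining everything into one string. A minimal sketch of the streamer-plus-thread pattern with a generic causal LM (the model name and generation arguments are placeholders, not Yi-VL's actual setup):

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "gpt2"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a worker thread while the streamer is drained here.
thread = Thread(
    target=model.generate,
    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=32, do_sample=False),
)
thread.start()
for piece in streamer:  # yields decoded text fragments as they are produced
    print(piece, end="", flush=True)
thread.join()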
xinference/model/llm/sglang/core.py

@@ -53,6 +53,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
  stop: Optional[Union[str, List[str]]]
  ignore_eos: bool
  stream: bool
+ stream_options: Optional[Union[dict, None]]


  try:
@@ -157,6 +158,8 @@ class SGLANGModel(LLM):
  )
  generate_config.setdefault("stop", [])
  generate_config.setdefault("stream", False)
+ stream_options = generate_config.get("stream_options")
+ generate_config.setdefault("stream_options", stream_options)
  generate_config.setdefault("ignore_eos", False)

  return generate_config
@@ -192,7 +195,7 @@ class SGLANGModel(LLM):

  @staticmethod
  def _convert_state_to_completion_chunk(
- request_id: str, model: str, output_text: str, meta_info: Dict
+ request_id: str, model: str, output_text: str
  ) -> CompletionChunk:
  choices: List[CompletionChoice] = [
  CompletionChoice(
@@ -209,13 +212,6 @@ class SGLANGModel(LLM):
  model=model,
  choices=choices,
  )
- prompt_tokens = meta_info["prompt_tokens"]
- completion_tokens = meta_info["completion_tokens"]
- chunk["usage"] = CompletionUsage(
- prompt_tokens=prompt_tokens,
- completion_tokens=completion_tokens,
- total_tokens=prompt_tokens + completion_tokens,
- )
  return chunk

  @staticmethod
@@ -272,6 +268,9 @@ class SGLANGModel(LLM):
  "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
  )
  stream = sanitized_generate_config.pop("stream")
+ stream_options = sanitized_generate_config.pop("stream_options")
+ if isinstance(stream_options, dict):
+ include_usage = stream_options.pop("include_usage", False)
  request_id = str(uuid.uuid1())
  state = pipeline.run(
  question=prompt,
@@ -289,11 +288,34 @@ class SGLANGModel(LLM):
  else:

  async def stream_results() -> AsyncGenerator[CompletionChunk, None]:
+ prompt_tokens, completion_tokens, total_tokens = 0, 0, 0
  async for out, meta_info in state.text_async_iter(
  var_name="answer", return_meta_data=True
  ):
  chunk = self._convert_state_to_completion_chunk(
- request_id, self.model_uid, output_text=out, meta_info=meta_info
+ request_id, self.model_uid, output_text=out
+ )
+ prompt_tokens = meta_info["prompt_tokens"]
+ completion_tokens = meta_info["completion_tokens"]
+ total_tokens = prompt_tokens + completion_tokens
+ chunk["usage"] = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
+ )
+ yield chunk
+ if include_usage:
+ chunk = CompletionChunk(
+ id=request_id,
+ object="text_completion",
+ created=int(time.time()),
+ model=self.model_uid,
+ choices=[],
+ )
+ chunk["usage"] = CompletionUsage(
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ total_tokens=total_tokens,
  )
  yield chunk

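The SGLang backend applies the same idea asynchronously: per-chunk usage now comes from `meta_info` inside `stream_results`, and a usage-only chunk is appended when `include_usage` was requested. A compact async sketch of that flow; the `(text, meta)` pairs stand in for `state.text_async_iter`, and plain dicts stand in for the library's CompletionChunk type:

import time
from typing import AsyncIterator, Dict, Tuple


async def stream_results(
    outputs: AsyncIterator[Tuple[str, Dict]],
    request_id: str,
    model_uid: str,
    include_usage: bool,
) -> AsyncIterator[Dict]:
    prompt_tokens = completion_tokens = 0
    async for text, meta in outputs:
        prompt_tokens = meta["prompt_tokens"]
        completion_tokens = meta["completion_tokens"]
        yield {
            "id": request_id,
            "object": "text_completion",
            "created": int(time.time()),
            "model": model_uid,
            "choices": [{"index": 0, "text": text, "finish_reason": None}],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }
    if include_usage:  # trailing usage-only chunk, as in the diff
        yield {
            "id": request_id,
            "object": "text_completion",
            "created": int(time.time()),
            "model": model_uid,
            "choices": [],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }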
xinference/model/llm/utils.py

@@ -228,16 +228,14 @@ Begin!"""
  tools_name_text = []
  for func_info in tools:
  parameters = []
- required_parameters = func_info["function"]["parameters"].get(
- "required", []
- )
- for name, p in func_info["function"]["parameters"][
- "properties"
- ].items():
- param = dict({"name": name}, **p)
- if name in required_parameters:
- param["required"] = True
- parameters.append(param)
+ fp = func_info["function"].get("parameters", {})
+ if fp:
+ required_parameters = fp.get("required", [])
+ for name, p in fp["properties"].items():
+ param = dict({"name": name}, **p)
+ if name in required_parameters:
+ param["required"] = True
+ parameters.append(param)

  name = func_info["function"]["name"]
  desc = func_info["function"]["description"]
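
This hunk makes the Qwen tool-calling prompt tolerate functions that declare no `parameters` block; previously, indexing `func_info["function"]["parameters"]` raised a KeyError for such tools. A standalone sketch of the resulting behaviour (`build_parameters` is an illustrative name, not a function in the codebase); the remaining utils.py hunks follow below.

from typing import Any, Dict, List


def build_parameters(function_spec: Dict[str, Any]) -> List[Dict[str, Any]]:
    # Mirrors the new logic: a missing or empty "parameters" block simply
    # yields no parameter entries instead of raising.
    parameters: List[Dict[str, Any]] = []
    fp = function_spec.get("parameters", {})
    if fp:
        required = fp.get("required", [])
        for name, p in fp["properties"].items():
            param = dict({"name": name}, **p)
            if name in required:
                param["required"] = True
            parameters.append(param)
    return parameters


# A tool with no parameters no longer breaks prompt construction.
assert build_parameters({"name": "ping", "description": "health check"}) == []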
@@ -447,6 +445,17 @@ Begin!"""
  else:
  ret += "<AI>" + content.strip()
  return ret
+ elif prompt_style.style_name == "PHI3":
+ ret = f"<|system|>{prompt_style.intra_message_sep}{prompt_style.system_prompt}{prompt_style.inter_message_sep}"
+ for message in chat_history:
+ content = message["content"] or ""
+ role = get_role(message["role"])
+ if content:
+ ret += f"<|{role}|>{prompt_style.intra_message_sep}{content}{prompt_style.inter_message_sep}"
+ else:
+ ret += f"<|{role}|>{prompt_style.intra_message_sep}"
+ ret += "<|assistant|>\n"
+ return ret
  else:
  raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")

@@ -473,9 +482,6 @@ Begin!"""
  for i, choice in enumerate(chunk["choices"])
  ],
  }
- usage = chunk.get("usage")
- if usage is not None:
- chat_chunk["usage"] = usage
  return cast(ChatCompletionChunk, chat_chunk)

  @classmethod
@@ -499,6 +505,19 @@ Begin!"""
  for i, choice in enumerate(chunk["choices"])
  ],
  }
+ return cast(ChatCompletionChunk, chat_chunk)
+
+ @classmethod
+ def _get_final_chat_completion_chunk(
+ cls, chunk: CompletionChunk
+ ) -> ChatCompletionChunk:
+ chat_chunk = {
+ "id": "chat" + chunk["id"],
+ "model": chunk["model"],
+ "created": chunk["created"],
+ "object": "chat.completion.chunk",
+ "choices": [],
+ }
  usage = chunk.get("usage")
  if usage is not None:
  chat_chunk["usage"] = usage
@@ -512,7 +531,12 @@ Begin!"""
  for i, chunk in enumerate(chunks):
  if i == 0:
  yield cls._get_first_chat_completion_chunk(chunk)
- yield cls._to_chat_completion_chunk(chunk)
+ # usage
+ choices = chunk.get("choices")
+ if not choices:
+ yield cls._get_final_chat_completion_chunk(chunk)
+ else:
+ yield cls._to_chat_completion_chunk(chunk)

  @classmethod
  async def _async_to_chat_completion_chunks(
@@ -523,7 +547,12 @@ Begin!"""
  async for chunk in chunks:
  if i == 0:
  yield cls._get_first_chat_completion_chunk(chunk)
- yield cls._to_chat_completion_chunk(chunk)
+ # usage
+ choices = chunk.get("choices")
+ if not choices:
+ yield cls._get_final_chat_completion_chunk(chunk)
+ else:
+ yield cls._to_chat_completion_chunk(chunk)
  i += 1

  @staticmethod
@@ -680,6 +709,15 @@ Begin!"""
  else:
  m = {"role": "assistant", "content": content, "tool_calls": []}
  finish_reason = "stop"
+ try:
+ usage = c.get("usage")
+ assert "prompt_tokens" in usage
+ except Exception:
+ usage = {
+ "prompt_tokens": -1,
+ "completion_tokens": -1,
+ "total_tokens": -1,
+ }
  return {
  "id": "chat" + f"cmpl-{_id}",
  "model": model_uid,
@@ -692,11 +730,7 @@ Begin!"""
  "finish_reason": finish_reason,
  }
  ],
- "usage": {
- "prompt_tokens": -1,
- "completion_tokens": -1,
- "total_tokens": -1,
- },
+ "usage": usage,
  }

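For reference, the new PHI3 branch renders prompts in the Phi-3 chat layout. A hedged sketch of the output for a one-turn history, assuming "\n" as intra_message_sep and "<|end|>\n" as inter_message_sep and using raw role names; the actual separators, system prompt, and role mapping come from the model family's prompt style definition:

# Illustrative values only; the real prompt style fields are defined per model family.
system_prompt = "You are a helpful assistant."
intra, inter = "\n", "<|end|>\n"
history = [{"role": "user", "content": "What is the capital of France?"}]

ret = f"<|system|>{intra}{system_prompt}{inter}"
for message in history:
    content = message["content"] or ""
    role = message["role"]
    if content:
        ret += f"<|{role}|>{intra}{content}{inter}"
    else:
        ret += f"<|{role}|>{intra}"
ret += "<|assistant|>\n"

print(ret)
# <|system|>
# You are a helpful assistant.<|end|>
# <|user|>
# What is the capital of France?<|end|>
# <|assistant|>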