vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
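The listing above includes the standard vLLM user-facing entrypoints (vllm/entrypoints/llm.py, vllm/sampling_params.py, vllm/outputs.py). For orientation, here is a minimal offline-generation sketch against that public API. It is not part of the package contents: the model path is a placeholder, and any NPU-specific environment setup this fork may require is assumed to already be in place.

    # Minimal sketch, assuming the wheel has been installed locally, e.g.:
    #   pip install vllm_npu-0.4.2-py3-none-any.whl
    # The model path below is a placeholder.
    from vllm import LLM, SamplingParams

    llm = LLM(model="/path/to/model")  # entrypoint defined in vllm/entrypoints/llm.py
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

    outputs = llm.generate(["Hello, my name is"], params)
    for out in outputs:
        # Each RequestOutput carries the prompt and its generated completions.
        print(out.prompt, "->", out.outputs[0].text)

The diff below shows the largest new entrypoint file, the OpenAI-compatible chat completion handler.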
vllm/entrypoints/openai/serving_chat.py
@@ -0,0 +1,392 @@
+import asyncio
+import codecs
+import time
+from typing import (AsyncGenerator, AsyncIterator, Awaitable, Iterable, List,
+                    Optional, Tuple, TypedDict, Union, final)
+
+from fastapi import Request
+from openai.types.chat import (ChatCompletionContentPartParam,
+                               ChatCompletionRole)
+
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest, ChatCompletionResponse,
+    ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
+    UsageInfo)
+from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
+                                                    OpenAIServing)
+from vllm.logger import init_logger
+from vllm.model_executor.guided_decoding import (
+    get_guided_decoding_logits_processor)
+from vllm.outputs import RequestOutput
+from vllm.utils import random_uuid
+
+logger = init_logger(__name__)
+
+
+@final  # So that it should be compatible with Dict[str, str]
+class ConversationMessage(TypedDict):
+    role: str
+    content: str
+
+
+class OpenAIServingChat(OpenAIServing):
+
+    def __init__(self,
+                 engine: AsyncLLMEngine,
+                 served_model_names: List[str],
+                 response_role: str,
+                 lora_modules: Optional[List[LoRAModulePath]] = None,
+                 chat_template: Optional[str] = None):
+        super().__init__(engine=engine,
+                         served_model_names=served_model_names,
+                         lora_modules=lora_modules,
+                         await_post_init=self._load_chat_template(
+                             chat_template=chat_template))
+
+        self.response_role = response_role
+
+    def _parse_chat_message_content(
+        self,
+        role: ChatCompletionRole,
+        content: Optional[Union[str,
+                                Iterable[ChatCompletionContentPartParam]]],
+    ) -> Tuple[List[ConversationMessage], List[Awaitable[object]]]:
+        if content is None:
+            return [], []
+        if isinstance(content, str):
+            return [ConversationMessage(role=role, content=content)], []
+
+        texts: List[str] = []
+        for _, part in enumerate(content):
+            if part["type"] == "text":
+                text = part["text"]
+
+                texts.append(text)
+            else:
+                raise NotImplementedError(f"Unknown part type: {part['type']}")
+
+        return [ConversationMessage(role=role, content="\n".join(texts))], []
+
+    async def create_chat_completion(
+        self, request: ChatCompletionRequest, raw_request: Request
+    ) -> Union[ErrorResponse, AsyncGenerator[str, None],
+               ChatCompletionResponse]:
+        """Completion API similar to OpenAI's API.
+
+        See https://platform.openai.com/docs/api-reference/chat/create
+        for the API specification. This API mimics the OpenAI
+        ChatCompletion API.
+
+        NOTE: Currently we do not support the following feature:
+            - function_call (Users should implement this by themselves)
+        """
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        try:
+            conversation: List[ConversationMessage] = []
+
+            for m in request.messages:
+                messages, _ = self._parse_chat_message_content(
+                    m["role"], m["content"])
+
+                conversation.extend(messages)
+
+            prompt = self.tokenizer.apply_chat_template(
+                conversation=conversation,
+                tokenize=False,
+                add_generation_prompt=request.add_generation_prompt,
+            )
+        except Exception as e:
+            logger.error("Error in applying chat template from request: %s", e)
+            return self.create_error_response(str(e))
+
+        request_id = f"cmpl-{random_uuid()}"
+        try:
+            # Tokenize/detokenize depending on prompt format (string/token list)
+            prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
+                request, prompt=prompt)
+            sampling_params = request.to_sampling_params()
+            lora_request = self._maybe_get_lora(request)
+            decoding_config = await self.engine.get_decoding_config()
+            guided_decoding_backend = request.guided_decoding_backend \
+                or decoding_config.guided_decoding_backend
+            guided_decode_logits_processor = (
+                await get_guided_decoding_logits_processor(
+                    guided_decoding_backend, request, await
+                    self.engine.get_tokenizer()))
+            if guided_decode_logits_processor:
+                if sampling_params.logits_processors is None:
+                    sampling_params.logits_processors = []
+                sampling_params.logits_processors.append(
+                    guided_decode_logits_processor)
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        result_generator = self.engine.generate(prompt_text, sampling_params,
+                                                request_id, prompt_ids,
+                                                lora_request)
+        # Streaming response
+        if request.stream:
+            return self.chat_completion_stream_generator(
+                request, result_generator, request_id, conversation)
+        else:
+            try:
+                return await self.chat_completion_full_generator(
+                    request, raw_request, result_generator, request_id,
+                    conversation)
+            except ValueError as e:
+                # TODO: Use a vllm-specific Validation Error
+                return self.create_error_response(str(e))
+
+    def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
+        if request.add_generation_prompt:
+            return self.response_role
+        else:
+            return request.messages[-1]["role"]
+
+    async def chat_completion_stream_generator(
+            self, request: ChatCompletionRequest,
+            result_generator: AsyncIterator[RequestOutput], request_id: str,
+            conversation: List[ConversationMessage]
+    ) -> AsyncGenerator[str, None]:
+        model_name = self.served_model_names[0]
+        created_time = int(time.time())
+        chunk_object_type = "chat.completion.chunk"
+        first_iteration = True
+
+        # Send response for each token for each request.n (index)
+        assert request.n is not None
+        previous_texts = [""] * request.n
+        previous_num_tokens = [0] * request.n
+        finish_reason_sent = [False] * request.n
+        try:
+            async for res in result_generator:
+                # We need to do it here, because if there are exceptions in
+                # the result_generator, it needs to be sent as the FIRST
+                # response (by the try...catch).
+                if first_iteration:
+                    # Send first response for each request.n (index) with
+                    # the role
+                    role = self.get_chat_request_role(request)
+                    for i in range(request.n):
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=i,
+                            delta=DeltaMessage(role=role),
+                            logprobs=None,
+                            finish_reason=None)
+                        chunk = ChatCompletionStreamResponse(
+                            id=request_id,
+                            object=chunk_object_type,
+                            created=created_time,
+                            choices=[choice_data],
+                            model=model_name)
+                        data = chunk.model_dump_json(exclude_unset=True)
+                        yield f"data: {data}\n\n"
+
+                    # Send response to echo the input portion of the
+                    # last message
+                    if request.echo:
+                        last_msg_content = ""
+                        if conversation and conversation[-1].get(
+                                "content") and conversation[-1].get(
+                                    "role") == role:
+                            last_msg_content = conversation[-1]["content"]
+
+                        if last_msg_content:
+                            for i in range(request.n):
+                                choice_data = (
+                                    ChatCompletionResponseStreamChoice(
+                                        index=i,
+                                        delta=DeltaMessage(
+                                            content=last_msg_content),
+                                        finish_reason=None))
+                                chunk = ChatCompletionStreamResponse(
+                                    id=request_id,
+                                    object=chunk_object_type,
+                                    created=created_time,
+                                    choices=[choice_data],
+                                    logprobs=None,
+                                    model=model_name)
+                                data = chunk.model_dump_json(
+                                    exclude_unset=True)
+                                yield f"data: {data}\n\n"
+                    first_iteration = False
+
+                for output in res.outputs:
+                    i = output.index
+
+                    if finish_reason_sent[i]:
+                        continue
+
+                    delta_token_ids = output.token_ids[previous_num_tokens[i]:]
+                    top_logprobs = output.logprobs[
+                        previous_num_tokens[i]:] if output.logprobs else None
+
+                    if request.logprobs:
+                        logprobs = self._create_logprobs(
+                            token_ids=delta_token_ids,
+                            top_logprobs=top_logprobs,
+                            num_output_top_logprobs=request.logprobs,
+                            initial_text_offset=len(previous_texts[i]),
+                        )
+                    else:
+                        logprobs = None
+
+                    delta_text = output.text[len(previous_texts[i]):]
+                    previous_texts[i] = output.text
+                    previous_num_tokens[i] = len(output.token_ids)
+                    if output.finish_reason is None:
+                        # Send token-by-token response for each request.n
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=i,
+                            delta=DeltaMessage(content=delta_text),
+                            logprobs=logprobs,
+                            finish_reason=None)
+                        chunk = ChatCompletionStreamResponse(
+                            id=request_id,
+                            object=chunk_object_type,
+                            created=created_time,
+                            choices=[choice_data],
+                            model=model_name)
+                        data = chunk.model_dump_json(exclude_unset=True)
+                        yield f"data: {data}\n\n"
+                    else:
+                        # Send the finish response for each request.n only once
+                        prompt_tokens = len(res.prompt_token_ids)
+                        final_usage = UsageInfo(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=previous_num_tokens[i],
+                            total_tokens=prompt_tokens +
+                            previous_num_tokens[i],
+                        )
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=i,
+                            delta=DeltaMessage(content=delta_text),
+                            logprobs=logprobs,
+                            finish_reason=output.finish_reason,
+                            stop_reason=output.stop_reason)
+                        chunk = ChatCompletionStreamResponse(
+                            id=request_id,
+                            object=chunk_object_type,
+                            created=created_time,
+                            choices=[choice_data],
+                            model=model_name)
+                        if final_usage is not None:
+                            chunk.usage = final_usage
+                        data = chunk.model_dump_json(exclude_unset=True,
+                                                     exclude_none=True)
+                        yield f"data: {data}\n\n"
+                        finish_reason_sent[i] = True
+        except ValueError as e:
+            # TODO: Use a vllm-specific Validation Error
+            data = self.create_streaming_error_response(str(e))
+            yield f"data: {data}\n\n"
+        # Send the final done message after all response.n are finished
+        yield "data: [DONE]\n\n"
+
+    async def chat_completion_full_generator(
+            self, request: ChatCompletionRequest, raw_request: Request,
+            result_generator: AsyncIterator[RequestOutput], request_id: str,
+            conversation: List[ConversationMessage]
+    ) -> Union[ErrorResponse, ChatCompletionResponse]:
+
+        model_name = self.served_model_names[0]
+        created_time = int(time.time())
+        final_res: Optional[RequestOutput] = None
+
+        async for res in result_generator:
+            if await raw_request.is_disconnected():
+                # Abort the request if the client disconnects.
+                await self.engine.abort(request_id)
+                return self.create_error_response("Client disconnected")
+            final_res = res
+        assert final_res is not None
+
+        choices = []
+
+        role = self.get_chat_request_role(request)
+        for output in final_res.outputs:
+            token_ids = output.token_ids
+            top_logprobs = output.logprobs
+
+            if request.logprobs:
+                logprobs = self._create_logprobs(
+                    token_ids=token_ids,
+                    top_logprobs=top_logprobs,
+                    num_output_top_logprobs=request.logprobs,
+                )
+            else:
+                logprobs = None
+
+            choice_data = ChatCompletionResponseChoice(
+                index=output.index,
+                message=ChatMessage(role=role, content=output.text),
+                logprobs=logprobs,
+                finish_reason=output.finish_reason,
+                stop_reason=output.stop_reason,
+            )
+            choices.append(choice_data)
+
+        if request.echo:
+            last_msg_content = ""
+            if conversation and conversation[-1].get(
+                    "content") and conversation[-1].get("role") == role:
+                last_msg_content = conversation[-1]["content"]
+
+            for choice in choices:
+                full_message = last_msg_content + choice.message.content
+                choice.message.content = full_message
+
+        num_prompt_tokens = len(final_res.prompt_token_ids)
+        num_generated_tokens = sum(
+            len(output.token_ids) for output in final_res.outputs)
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+        )
+        response = ChatCompletionResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            choices=choices,
+            usage=usage,
+        )
+
+        return response
+
+    async def _load_chat_template(self, chat_template: Optional[str]):
+        while self.tokenizer is None:
+            # Give the parent class time to load the tokenizer
+            await asyncio.sleep(0.1)
+        tokenizer = self.tokenizer
+
+        if chat_template is not None:
+            try:
+                with open(chat_template, "r") as f:
+                    tokenizer.chat_template = f.read()
+            except OSError as e:
+                JINJA_CHARS = "{}\n"
+                if not any(c in chat_template for c in JINJA_CHARS):
+                    msg = (f"The supplied chat template ({chat_template}) "
+                           f"looks like a file path, but it failed to be "
+                           f"opened. Reason: {e}")
+                    raise ValueError(msg) from e
+
+                # If opening a file fails, set chat template to be args to
+                # ensure we decode so our escape are interpreted correctly
+                tokenizer.chat_template = codecs.decode(
+                    chat_template, "unicode_escape")
+
+            logger.info("Using supplied chat template:\n%s",
+                        tokenizer.chat_template)
+        elif tokenizer.chat_template is not None:
+            logger.info("Using default chat template:\n%s",
+                        tokenizer.chat_template)
+        else:
+            logger.warning(
+                "No chat template provided. Chat API will not work.")