vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
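
For orientation before the diff below, the wheel's public entry point is the `LLM` wrapper in vllm/entrypoints/llm.py (item 55), which drives the `LLMEngine` shown in the diff. The following is a minimal, hypothetical usage sketch only; it assumes this build keeps upstream vLLM's top-level re-exports (`LLM`, `SamplingParams`), and `facebook/opt-125m` is purely a placeholder model name.

from vllm import LLM, SamplingParams  # assumes upstream-style re-exports in vllm/__init__.py

# Placeholder model name; any model supported by this build works.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(["What is LLM?"],
                       SamplingParams(temperature=0.0, max_tokens=64))
for output in outputs:
    print(output.request_id, output.outputs[0].text)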
vllm/engine/llm_engine.py
@@ -0,0 +1,784 @@
+import time
+from typing import Iterable, List, Optional, Type, Union
+
+from transformers import GenerationConfig, PreTrainedTokenizer
+
+import vllm
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
+                         LoRAConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, SpeculativeConfig,
+                         VisionLanguageConfig)
+from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
+                                 SchedulerOutputs)
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.metrics import StatLogger, Stats
+from vllm.engine.output_processor.interfaces import (
+    SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.engine.output_processor.util import create_output_by_sequence_group
+from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.ray_utils import initialize_ray_cluster
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (ExecuteModelRequest, MultiModalData, SamplerOutput,
+                           Sequence, SequenceGroup, SequenceGroupMetadata,
+                           SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
+                                                     get_tokenizer_group)
+from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
+                                  usage_message)
+from vllm.utils import Counter
+
+logger = init_logger(__name__)
+_LOCAL_LOGGING_INTERVAL_SEC = 5
+
+
+def _load_generation_config_dict(model_config: ModelConfig):
+    try:
+        return GenerationConfig.from_pretrained(
+            model_config.model,
+            revision=model_config.revision,
+        ).to_diff_dict()
+    except OSError:
+        # Not found.
+        return {}
+
+
+class LLMEngine:
+    """An LLM engine that receives requests and generates texts.
+
+    This is the main class for the vLLM engine. It receives requests
+    from clients and generates texts from the LLM. It includes a tokenizer, a
+    language model (possibly distributed across multiple GPUs), and GPU memory
+    space allocated for intermediate states (aka KV cache). This class utilizes
+    iteration-level scheduling and efficient memory management to maximize the
+    serving throughput.
+
+    The `LLM` class wraps this class for offline batched inference and the
+    `AsyncLLMEngine` class wraps this class for online serving.
+
+    NOTE: The config arguments are derived from the `EngineArgs` class. For the
+    comprehensive list of arguments, see `EngineArgs`.
+
+    Args:
+        model_config: The configuration related to the LLM model.
+        cache_config: The configuration related to the KV cache memory
+            management.
+        parallel_config: The configuration related to distributed execution.
+        scheduler_config: The configuration related to the request scheduler.
+        device_config: The configuration related to the device.
+        lora_config (Optional): The configuration related to serving multi-LoRA.
+        vision_language_config (Optional): The configuration related to vision
+            language models.
+        speculative_config (Optional): The configuration related to speculative
+            decoding.
+        executor_class: The model executor class for managing distributed
+            execution.
+        log_stats: Whether to log statistics.
+        usage_context: Specified entry point, used for usage info collection
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        vision_language_config: Optional[VisionLanguageConfig],
+        speculative_config: Optional[SpeculativeConfig],
+        decoding_config: Optional[DecodingConfig],
+        executor_class: Type[ExecutorBase],
+        log_stats: bool,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+    ) -> None:
+        logger.info(
+            "Initializing an LLM engine (v%s) with config: "
+            "model=%r, speculative_config=%r, tokenizer=%r, "
+            "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
+            "tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, "
+            "max_seq_len=%d, download_dir=%r, load_format=%s, "
+            "tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
+            "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
+            "quantization_param_path=%s, device_config=%s, "
+            "decoding_config=%r, seed=%d, served_model_name=%s)",
+            vllm.__version__,
+            model_config.model,
+            speculative_config,
+            model_config.tokenizer,
+            model_config.skip_tokenizer_init,
+            model_config.tokenizer_mode,
+            model_config.revision,
+            model_config.tokenizer_revision,
+            model_config.trust_remote_code,
+            model_config.dtype,
+            model_config.max_model_len,
+            load_config.download_dir,
+            load_config.load_format,
+            parallel_config.tensor_parallel_size,
+            parallel_config.disable_custom_all_reduce,
+            model_config.quantization,
+            model_config.enforce_eager,
+            cache_config.cache_dtype,
+            model_config.quantization_param_path,
+            device_config.device,
+            decoding_config,
+            model_config.seed,
+            model_config.served_model_name,
+        )
+        # TODO(woosuk): Print more configs in debug mode.
+
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.lora_config = lora_config
+        self.vision_language_config = vision_language_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.speculative_config = speculative_config
+        self.load_config = load_config
+        self.decoding_config = decoding_config or DecodingConfig()
+        self.log_stats = log_stats
+
+        if not self.model_config.skip_tokenizer_init:
+            self.tokenizer: BaseTokenizerGroup
+            self._init_tokenizer()
+            self.detokenizer = Detokenizer(self.tokenizer)
+        else:
+            self.detokenizer = None
+            self.tokenizer = None
+
+        self.seq_counter = Counter()
+        self.generation_config_fields = _load_generation_config_dict(
+            model_config)
+
+        self.model_executor = executor_class(
+            model_config=model_config,
+            cache_config=cache_config,
+            parallel_config=parallel_config,
+            scheduler_config=scheduler_config,
+            device_config=device_config,
+            lora_config=lora_config,
+            vision_language_config=vision_language_config,
+            speculative_config=speculative_config,
+            load_config=load_config,
+        )
+
+        self._initialize_kv_caches()
+
+        # If usage stat is enabled, collect relevant info.
+        if is_usage_stats_enabled():
+            from vllm.model_executor.model_loader import (
+                get_architecture_class_name)
+            usage_message.report_usage(
+                get_architecture_class_name(model_config),
+                usage_context,
+                extra_kvs={
+                    # Common configuration
+                    "dtype":
+                    str(model_config.dtype),
+                    "tensor_parallel_size":
+                    parallel_config.tensor_parallel_size,
+                    "block_size":
+                    cache_config.block_size,
+                    "gpu_memory_utilization":
+                    cache_config.gpu_memory_utilization,
+
+                    # Quantization
+                    "quantization":
+                    model_config.quantization,
+                    "kv_cache_dtype":
+                    cache_config.cache_dtype,
+
+                    # Feature flags
+                    "enable_lora":
+                    bool(lora_config),
+                    "enable_prefix_caching":
+                    cache_config.enable_prefix_caching,
+                    "enforce_eager":
+                    model_config.enforce_eager,
+                    "disable_custom_all_reduce":
+                    parallel_config.disable_custom_all_reduce,
+                })
+
+        if self.tokenizer:
+            # Ping the tokenizer to ensure liveness if it runs in a
+            # different process.
+            self.tokenizer.ping()
+
+        # Create the scheduler.
+        # NOTE: the cache_config here have been updated with the numbers of
+        # GPU and CPU blocks, which are profiled in the distributed executor.
+        self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
+
+        # Metric Logging.
+        if self.log_stats:
+            self.stat_logger = StatLogger(
+                local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
+                labels=dict(model_name=model_config.served_model_name),
+                max_model_len=self.model_config.max_model_len)
+            self.stat_logger.info("cache_config", self.cache_config)
+
+        # Create sequence output processor, e.g. for beam search or
+        # speculative decoding.
+        self.output_processor = (
+            SequenceGroupOutputProcessor.create_output_processor(
+                self.scheduler_config,
+                self.detokenizer,
+                self.scheduler,
+                self.seq_counter,
+                self.get_tokenizer_for_seq,
+                stop_checker=StopChecker(
+                    self.scheduler_config.max_model_len,
+                    self.get_tokenizer_for_seq,
+                ),
+            ))
+
+    def _initialize_kv_caches(self) -> None:
+        """Initialize the KV cache in the worker(s).
+
+        The workers will determine the number of blocks in both the GPU cache
+        and the swap CPU cache.
+        """
+        num_gpu_blocks, num_cpu_blocks = (
+            self.model_executor.determine_num_available_blocks())
+
+        if self.cache_config.num_gpu_blocks_override is not None:
+            num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
+            logger.info(
+                "Overriding num_gpu_blocks=%d with "
+                "num_gpu_blocks_override=%d", num_gpu_blocks,
+                num_gpu_blocks_override)
+            num_gpu_blocks = num_gpu_blocks_override
+
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+
+        self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+
+    @classmethod
+    def from_engine_args(
+        cls,
+        engine_args: EngineArgs,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+    ) -> "LLMEngine":
+        """Creates an LLM engine from the engine arguments."""
+        # Create the engine configs.
+        engine_config = engine_args.create_engine_config()
+
+        # Initialize the cluster and specify the executor class.
+        if engine_config.device_config.device_type == "neuron":
+            from vllm.executor.neuron_executor import NeuronExecutor
+            executor_class = NeuronExecutor
+        elif engine_config.device_config.device_type == "cpu":
+            from vllm.executor.cpu_executor import CPUExecutor
+            executor_class = CPUExecutor
+        elif engine_config.parallel_config.worker_use_ray:
+            initialize_ray_cluster(engine_config.parallel_config)
+            from vllm.executor.ray_gpu_executor import RayGPUExecutor
+            executor_class = RayGPUExecutor
+        else:
+            assert engine_config.parallel_config.world_size == 1, (
+                "Ray is required if parallel_config.world_size > 1.")
+            from vllm.executor.gpu_executor import GPUExecutor
+            executor_class = GPUExecutor
+
+        # Create the LLM engine.
+        engine = cls(
+            **engine_config.to_dict(),
+            executor_class=executor_class,
+            log_stats=not engine_args.disable_log_stats,
+            usage_context=usage_context,
+        )
+        return engine
+
+    def __reduce__(self):
+        # This is to ensure that the LLMEngine is not referenced in
+        # the closure used to initialize Ray worker actors
+        raise RuntimeError("LLMEngine should not be pickled!")
+
+    def __del__(self):
+        # Shutdown model executor when engine is garbage collected
+        # Use getattr since __init__ can fail before the field is set
+        if model_executor := getattr(self, "model_executor", None):
+            model_executor.shutdown()
+
+    def get_tokenizer(self) -> "PreTrainedTokenizer":
+        return self.tokenizer.get_lora_tokenizer(None)
+
+    def get_tokenizer_for_seq(self,
+                              sequence: Sequence) -> "PreTrainedTokenizer":
+        return self.tokenizer.get_lora_tokenizer(sequence.lora_request)
+
+    def _init_tokenizer(self, **tokenizer_init_kwargs):
+        init_kwargs = dict(
+            tokenizer_id=self.model_config.tokenizer,
+            enable_lora=bool(self.lora_config),
+            max_num_seqs=self.scheduler_config.max_num_seqs,
+            max_input_length=None,
+            tokenizer_mode=self.model_config.tokenizer_mode,
+            trust_remote_code=self.model_config.trust_remote_code,
+            revision=self.model_config.tokenizer_revision)
+        init_kwargs.update(tokenizer_init_kwargs)
+        self.tokenizer = get_tokenizer_group(
+            self.parallel_config.tokenizer_pool_config, **init_kwargs)
+
+    def _verify_args(self) -> None:
+        self.model_config.verify_with_parallel_config(self.parallel_config)
+        self.cache_config.verify_with_parallel_config(self.parallel_config)
+        if self.lora_config:
+            self.lora_config.verify_with_model_config(self.model_config)
+            self.lora_config.verify_with_scheduler_config(
+                self.scheduler_config)
+
+    def encode_request(
+        self,
+        request_id: str,  # pylint: disable=unused-argument
+        prompt: Optional[str],
+        prompt_token_ids: Optional[List[int]] = None,
+        lora_request: Optional[LoRARequest] = None,
+    ):
+        if prompt_token_ids is None:
+            assert prompt is not None
+            prompt_token_ids = self.tokenizer.encode(request_id=request_id,
+                                                     prompt=prompt,
+                                                     lora_request=lora_request)
+        return prompt_token_ids
+
+    def add_request(
+        self,
+        request_id: str,
+        prompt: Optional[str],
+        sampling_params: SamplingParams,
+        prompt_token_ids: Optional[List[int]] = None,
+        arrival_time: Optional[float] = None,
+        lora_request: Optional[LoRARequest] = None,
+        multi_modal_data: Optional[MultiModalData] = None,
+    ) -> None:
+        """Add a request to the engine's request pool.
+
+        The request is added to the request pool and will be processed by the
+        scheduler as `engine.step()` is called. The exact scheduling policy is
+        determined by the scheduler.
+
+        Args:
+            request_id: The unique ID of the request.
+            prompt: The prompt string. Can be None if prompt_token_ids is
+                provided.
+            sampling_params: The sampling parameters for text generation.
+            prompt_token_ids: The token IDs of the prompt. If None, we
+                use the tokenizer to convert the prompts to token IDs.
+            arrival_time: The arrival time of the request. If None, we use
+                the current monotonic time.
+            multi_modal_data: Multi modal data per request.
+
+        Details:
+            - Set arrival_time to the current time if it is None.
+            - Set prompt_token_ids to the encoded prompt if it is None.
+            - Create `best_of` number of :class:`~vllm.Sequence` objects.
+            - Create a :class:`~vllm.SequenceGroup` object
+              from the list of :class:`~vllm.Sequence`.
+            - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
+
+        Example:
+            >>> # initialize engine
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> # set request arguments
+            >>> example_prompt = "Who is the president of the United States?"
+            >>> sampling_params = SamplingParams(temperature=0.0)
+            >>> request_id = 0
+            >>>
+            >>> # add the request to the engine
+            >>> engine.add_request(
+            >>>    str(request_id),
+            >>>    example_prompt,
+            >>>    SamplingParams(temperature=0.0))
+            >>> # continue the request processing
+            >>> ...
+        """
+        if lora_request is not None and not self.lora_config:
+            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
+                             "not enabled!")
+        max_logprobs = self.get_model_config().max_logprobs
+        if (sampling_params.logprobs
+                and sampling_params.logprobs > max_logprobs) or (
+                    sampling_params.prompt_logprobs
+                    and sampling_params.prompt_logprobs > max_logprobs):
+            raise ValueError(f"Cannot request more than "
+                             f"{max_logprobs} logprobs.")
+        if arrival_time is None:
+            arrival_time = time.time()
+        prompt_token_ids = self.encode_request(
+            request_id=request_id,
+            prompt=prompt,
+            prompt_token_ids=prompt_token_ids,
+            lora_request=lora_request)
+
+        # Create the sequences.
+        block_size = self.cache_config.block_size
+        seq_id = next(self.seq_counter)
+        eos_token_id = None
+        if self.tokenizer:
+            eos_token_id = self.tokenizer.get_lora_tokenizer(
+                lora_request).eos_token_id
+        else:
+            logger.warning("Use None for EOS token id because tokenizer is "
+                           "not initialized")
+        seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
+                       eos_token_id, lora_request)
+
+        # Defensive copy of SamplingParams, which are used by the sampler,
+        # this doesn't deep-copy LogitsProcessor objects
+        sampling_params = sampling_params.clone()
+        # Add the eos token id into the sampling_params to support min_tokens
+        # processing
+        if seq.eos_token_id is not None:
+            sampling_params.all_stop_token_ids.add(seq.eos_token_id)
+        sampling_params.update_from_generation_config(
+            self.generation_config_fields)
+
+        # Create the sequence group.
+        seq_group = SequenceGroup(request_id, [seq], sampling_params,
+                                  arrival_time, lora_request, multi_modal_data)
+
+        # Add the sequence group to the scheduler.
+        self.scheduler.add_seq_group(seq_group)
+
+    def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
+        """Aborts a request(s) with the given ID.
+
+        Args:
+            request_id: The ID(s) of the request to abort.
+
+        Details:
+            - Refer to the
+              :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
+              from class :class:`~vllm.core.scheduler.Scheduler`.
+
+        Example:
+            >>> # initialize engine and add a request with request_id
+            >>> request_id = str(0)
+            >>> # abort the request
+            >>> engine.abort_request(request_id)
+        """
+        self.scheduler.abort_seq_group(request_id)
+
+    def get_model_config(self) -> ModelConfig:
+        """Gets the model configuration."""
+        return self.model_config
+
+    def get_decoding_config(self) -> DecodingConfig:
+        """Gets the decoding configuration."""
+        return self.decoding_config
+
+    def get_num_unfinished_requests(self) -> int:
+        """Gets the number of unfinished requests."""
+        return self.scheduler.get_num_unfinished_seq_groups()
+
+    def has_unfinished_requests(self) -> bool:
+        """Returns True if there are unfinished requests."""
+        return self.scheduler.has_unfinished_seqs()
+
+    def _process_model_outputs(
+        self,
+        output: List[SamplerOutput],
+        scheduled_seq_groups: List[ScheduledSequenceGroup],
+        ignored_seq_groups: List[SequenceGroup],
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> List[RequestOutput]:
+        """Apply the model output to the sequences in the scheduled seq groups.
+
+        Returns RequestOutputs that can be returned to the client.
+        """
+
+        now = time.time()
+
+        # Organize outputs by [sequence group][step] instead of
+        # [step][sequence group].
+        output_by_sequence_group = create_output_by_sequence_group(
+            sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups))
+
+        # Update the scheduled sequence groups with the model outputs.
+        for scheduled_seq_group, outputs, seq_group_meta in zip(
+                scheduled_seq_groups, output_by_sequence_group,
+                seq_group_metadata_list):
+            seq_group = scheduled_seq_group.seq_group
+            seq_group.update_num_computed_tokens(
+                scheduled_seq_group.token_chunk_size)
+
+            self.output_processor.process_prompt_logprob(seq_group, outputs)
+            if seq_group_meta.do_sample:
+                self.output_processor.process_outputs(seq_group, outputs)
+
+        # Free the finished sequence groups.
+        self.scheduler.free_finished_seq_groups()
+
+        # Create the outputs.
+        request_outputs: List[RequestOutput] = []
+        for scheduled_seq_group in scheduled_seq_groups:
+            seq_group = scheduled_seq_group.seq_group
+            seq_group.maybe_set_first_token_time(now)
+            request_output = RequestOutput.from_seq_group(seq_group)
+            request_outputs.append(request_output)
+        for seq_group in ignored_seq_groups:
+            request_output = RequestOutput.from_seq_group(seq_group)
+            request_outputs.append(request_output)
+        return request_outputs
+
+    def step(self) -> List[RequestOutput]:
+        """Performs one decoding iteration and returns newly generated results.
+
+        .. figure:: https://i.imgur.com/sv2HssD.png
+            :alt: Overview of the step function
+            :align: center
+
+            Overview of the step function.
+
+        Details:
+            - Step 1: Schedules the sequences to be executed in the next
+              iteration and the token blocks to be swapped in/out/copy.
+
+                - Depending on the scheduling policy,
+                  sequences may be `preempted/reordered`.
+                - A Sequence Group (SG) refer to a group of sequences
+                  that are generated from the same prompt.
+
+            - Step 2: Calls the distributed executor to execute the model.
+            - Step 3: Processes the model output. This mainly includes:
+
+                - Decodes the relevant outputs.
+                - Updates the scheduled sequence groups with model outputs
+                  based on its `sampling parameters` (`use_beam_search` or not).
+                - Frees the finished sequence groups.
+
+            - Finally, it creates and returns the newly generated results.
+
+        Example:
+            >>> # Please see the example/ folder for more detailed examples.
+            >>>
+            >>> # initialize engine and request arguments
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> example_inputs = [(0, "What is LLM?",
+            >>>    SamplingParams(temperature=0.0))]
+            >>>
+            >>> # Start the engine with an event loop
+            >>> while True:
+            >>>     if example_inputs:
+            >>>         req_id, prompt, sampling_params = example_inputs.pop(0)
+            >>>         engine.add_request(str(req_id), prompt, sampling_params)
+            >>>
+            >>>     # continue the request processing
+            >>>     request_outputs = engine.step()
+            >>>     for request_output in request_outputs:
+            >>>         if request_output.finished:
+            >>>             # return or show the request output
+            >>>
+            >>>     if not (engine.has_unfinished_requests() or example_inputs):
+            >>>         break
+        """
+        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
+
+        if not scheduler_outputs.is_empty():
+            execute_model_req = ExecuteModelRequest(
+                seq_group_metadata_list=seq_group_metadata_list,
+                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
+                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
+                blocks_to_copy=scheduler_outputs.blocks_to_copy,
+                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
+                running_queue_size=scheduler_outputs.running_queue_size,
+            )
+            output = self.model_executor.execute_model(
+                execute_model_req=execute_model_req)
+        else:
+            output = []
+
+        request_outputs = self._process_model_outputs(
+            output, scheduler_outputs.scheduled_seq_groups,
+            scheduler_outputs.ignored_seq_groups, seq_group_metadata_list)
+
+        # Log stats.
+        self.do_log_stats(scheduler_outputs, output)
+
+        return request_outputs
+
+    def do_log_stats(
+            self,
+            scheduler_outputs: Optional[SchedulerOutputs] = None,
+            model_output: Optional[List[SamplerOutput]] = None) -> None:
+        """Forced log when no requests active."""
+        if self.log_stats:
+            self.stat_logger.log(
+                self._get_stats(scheduler_outputs, model_output))
+
+    def _get_stats(
+            self,
+            scheduler_outputs: Optional[SchedulerOutputs],
+            model_output: Optional[List[SamplerOutput]] = None) -> Stats:
+        """Get Stats to be Logged to Prometheus.
+
+        Args:
+            scheduler_outputs: Optional, used to populate metrics related to
+                the scheduled batch,
+            model_output: Optional, used to emit speculative decoding metrics
+                which are created by the workers.
+        """
+        now = time.time()
+
+        # System State
+        # Scheduler State
+        num_running_sys = len(self.scheduler.running)
+        num_swapped_sys = len(self.scheduler.swapped)
+        num_waiting_sys = len(self.scheduler.waiting)
+
+        # KV Cache Usage in %
+        num_total_gpu = self.cache_config.num_gpu_blocks
+        num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
+        gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu)
+
+        num_total_cpu = self.cache_config.num_cpu_blocks
+        cpu_cache_usage_sys = 0.
+        if num_total_cpu > 0:
+            num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks(
+            )
+            cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu)
+
+        # Iteration stats
+        num_prompt_tokens_iter = 0
+        num_generation_tokens_iter = 0
+        time_to_first_tokens_iter: List[float] = []
+        time_per_output_tokens_iter: List[float] = []
+
+        # Request stats
+        # Latency
+        time_e2e_requests: List[float] = []
+        # Metadata
+        num_prompt_tokens_requests: List[int] = []
+        num_generation_tokens_requests: List[int] = []
+        best_of_requests: List[int] = []
+        n_requests: List[int] = []
+        finished_reason_requests: List[str] = []
+
+        # NOTE: This loop assumes prefill seq_groups are before
+        # decode seq_groups in scheduled_seq_groups.
+        if scheduler_outputs is not None:
+            num_generation_tokens_from_prefill_groups = 0.
+            # NOTE: if scheduler_outputs.num_prefill_groups > 0 and
+            # the len of scheduler_outputs.scheduled_seq_groups is !=
+            # scheduler_outputs.num_prefill_groups, this means that
+            # chunked prefills have been detected.
+
+            for idx, scheduled_seq_group in enumerate(
+                    scheduler_outputs.scheduled_seq_groups):
+                group_was_prefill = idx < scheduler_outputs.num_prefill_groups
+                seq_group = scheduled_seq_group.seq_group
+
+                # NOTE: a seq_group that completed all of its prefill tokens
+                # in the last iteration will have seq_group.is_prefill() = False
+                # with group_was_prefill = True
+                if group_was_prefill:
+                    # Number of prompt tokens.
+                    num_prompt_tokens_iter += (
+                        scheduled_seq_group.token_chunk_size)
+
+                    # If the seq_group just finished the prefill state
+                    # get TTFT.
+                    if not seq_group.is_prefill():
+                        latency = seq_group.get_last_latency(now)
+                        time_to_first_tokens_iter.append(latency)
+
+                        # One generation token per finished prefill.
+                        num_generation_tokens_from_prefill_groups += (
+                            seq_group.num_seqs())
+                else:
+                    # TPOTs.
+                    latency = seq_group.get_last_latency(now)
+                    time_per_output_tokens_iter.append(latency)
+
+                # Because of chunked prefill, we can have a single sequence
+                # group that does multiple prompt_runs. To prevent logging
+                # the same metadata more than once per request, we standardize
+                # on logging request level information for finished requests,
+                # which can only happen once.
+                if seq_group.is_finished():
+                    # Latency timings
+                    time_e2e_requests.append(now -
+                                             seq_group.metrics.arrival_time)
+
+                    # Metadata
+                    num_prompt_tokens_requests.append(
+                        len(seq_group.prompt_token_ids))
+                    num_generation_tokens_requests.extend([
+                        seq.get_output_len()
+                        for seq in seq_group.get_finished_seqs()
+                    ])
+                    best_of_requests.append(seq_group.sampling_params.best_of)
+                    n_requests.append(seq_group.sampling_params.n)
+                    finished_reason_requests.extend([
+                        SequenceStatus.get_finished_reason(seq.status)
+                        for seq in seq_group.get_finished_seqs()
+                    ])
+
+            # Number of generation tokens.
+            # num_batched_tokens equals the number of prompt_tokens plus the
+            # number of decode_tokens in a single iteration. So,
+            # num_generation_tokens = num_batched_tokens - num_prompt_tokens
+            # + num_generation_tokens_from_prefill_groups (since we generate
+            # one token on prefills on iters where the prefill finishes).
+            num_generation_tokens_iter = (
+                scheduler_outputs.num_batched_tokens - num_prompt_tokens_iter +
+                num_generation_tokens_from_prefill_groups)
+
+        # Spec decode, if enabled, emits specialized metrics from the worker in
+        # sampler output.
+        if model_output and (model_output[0].spec_decode_worker_metrics
+                             is not None):
+            spec_decode_metrics = model_output[0].spec_decode_worker_metrics
+        else:
+            spec_decode_metrics = None
+
+        return Stats(
+            now=now,
+
+            # System stats
+            # Scheduler State
+            num_running_sys=num_running_sys,
+            num_swapped_sys=num_swapped_sys,
+            num_waiting_sys=num_waiting_sys,
+            # KV Cache Usage in %
+            gpu_cache_usage_sys=gpu_cache_usage_sys,
+            cpu_cache_usage_sys=cpu_cache_usage_sys,

+            # Iteration stats
+            num_prompt_tokens_iter=num_prompt_tokens_iter,
+            num_generation_tokens_iter=num_generation_tokens_iter,
+            time_to_first_tokens_iter=time_to_first_tokens_iter,
+            time_per_output_tokens_iter=time_per_output_tokens_iter,
+            spec_decode_metrics=spec_decode_metrics,
+
+            # Request stats
+            # Latency
+            time_e2e_requests=time_e2e_requests,
+            # Metadata
+            num_prompt_tokens_requests=num_prompt_tokens_requests,
+            num_generation_tokens_requests=num_generation_tokens_requests,
+            best_of_requests=best_of_requests,
+            n_requests=n_requests,
+            finished_reason_requests=finished_reason_requests,
+        )
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.model_executor.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.model_executor.remove_lora(lora_id)
+
+    def list_loras(self) -> List[int]:
+        return self.model_executor.list_loras()
+
+    def check_health(self) -> None:
+        self.model_executor.check_health()
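
The docstrings above already sketch the driver loop; pulling them together, here is a minimal end-to-end sketch of using `LLMEngine` directly (not part of the wheel). The `facebook/opt-125m` model name is again only a placeholder assumption.

from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams

# Build the engine from EngineArgs, as from_engine_args() above does internally.
engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

# Queue requests, then drive the iteration-level scheduler with step().
prompts = ["What is LLM?", "Who is the president of the United States?"]
for i, prompt in enumerate(prompts):
    engine.add_request(str(i), prompt, SamplingParams(temperature=0.0))

while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.request_id, request_output.outputs[0].text)

For most offline workloads the `LLM` wrapper shown earlier is the more convenient entry point; driving `step()` by hand is mainly useful when embedding the engine in a custom serving loop, which is what `AsyncLLMEngine` does for online serving.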