vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -0,0 +1,130 @@
+ import asyncio
+ import concurrent.futures
+ from copy import copy
+ from enum import Enum
+ from functools import lru_cache
+ from json import dumps as json_dumps
+ from re import escape as regex_escape
+ from typing import Tuple, Union
+
+ from pydantic import BaseModel
+ from transformers import PreTrainedTokenizerBase
+
+ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                               CompletionRequest)
+ from vllm.model_executor.guided_decoding.outlines_logits_processors import (
+     CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor)
+
+
+ class GuidedDecodingMode(Enum):
+     JSON = "json"
+     REGEX = "regex"
+     CHOICE = "choice"
+     GRAMMAR = "grammar"
+
+
+ # https://github.com/outlines-dev/outlines/blob/main/outlines/grammars/json.lark
+ # the main difference is that we changed the start: value to
+ # start: object | array, so we are denying scalar values as the root of the
+ # JSON. Starting with scalars as the root seems to cause llama to generate
+ # without stop.
+ JSON_GRAMMAR = r"""
+ ?start: object | array
+
+ ?value: object
+ | array
+ | UNESCAPED_STRING
+ | SIGNED_NUMBER -> number
+ | "true" -> true
+ | "false" -> false
+ | "null" -> null
+
+ array : "[" [value ("," value)*] "]"
+ object : "{" [pair ("," pair)*] "}"
+ pair : UNESCAPED_STRING ":" value
+
+ %import common.UNESCAPED_STRING
+ %import common.SIGNED_NUMBER
+ %import common.WS
+
+ %ignore WS
+ """
+
+ global_thread_pool = None  # used for generating logits processor fsm
+
+
+ async def get_outlines_guided_decoding_logits_processor(
+         request: Union[CompletionRequest, ChatCompletionRequest],
+         tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, None]:
+     """
+     Given an OpenAI-compatible request, check for guided decoding parameters
+     and get the necessary logits processor for the given guide.
+     We cache logit processors by (guide, tokenizer), and on cache hit
+     we make a shallow copy to reuse the same underlying FSM.
+     """
+     global global_thread_pool
+     guide, mode = _get_guide_and_mode(request)
+     if not guide:
+         return None
+
+     if global_thread_pool is None:
+         global_thread_pool = concurrent.futures.ThreadPoolExecutor(
+             max_workers=2)
+     loop = asyncio.get_running_loop()
+
+     result = await loop.run_in_executor(global_thread_pool,
+                                         _get_cached_logits_processor, guide,
+                                         tokenizer, mode,
+                                         request.guided_whitespace_pattern)
+
+     logits_processor = copy(result)
+     # reset logits processor's internal state
+     logits_processor.init_state()
+     return logits_processor
+
+
+ def _get_guide_and_mode(
+     request: Union[CompletionRequest, ChatCompletionRequest]
+ ) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:
+
+     if request.guided_json:
+         json = request.guided_json
+         if isinstance(json, dict):
+             # turn dict into hashable string
+             json = json_dumps(json)
+         elif isinstance(json, BaseModel):
+             # use pydantic signature so that different model classes
+             # with the same fields will get hashed the same
+             json = str(json.__signature__)
+         return json, GuidedDecodingMode.JSON
+     elif request.guided_regex:
+         return request.guided_regex, GuidedDecodingMode.REGEX
+     elif request.guided_choice:
+         # choice just uses regex
+         choices = [
+             regex_escape(str(choice)) for choice in request.guided_choice
+         ]
+         choices_regex = "(" + "|".join(choices) + ")"
+         return choices_regex, GuidedDecodingMode.CHOICE
+     elif request.guided_grammar:
+         return request.guided_grammar, GuidedDecodingMode.GRAMMAR
+     elif (request.response_format is not None
+           and request.response_format.type == "json_object"):
+         return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR
+     else:
+         return None, None
+
+
+ @lru_cache(maxsize=32)
+ def _get_cached_logits_processor(guide: str,
+                                  tokenizer: PreTrainedTokenizerBase,
+                                  mode: GuidedDecodingMode,
+                                  whitespace_pattern: Union[str, None]):
+     if mode == GuidedDecodingMode.JSON:
+         return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern)
+     elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:
+         return RegexLogitsProcessor(guide, tokenizer)
+     elif mode == GuidedDecodingMode.GRAMMAR:
+         return CFGLogitsProcessor(guide, tokenizer)
+     else:
+         raise ValueError(f"Unknown guided decoding mode {mode}")
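For reference, a minimal sketch (not part of the wheel) of how a serving endpoint might attach one of these processors to SamplingParams; the request and tokenizer objects are assumed to come from the OpenAI serving layer listed above.

# Illustrative sketch only: obtain a guided-decoding logits processor and
# plug it into SamplingParams through its logits_processors hook.
from vllm import SamplingParams
from vllm.model_executor.guided_decoding.outlines_decoding import (
    get_outlines_guided_decoding_logits_processor)

async def build_sampling_params(request, tokenizer) -> SamplingParams:
    processor = await get_outlines_guided_decoding_logits_processor(
        request, tokenizer)
    logits_processors = [processor] if processor is not None else []
    # SamplingParams applies these callables to the logits before sampling.
    return SamplingParams(temperature=0.0,
                          logits_processors=logits_processors)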
vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -0,0 +1,184 @@
+ # Copyright 2024- the Outlines developers
+ # This file is adapted from
+ # https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ #     http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import copy
+ import json
+ import math
+ from collections import defaultdict
+ from functools import lru_cache
+ from typing import Callable, DefaultDict, Dict, List, Union
+
+ import torch
+ from outlines.fsm.fsm import CFGFSM, FSM, RegexFSM
+ from outlines.fsm.json_schema import build_regex_from_schema
+ from pydantic import BaseModel
+ from transformers import PreTrainedTokenizerBase
+
+
+ class BaseLogitsProcessor:
+
+     def __init__(self):
+         # Child class should use initialize in their init.
+         self.fsm: FSM
+
+     def init_state(self):
+         """Initialize the FSM states."""
+         self.fsm_state: DefaultDict[int, int] = defaultdict(int)
+
+     def __call__(self, input_ids: List[int],
+                  scores: torch.Tensor) -> torch.Tensor:
+         """Use the FSM to bias the logits before sampling the next token."""
+         seq_id = hash(tuple(input_ids))
+
+         if len(input_ids) == 0:
+             self.init_state()
+         else:
+             last_token = input_ids[-1]
+             last_seq_id = hash(tuple(input_ids[:-1]))
+             self.fsm_state[seq_id] = self.fsm.next_state(
+                 self.fsm_state[last_seq_id], last_token)
+
+         allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])
+
+         mask = torch.full((scores.shape[-1], ),
+                           -math.inf,
+                           device=scores.device)
+         mask[allowed_tokens] = 0
+         scores.add_(mask)
+         return scores
+
+
+ class RegexLogitsProcessor(BaseLogitsProcessor):
+
+     def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
+         """Compile the FSM that drives the regex-structured generation.
+
+         Parameters
+         ----------
+         regex_string
+             A string that represents a regular expression
+         tokenizer
+             The model's tokenizer
+
+         """
+         tokenizer = _adapt_tokenizer(tokenizer)
+         fsm = RegexFSM(regex_string, tokenizer)
+         self.fsm = fsm
+
+
+ class JSONLogitsProcessor(RegexLogitsProcessor):
+
+     def __init__(self, schema: Union[str, Dict, BaseModel],
+                  tokenizer: PreTrainedTokenizerBase,
+                  whitespace_pattern: Union[str, None]):
+         """Compile the FSM that drives the JSON-guided generation.
+
+         Parameters
+         ----------
+         schema
+             A JSON schema that encodes the structure we want the model to
+             generate
+         tokenizer
+             The model's tokenizer
+         whitespace_pattern
+             Pattern to use for JSON syntactic whitespace (doesn't impact
+             string literals)
+             Example: allow only a single space or newline with
+             `whitespace_pattern=r"[\n ]?"`
+         """
+         if isinstance(schema, type(BaseModel)):
+             schema_str = json.dumps(schema.model_json_schema())
+         elif isinstance(schema, Dict):
+             schema_str = json.dumps(schema)
+         elif isinstance(schema, str):
+             schema_str = schema
+         else:
+             raise ValueError(
+                 f"Cannot parse schema {schema}. The schema must be either "
+                 f"a Pydantic object, a dictionary or a string that contains "
+                 f"the JSON Schema specification")
+         regex_string = build_regex_from_schema(schema_str, whitespace_pattern)
+         super().__init__(regex_string, tokenizer)
+
+
+ class CFGLogitsProcessor(BaseLogitsProcessor):
+
+     def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
+         """Compile the FSM that drives the context free grammar generation.
+
+         Parameters
+         ----------
+         cfg
+             A string that represents a context-free grammar
+         tokenizer
+             The model's tokenizer
+
+         """
+         tokenizer = _adapt_tokenizer(tokenizer)
+         fsm = CFGFSM(cfg, tokenizer)
+         self.fsm = fsm
+
+     def init_state(self):
+         """Initialize state with a CFGFSM copy."""
+         super().init_state()
+         self.fsm = self.fsm.copy()
+
+
+ @lru_cache
+ def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
+     """Adapt vLLM's tokenizer to use to compile the FSM.
+
+     The API of Outlines tokenizers is slightly different to that of
+     `transformers`. The decoder of outlines, returns a list whereas
+     the decode of vLLM returns an str. To sync the vLLM decoder with
+     outlines internal api, the decoder should be adapted. In addition
+     we need to handle the missing spaces to Llama's tokenizer to be
+     able to compile FSMs for this model.
+
+     """
+     if getattr(tokenizer, "_outlines_adapted", False):
+         return tokenizer
+
+     tokenizer = copy.deepcopy(tokenizer)
+
+     tokenizer.vocabulary = tokenizer.get_vocab()
+     tokenizer.special_tokens = set(tokenizer.all_special_tokens)
+
+     def convert_token_to_string(token: str) -> str:
+         from transformers.file_utils import SPIECE_UNDERLINE
+
+         string = tokenizer.convert_tokens_to_string([token])
+
+         # A hack to handle missing spaces to HF's Llama tokenizers
+         if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
+             return " " + string
+
+         return string
+
+     def change_decoder(
+         decoder: Callable[[List[int]],
+                           str]) -> Callable[[List[int]], List[str]]:
+         """Sync vLLM's decoder with the outlines by returning list."""
+
+         def new_decoder(inp_tokens: List[int]) -> List[str]:
+             return [decoder(inp_tokens)]
+
+         return new_decoder
+
+     tokenizer.convert_token_to_string = convert_token_to_string
+     tokenizer.decode = change_decoder(tokenizer.decode)
+     setattr(tokenizer, "_outlines_adapted", True)  # noqa: B010
+
+     return tokenizer
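To make the masking step in BaseLogitsProcessor.__call__ concrete, here is a small self-contained illustration of the same pattern in plain PyTorch; the allowed-token list is hard-coded purely for demonstration and stands in for fsm.allowed_token_ids(state).

# Illustration only: the -inf mask pattern used above, on a fake logits row.
import math
import torch

vocab_size = 8
scores = torch.zeros(vocab_size)   # pretend logits for one sequence
allowed_tokens = [1, 4, 5]         # stand-in for fsm.allowed_token_ids(state)

mask = torch.full((vocab_size, ), -math.inf)
mask[allowed_tokens] = 0
scores.add_(mask)                  # disallowed tokens now carry -inf logits
print(scores)                      # only indices 1, 4, 5 remain finite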
vllm/model_executor/layers/__init__.py (file without changes)
vllm/model_executor/layers/activation.py
@@ -0,0 +1,173 @@
+ """Custom activation functions."""
+ import math
+ from typing import Optional
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from vllm import _custom_ops as ops
+ from vllm.distributed import (divide, get_tensor_model_parallel_rank,
+                               get_tensor_model_parallel_world_size)
+ from vllm.model_executor.layers.quantization import QuantizationConfig
+ from vllm.model_executor.utils import set_weight_attrs
+
+
+ class SiluAndMul(nn.Module):
+     """An activation function for SwiGLU.
+
+     The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
+
+     Shapes:
+         x: (num_tokens, 2 * d) or (batch_size, seq_len, 2 * d)
+         return: (num_tokens, d) or (batch_size, seq_len, d)
+     """
+
+     def _forward(self, x: torch.Tensor) -> torch.Tensor:
+         """PyTorch-native implementation equivalent to forward()."""
+         d = x.shape[-1] // 2
+         return F.silu(x[..., :d]) * x[..., d:]
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         d = x.shape[-1] // 2
+         output_shape = (x.shape[:-1] + (d, ))
+         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+         ops.silu_and_mul(out, x)
+         return out
+
+
+ class GeluAndMul(nn.Module):
+     """An activation function for GeGLU.
+
+     The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
+
+     Shapes:
+         x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
+         return: (batch_size, seq_len, d) or (num_tokens, d)
+     """
+
+     def __init__(self, approximate: str = "none"):
+         super().__init__()
+         self.approximate = approximate
+         if approximate not in ("none", "tanh"):
+             raise ValueError(f"Unknown approximate mode: {approximate}")
+
+     def _forward(self, x: torch.Tensor) -> torch.Tensor:
+         """PyTorch-native implementation equivalent to forward()."""
+         d = x.shape[-1] // 2
+         return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         d = x.shape[-1] // 2
+         output_shape = (x.shape[:-1] + (d, ))
+         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+         if self.approximate == "none":
+             ops.gelu_and_mul(out, x)
+         elif self.approximate == "tanh":
+             ops.gelu_tanh_and_mul(out, x)
+         return out
+
+     def extra_repr(self) -> str:
+         return f'approximate={repr(self.approximate)}'
+
+
+ class NewGELU(nn.Module):
+
+     def _forward(self, x: torch.Tensor) -> torch.Tensor:
+         """PyTorch-native implementation equivalent to forward()."""
+         c = math.sqrt(2.0 / math.pi)
+         return 0.5 * x * (1.0 + torch.tanh(c *
+                                            (x + 0.044715 * torch.pow(x, 3.0))))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         out = torch.empty_like(x)
+         ops.gelu_new(out, x)
+         return out
+
+
+ class FastGELU(nn.Module):
+
+     def _forward(self, x: torch.Tensor) -> torch.Tensor:
+         """PyTorch-native implementation equivalent to forward()."""
+         return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
+                                            (1.0 + 0.044715 * x * x)))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         out = torch.empty_like(x)
+         ops.gelu_fast(out, x)
+         return out
+
+
+ class ScaledActivation(nn.Module):
+     """An activation function with post-scale parameters.
+
+     This is used for some quantization methods like AWQ.
+     """
+
+     def __init__(
+         self,
+         act_module: nn.Module,
+         intermediate_size: int,
+         input_is_parallel: bool = True,
+         params_dtype: Optional[torch.dtype] = None,
+     ):
+         super().__init__()
+         self.act = act_module
+         self.input_is_parallel = input_is_parallel
+         if input_is_parallel:
+             tp_size = get_tensor_model_parallel_world_size()
+             intermediate_size_per_partition = divide(intermediate_size,
+                                                      tp_size)
+         else:
+             intermediate_size_per_partition = intermediate_size
+         if params_dtype is None:
+             params_dtype = torch.get_default_dtype()
+         self.scales = nn.Parameter(
+             torch.empty(intermediate_size_per_partition, dtype=params_dtype))
+         set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return self.act(x) / self.scales
+
+     def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+         param_data = param.data
+         if self.input_is_parallel:
+             tp_rank = get_tensor_model_parallel_rank()
+             shard_size = param_data.shape[0]
+             start_idx = tp_rank * shard_size
+             loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
+         assert param_data.shape == loaded_weight.shape
+         param_data.copy_(loaded_weight)
+
+
+ _ACTIVATION_REGISTRY = {
+     "gelu": nn.GELU(),
+     "gelu_fast": FastGELU(),
+     "gelu_new": NewGELU(),
+     "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
+     "relu": nn.ReLU(),
+ }
+
+
+ def get_act_fn(
+     act_fn_name: str,
+     quant_config: Optional[QuantizationConfig] = None,
+     intermediate_size: Optional[int] = None,
+     input_is_parallel: bool = True,
+     params_dtype: Optional[torch.dtype] = None,
+ ) -> nn.Module:
+     """Get an activation function by name."""
+     act_fn_name = act_fn_name.lower()
+     if act_fn_name not in _ACTIVATION_REGISTRY:
+         raise ValueError(
+             f"Activation function {act_fn_name!r} is not supported.")
+
+     act_fn = _ACTIVATION_REGISTRY[act_fn_name]
+     if (quant_config is not None
+             and act_fn_name in quant_config.get_scaled_act_names()):
+         if intermediate_size is None:
+             raise ValueError("intermediate_size must be specified for scaled "
+                              "activation functions.")
+         return ScaledActivation(act_fn, intermediate_size, input_is_parallel,
+                                 params_dtype)
+     return act_fn
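A quick, illustrative check of the shape contract documented for SiluAndMul (last dimension 2*d in, d out), using the same math as the PyTorch-native _forward path so it runs without the custom kernels:

# Illustration only: mirrors SiluAndMul._forward on CPU.
import torch
import torch.nn.functional as F

x = torch.randn(4, 2 * 128)          # (num_tokens, 2 * d)
d = x.shape[-1] // 2
y = F.silu(x[..., :d]) * x[..., d:]  # same formula as _forward above
assert y.shape == (4, 128)           # (num_tokens, d)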
vllm/model_executor/layers/fused_moe/__init__.py
@@ -0,0 +1,7 @@
+ from vllm.model_executor.layers.fused_moe.fused_moe import (
+     fused_moe, get_config_file_name)
+
+ __all__ = [
+     "fused_moe",
+     "get_config_file_name",
+ ]
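The configs/ files listed above are keyed by expert count E, intermediate size N, device name, and an optional dtype tag; get_config_file_name, re-exported here, produces names of that form. The sketch below reconstructs the naming scheme from the listed filenames; the helper name, its signature, and the device-name handling are assumptions to verify against fused_moe.py.

# Hedged sketch of the filename scheme behind the JSON configs listed above.
from typing import Optional

import torch


def config_file_name(E: int, N: int, dtype: Optional[str] = None) -> str:
    # Hypothetical helper mirroring get_config_file_name; using
    # torch.cuda.get_device_name() for the device part is an assumption.
    device_name = torch.cuda.get_device_name().replace(" ", "_")
    dtype_part = f",dtype={dtype}" if dtype is not None else ""
    return f"E={E},N={N},device_name={device_name}{dtype_part}.json"

# On an A100-SXM4-40GB, E=16 and N=1344 would yield
# "E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json".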
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
@@ -0,0 +1,146 @@
+ {
+     "1": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 32,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "2": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "4": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "8": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "16": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "24": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "32": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "48": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "64": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "96": {
+         "BLOCK_SIZE_M": 16,
+         "BLOCK_SIZE_N": 64,
+         "BLOCK_SIZE_K": 256,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "128": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 1,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "256": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 32,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "512": {
+         "BLOCK_SIZE_M": 32,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 128,
+         "GROUP_SIZE_M": 64,
+         "num_warps": 4,
+         "num_stages": 4
+     },
+     "1024": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "1536": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "2048": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "3072": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     },
+     "4096": {
+         "BLOCK_SIZE_M": 128,
+         "BLOCK_SIZE_N": 128,
+         "BLOCK_SIZE_K": 64,
+         "GROUP_SIZE_M": 16,
+         "num_warps": 8,
+         "num_stages": 4
+     }
+ }
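Each top-level key in this config is a token-batch size M, and its value is the Triton tile configuration (block sizes, group size, warp count, pipeline stages) tuned for that size on the named GPU. Below is a minimal sketch of the kind of nearest-key fallback a kernel launcher could use when an exact M is not present; the real selection logic lives in fused_moe.py and may differ.

# Illustrative nearest-batch-size lookup over a tuned config dict like the
# one above (assumption: the launcher falls back to the closest tuned M).
import json


def pick_config(configs: dict, M: int) -> dict:
    if str(M) in configs:
        return configs[str(M)]
    closest = min(configs, key=lambda k: abs(int(k) - M))
    return configs[closest]


with open("E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json") as f:
    configs = json.load(f)
print(pick_config(configs, 40))  # falls back to a nearby entry such as "32"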