vllm_npu-0.4.2-py3-none-any.whl
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/spec_decode/util.py
ADDED
@@ -0,0 +1,228 @@
from contextlib import contextmanager
from itertools import chain
from typing import Dict, List, Tuple

import torch

from vllm.sequence import (Logprob, SamplerOutput, SequenceGroupMetadata,
                           SequenceGroupOutput, SequenceOutput)

SeqId = int


def get_all_seq_ids(
        seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]:
    """Given a list of SequenceGroupMetadata, create a list of all
    sequence ids.
    """
    return list(
        chain.from_iterable([
            seq_group_metadata.seq_data.keys()
            for seq_group_metadata in seq_group_metadata_list
        ]))


def get_all_num_logprobs(
        seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]:
    """Given a list of SequenceGroupMetadata, create a list of all num_logprobs.

    If the sampling params do not call for any logprobs, return 0 for that
    sequence.
    """

    all_num_logprobs = []
    for seq_group_metadata in seq_group_metadata_list:
        num_logprobs = seq_group_metadata.sampling_params.logprobs
        if seq_group_metadata.sampling_params.logprobs is None:
            num_logprobs = 0
        all_num_logprobs.append(num_logprobs)

    return all_num_logprobs


def get_sampled_token_logprobs(
        # shape [num_steps, batch_size, vocab_size]
        logprob_tensor: torch.Tensor,
        sampled_token_ids: torch.Tensor,  # shape [num_steps, batch_size]
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Get the logprobs for the sampled tokens. Returns the ranks and logprobs.
    """
    num_steps, batch_size, vocab_size = logprob_tensor.shape

    selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1),
                                       torch.arange(batch_size),
                                       sampled_token_ids, ]
    expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
        -1, -1, vocab_size)
    sampled_token_ids_ranks = (logprob_tensor >=
                               expanded_selected_logprobs).sum(-1)

    return sampled_token_ids_ranks, selected_logprobs


def create_sequence_group_output(
        token_id: int,
        token_id_logprob_rank: int,
        token_id_logprob: float,
        seq_id: SeqId,
        topk_token_ids: List[int],
        topk_logprobs: List[float],
) -> SequenceGroupOutput:
    """Create a SequenceGroupOutput given the sampling results.

    Args:
        token_id (int): The sampled token for the sequence.
        token_id_logprob_rank (int): The logprob rank of the sampled token.
        token_id_logprob (float): The logprob value of the sampled token.
        seq_id (int): The sequence id.
        topk_token_ids (List[int]): The list of top-k token ids.
        topk_logprobs (List[float]): The list of top-k logprobs.
    """
    # vLLM logprobs always include the sampled token. In addition, the user may
    # request topk-logprobs (where top-k varies per user up to max_logprobs).
    logprobs: Dict[int, Logprob] = {
        token_id: Logprob(
            logprob=token_id_logprob,
            rank=token_id_logprob_rank,
        ),
    }
    logprobs.update({
        topk_token_ids[topk_logprob_index]: Logprob(
            logprob=topk_logprobs[topk_logprob_index],
            rank=topk_logprob_index + 1,
        )
        for topk_logprob_index, _ in enumerate(topk_token_ids)
    })

    return SequenceGroupOutput(
        samples=[
            SequenceOutput(parent_seq_id=seq_id,
                           output_token=token_id,
                           logprobs=logprobs)
        ],
        # TODO add prompt logprobs support.
        prompt_logprobs=None,
    )


def split_batch_by_proposal_len(
    seq_group_metadata_list: List[SequenceGroupMetadata],
    proposal_lens: List[int], select_proposal_len_zero: bool
) -> Tuple[List[SequenceGroupMetadata], List[int]]:
    """Utility function that splits a batch based on whether the proposal len is
    zero or not. We should remove this once vLLM supports per-sequence proposal
    lens in a batch.
    """

    if select_proposal_len_zero:
        predicate = lambda proposal_len: proposal_len == 0
    else:
        predicate = lambda proposal_len: proposal_len != 0

    indices = [
        i for i, (_, proposal_len
                  ) in enumerate(zip(seq_group_metadata_list, proposal_lens))
        if predicate(proposal_len)
    ]
    seq_groups = [
        seq_group for seq_group, proposal_len in zip(
            seq_group_metadata_list, proposal_lens) if predicate(proposal_len)
    ]

    return seq_groups, indices


def sampler_output_to_torch(
    sampler_output_list: List[SamplerOutput], sampler_transposed: bool
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Utility function which converts a list of SamplerOutput to tensors.

    sampler_transposed here is used as the indicator for whether
    we need do additional tensor transpose logic here.

    Returns:
        sampled_token_ids: torch.Tensor
            shape: [batch_size, len(sampler_output_list)]

        sampled_token_probs: torch.Tensor
            shape: [batch_size, len(sampler_output_list), vocab_size]
    """

    # shape: [batch_size, num_sampler_output, vocab_size]
    sampled_token_probs = torch.stack(
        [
            sampler_output.sampled_token_probs
            for sampler_output in sampler_output_list
        ],
        dim=0,
    )

    if sampler_transposed:
        sampled_token_probs = sampled_token_probs.transpose(0, 1)

    # shape: [batch_size, num_sampler_output, vocab_size]
    sampled_token_logprobs = torch.stack(
        [sampler_output.logprobs for sampler_output in sampler_output_list],
        dim=0,
    )

    if sampler_transposed:
        sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1)

    # shape: [batch_size, num_sampler_output]
    sampled_token_ids = torch.stack(
        [
            sampler_output.sampled_token_ids.flatten()
            for sampler_output in sampler_output_list
        ],
        dim=0,
    )
    if sampler_transposed:
        sampled_token_ids = sampled_token_ids.transpose(0, 1)

    return sampled_token_ids, sampled_token_probs, sampled_token_logprobs


def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int,
                              vocab_size: int, device: str) -> None:
    """Helper method which mocks out the GPU tensors in SamplerOutput with dummy
    values. This will be removed in PR 7/9.
    https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer
    """
    values = [
        sampler_output.sampled_token_probs, sampler_output.sampled_token_ids
    ]
    assert all(v is None for v in values) or not any(v is None for v in values)
    if not any(v is None for v in values):
        # Do nothing if the tensors are already created (usually in unit tests).
        return

    # Softmax to ensure valid probs.
    sampler_output.sampled_token_probs = torch.nn.functional.softmax(
        torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device),
        dim=-1)

    sampler_output.sampled_token_ids = torch.randint(low=10,
                                                     high=100,
                                                     size=(batch_size, ),
                                                     dtype=torch.long,
                                                     device=device)


@contextmanager
def nvtx_range(msg, *args, **kwargs):
    """
    Context manager / decorator that pushes an NVTX range at the beginning
    of its scope, and pops it at the end. If extra arguments are given,
    they are passed as arguments to msg.format().

    If running with cuda graphs, you must enable nsys cuda graph profiling.

    Arguments:
        msg (string): message to associate with the range
    """
    torch.cuda.nvtx.range_push(msg.format(*args, **kwargs))
    try:
        yield
    finally:
        torch.cuda.nvtx.range_pop()
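A quick orientation sketch, not part of the wheel itself: the tensor helper
get_sampled_token_logprobs above can be exercised standalone with toy shapes.
The shapes and values below are invented purely for illustration.

# Toy illustration of get_sampled_token_logprobs; shapes and values are made up.
import torch

from vllm.spec_decode.util import get_sampled_token_logprobs

num_steps, batch_size, vocab_size = 2, 3, 8

# Per-step logprobs over the vocabulary, and the token id sampled at each step.
logprob_tensor = torch.log_softmax(
    torch.randn(num_steps, batch_size, vocab_size), dim=-1)
sampled_token_ids = torch.randint(vocab_size, (num_steps, batch_size))

ranks, selected = get_sampled_token_logprobs(logprob_tensor, sampled_token_ids)
# Rank 1 means the sampled token had the highest logprob at that position.
assert ranks.shape == selected.shape == (num_steps, batch_size)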
vllm/test_utils.py
ADDED
@@ -0,0 +1,41 @@
import ray

from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment)
from vllm.utils import get_open_port


def init_test_distributed_environment(
    pipeline_parallel_size: int,
    tensor_parallel_size: int,
    rank: int,
    distributed_init_port: str,
    local_rank: int = -1,
) -> None:
    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
    init_distributed_environment(
        world_size=pipeline_parallel_size * tensor_parallel_size,
        rank=rank,
        distributed_init_method=distributed_init_method,
        local_rank=local_rank)
    ensure_model_parallel_initialized(tensor_parallel_size,
                                      pipeline_parallel_size)


def multi_process_tensor_parallel(
    tensor_parallel_size: int,
    test_target,
) -> None:
    # Using ray helps debugging the error when it failed
    # as compared to multiprocessing.
    ray.init()

    distributed_init_port = get_open_port()
    refs = []
    for rank in range(tensor_parallel_size):
        refs.append(
            test_target.remote(tensor_parallel_size, rank,
                               distributed_init_port))
    ray.get(refs)

    ray.shutdown()
vllm/transformers_utils/__init__.py
File without changes

vllm/transformers_utils/config.py
ADDED
@@ -0,0 +1,58 @@
from typing import Dict, Optional

from transformers import AutoConfig, PretrainedConfig

from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                             JAISConfig, MPTConfig, RWConfig)

_CONFIG_REGISTRY: Dict[str, PretrainedConfig] = {
    "chatglm": ChatGLMConfig,
    "dbrx": DbrxConfig,
    "mpt": MPTConfig,
    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
    "jais": JAISConfig,
}


def get_config(model: str,
               trust_remote_code: bool,
               revision: Optional[str] = None,
               code_revision: Optional[str] = None) -> PretrainedConfig:
    try:
        config = AutoConfig.from_pretrained(
            model,
            trust_remote_code=trust_remote_code,
            revision=revision,
            code_revision=code_revision)
    except ValueError as e:
        if (not trust_remote_code and
                "requires you to execute the configuration file" in str(e)):
            err_msg = (
                "Failed to load the model config. If the model is a custom "
                "model not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    if config.model_type in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[config.model_type]
        config = config_class.from_pretrained(model,
                                              revision=revision,
                                              code_revision=code_revision)
    return config


def get_hf_text_config(config: PretrainedConfig):
    """Get the "sub" config relevant to llm for multi modal models.
    No op for pure text models.
    """
    if hasattr(config, "text_config"):
        # The code operates under the assumption that text_config should have
        # `num_attention_heads` (among others). Assert here to fail early
        # if transformers config doesn't align with this assumption.
        assert hasattr(config.text_config, "num_attention_heads")
        return config.text_config
    else:
        return config
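A hedged sketch of how this loader is meant to be called; the model name is
just an example and the call fetches the config from the Hugging Face Hub.

from vllm.transformers_utils.config import get_config, get_hf_text_config

# Resolves to a HF PretrainedConfig, swapping in vLLM's own config classes
# for model types listed in _CONFIG_REGISTRY (chatglm, dbrx, mpt, ...).
config = get_config("facebook/opt-125m", trust_remote_code=False)

# Multi-modal configs are unwrapped to their text_config; pure text models
# such as OPT come back unchanged.
text_config = get_hf_text_config(config)
print(type(config).__name__, text_config.num_attention_heads)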
vllm/transformers_utils/configs/__init__.py
ADDED
@@ -0,0 +1,16 @@
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.dbrx import DbrxConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.mpt import MPTConfig

__all__ = [
    "ChatGLMConfig",
    "DbrxConfig",
    "MPTConfig",
    "RWConfig",
    "JAISConfig",
]
vllm/transformers_utils/configs/chatglm.py
ADDED
@@ -0,0 +1,68 @@
# coding=utf-8
# Adapted from
# https://github.com/THUDM/ChatGLM2-6B
from transformers import PretrainedConfig


class ChatGLMConfig(PretrainedConfig):
    model_type = "chatglm"
    attribute_map = {
        "num_hidden_layers": "num_layers",
        "n_head_kv": "multi_query_group_num",
    }

    def __init__(self,
                 num_layers=28,
                 padded_vocab_size=65024,
                 hidden_size=4096,
                 ffn_hidden_size=13696,
                 kv_channels=128,
                 num_attention_heads=32,
                 seq_length=2048,
                 hidden_dropout=0.0,
                 attention_dropout=0.0,
                 layernorm_epsilon=1e-5,
                 rmsnorm=True,
                 apply_residual_connection_post_layernorm=False,
                 post_layer_norm=True,
                 add_bias_linear=False,
                 add_qkv_bias=False,
                 interleaved_qkv=False,
                 bias_dropout_fusion=True,
                 multi_query_attention=False,
                 multi_query_group_num=1,
                 apply_query_key_layer_scaling=True,
                 attention_softmax_in_fp32=True,
                 fp32_residual_connection=False,
                 quantization_bit=0,
                 pre_seq_len=None,
                 prefix_projection=False,
                 **kwargs):
        self.num_layers = num_layers
        self.vocab_size = padded_vocab_size
        self.padded_vocab_size = padded_vocab_size
        self.hidden_size = hidden_size
        self.ffn_hidden_size = ffn_hidden_size
        self.kv_channels = kv_channels
        self.num_attention_heads = num_attention_heads
        self.seq_length = seq_length
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.layernorm_epsilon = layernorm_epsilon
        self.rmsnorm = rmsnorm
        self.apply_residual_connection_post_layernorm = (
            apply_residual_connection_post_layernorm)
        self.post_layer_norm = post_layer_norm
        self.add_bias_linear = add_bias_linear
        self.add_qkv_bias = add_qkv_bias
        self.bias_dropout_fusion = bias_dropout_fusion
        self.multi_query_attention = multi_query_attention
        self.multi_query_group_num = multi_query_group_num
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.fp32_residual_connection = fp32_residual_connection
        self.quantization_bit = quantization_bit
        self.pre_seq_len = pre_seq_len
        self.prefix_projection = prefix_projection
        self.interleaved_qkv = interleaved_qkv
        super().__init__(**kwargs)
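Small illustration, not part of the wheel, of what attribute_map buys here:
generic vLLM code can read HF-style attribute names from a ChatGLM config.

from vllm.transformers_utils.configs import ChatGLMConfig

cfg = ChatGLMConfig(num_layers=2, multi_query_group_num=2)
# "num_hidden_layers" and "n_head_kv" are aliases declared in attribute_map.
assert cfg.num_hidden_layers == cfg.num_layers == 2
assert cfg.n_head_kv == cfg.multi_query_group_num == 2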
vllm/transformers_utils/configs/dbrx.py
ADDED
@@ -0,0 +1,278 @@
# yapf: disable
# ruff: noqa: E501
# coding=utf-8
# Copied from
# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
"""Dbrx configuration."""

from typing import Any, Optional

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}  # type: ignore


class DbrxAttentionConfig(PretrainedConfig):
    """Configuration class for Dbrx Attention.

    [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*, defaults to None):
            If not `None`, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
        rope_theta (float): The base frequency for rope.
    """

    def __init__(
        self,
        attn_pdrop: float = 0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: str, **kwargs: Any
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        if config_dict.get("model_type") == "dbrx":
            config_dict = config_dict["attn_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                "You are using a model of type %s to instantiate a model of "
                "type %s. This is not supported for all configurations of "
                "models and can yield errors.",
                config_dict["model_type"], cls.model_type)

        return cls.from_dict(config_dict, **kwargs)


class DbrxFFNConfig(PretrainedConfig):
    """Configuration class for Dbrx FFN.

    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
    the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
            The dict should have a key 'name' with the value being the name of
            the activation function along with any additional keyword arguments.
        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
            This should only be used for benchmarking purposes.
    """

    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1,
        uniform_expert_assignment: bool = False,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {"name": "silu"}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights
        self.uniform_expert_assignment = uniform_expert_assignment

        for k in ["model_type"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: str, **kwargs: Any
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs
        )

        if config_dict.get("model_type") == "dbrx":
            config_dict = config_dict["ffn_config"]

        if (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
                "You are using a model of type %s to instantiate a model of "
                "type %s. This is not supported for all "
                "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)

        return cls.from_dict(config_dict, **kwargs)


class DbrxConfig(PretrainedConfig):
    """Configuration class for Dbrx.

    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        d_model (`int`, *optional*, defaults to 6144):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 32768):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 100352):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `False`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.


    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "dbrx"
    attribute_map = {
        "num_attention_heads": "n_heads",
        "hidden_size": "d_model",
        "num_hidden_layers": "n_layers",
        "max_position_embeddings": "max_seq_len",
    }

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        max_seq_len: int = 2048,
        vocab_size: int = 32000,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_config: Optional[DbrxAttentionConfig] = None,
        ffn_config: Optional[DbrxFFNConfig] = None,
        use_cache: bool = True,
        initializer_range: float = 0.02,
        output_router_logits: bool = False,
        router_aux_loss_coef: float = 0.05,
        **kwargs: Any,
    ):
        if attn_config is None:
            self.attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            self.attn_config = DbrxAttentionConfig(**attn_config)
        else:
            self.attn_config = attn_config

        if ffn_config is None:
            self.ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            self.ffn_config = DbrxFFNConfig(**ffn_config)
        else:
            self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError(
                "tie_word_embeddings is not supported for Dbrx models."
            )

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
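Illustrative only: the nested attn/ffn sections can be passed as plain dicts,
as they would appear in a checkpoint's config.json, and are promoted to
DbrxAttentionConfig / DbrxFFNConfig instances. The numbers below are made up.

from vllm.transformers_utils.configs.dbrx import DbrxConfig

cfg = DbrxConfig(
    d_model=1024,
    n_heads=8,
    attn_config={"kv_n_heads": 2, "rope_theta": 500000.0},
    ffn_config={"moe_num_experts": 8, "moe_top_k": 2},
)
assert cfg.attn_config.kv_n_heads == 2
assert cfg.ffn_config.moe_top_k == 2
# attribute_map exposes the HF-style aliases as well.
assert cfg.num_attention_heads == cfg.n_heads == 8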