PyPI - vllm-npu - Versions diffs - 0.4.2__py3-none-any.whl - Mend

vllm-npu 0.4.2__py3-none-any.whl

Files changed (219) hide show

vllm/__init__.py +23 -0
vllm/_custom_ops.py +251 -0
vllm/attention/__init__.py +13 -0
vllm/attention/backends/__init__.py +0 -0
vllm/attention/backends/abstract.py +127 -0
vllm/attention/backends/flash_attn.py +271 -0
vllm/attention/backends/flashinfer.py +220 -0
vllm/attention/backends/rocm_flash_attn.py +374 -0
vllm/attention/backends/torch_sdpa.py +250 -0
vllm/attention/backends/xformers.py +393 -0
vllm/attention/layer.py +56 -0
vllm/attention/ops/__init__.py +0 -0
vllm/attention/ops/paged_attn.py +216 -0
vllm/attention/ops/prefix_prefill.py +792 -0
vllm/attention/ops/triton_flash_attention.py +810 -0
vllm/attention/selector.py +91 -0
vllm/block.py +84 -0
vllm/config.py +1225 -0
vllm/core/__init__.py +0 -0
vllm/core/block/__init__.py +0 -0
vllm/core/block/block_table.py +295 -0
vllm/core/block/common.py +199 -0
vllm/core/block/cpu_gpu_block_allocator.py +228 -0
vllm/core/block/interfaces.py +205 -0
vllm/core/block/naive_block.py +318 -0
vllm/core/block/prefix_caching_block.py +606 -0
vllm/core/block_manager_v1.py +625 -0
vllm/core/block_manager_v2.py +258 -0
vllm/core/evictor_v1.py +105 -0
vllm/core/evictor_v2.py +127 -0
vllm/core/interfaces.py +113 -0
vllm/core/policy.py +45 -0
vllm/core/scheduler.py +1163 -0
vllm/distributed/__init__.py +3 -0
vllm/distributed/communication_op.py +237 -0
vllm/distributed/device_communicators/__init__.py +0 -0
vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
vllm/distributed/device_communicators/pynccl.py +287 -0
vllm/distributed/device_communicators/pynccl_utils.py +66 -0
vllm/distributed/parallel_state.py +339 -0
vllm/distributed/utils.py +136 -0
vllm/engine/__init__.py +0 -0
vllm/engine/arg_utils.py +649 -0
vllm/engine/async_llm_engine.py +737 -0
vllm/engine/llm_engine.py +784 -0
vllm/engine/metrics.py +368 -0
vllm/engine/output_processor/__init__.py +0 -0
vllm/engine/output_processor/interfaces.py +76 -0
vllm/engine/output_processor/multi_step.py +142 -0
vllm/engine/output_processor/single_step.py +284 -0
vllm/engine/output_processor/stop_checker.py +101 -0
vllm/engine/output_processor/util.py +19 -0
vllm/entrypoints/__init__.py +0 -0
vllm/entrypoints/api_server.py +119 -0
vllm/entrypoints/llm.py +259 -0
vllm/entrypoints/openai/__init__.py +0 -0
vllm/entrypoints/openai/api_server.py +186 -0
vllm/entrypoints/openai/cli_args.py +115 -0
vllm/entrypoints/openai/protocol.py +460 -0
vllm/entrypoints/openai/serving_chat.py +392 -0
vllm/entrypoints/openai/serving_completion.py +347 -0
vllm/entrypoints/openai/serving_engine.py +234 -0
vllm/envs.py +217 -0
vllm/executor/__init__.py +0 -0
vllm/executor/cpu_executor.py +152 -0
vllm/executor/distributed_gpu_executor.py +115 -0
vllm/executor/executor_base.py +115 -0
vllm/executor/gpu_executor.py +150 -0
vllm/executor/multiproc_worker_utils.py +263 -0
vllm/executor/neuron_executor.py +91 -0
vllm/executor/ray_gpu_executor.py +327 -0
vllm/executor/ray_utils.py +119 -0
vllm/logger.py +153 -0
vllm/logging/__init__.py +5 -0
vllm/logging/formatter.py +15 -0
vllm/lora/__init__.py +0 -0
vllm/lora/fully_sharded_layers.py +262 -0
vllm/lora/layers.py +1181 -0
vllm/lora/lora.py +167 -0
vllm/lora/models.py +645 -0
vllm/lora/punica.py +213 -0
vllm/lora/request.py +32 -0
vllm/lora/utils.py +98 -0
vllm/lora/worker_manager.py +251 -0
vllm/model_executor/__init__.py +7 -0
vllm/model_executor/guided_decoding/__init__.py +25 -0
vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
vllm/model_executor/layers/__init__.py +0 -0
vllm/model_executor/layers/activation.py +173 -0
vllm/model_executor/layers/fused_moe/__init__.py +7 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
vllm/model_executor/layers/layernorm.py +71 -0
vllm/model_executor/layers/linear.py +709 -0
vllm/model_executor/layers/logits_processor.py +115 -0
vllm/model_executor/layers/ops/__init__.py +0 -0
vllm/model_executor/layers/ops/rand.py +157 -0
vllm/model_executor/layers/ops/sample.py +406 -0
vllm/model_executor/layers/quantization/__init__.py +35 -0
vllm/model_executor/layers/quantization/aqlm.py +376 -0
vllm/model_executor/layers/quantization/awq.py +175 -0
vllm/model_executor/layers/quantization/base_config.py +97 -0
vllm/model_executor/layers/quantization/fp8.py +265 -0
vllm/model_executor/layers/quantization/gptq.py +224 -0
vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
vllm/model_executor/layers/quantization/marlin.py +227 -0
vllm/model_executor/layers/quantization/schema.py +84 -0
vllm/model_executor/layers/quantization/squeezellm.py +137 -0
vllm/model_executor/layers/rejection_sampler.py +405 -0
vllm/model_executor/layers/rotary_embedding.py +525 -0
vllm/model_executor/layers/sampler.py +1051 -0
vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
vllm/model_executor/model_loader/__init__.py +30 -0
vllm/model_executor/model_loader/loader.py +362 -0
vllm/model_executor/model_loader/neuron.py +136 -0
vllm/model_executor/model_loader/tensorizer.py +368 -0
vllm/model_executor/model_loader/utils.py +41 -0
vllm/model_executor/model_loader/weight_utils.py +372 -0
vllm/model_executor/models/__init__.py +119 -0
vllm/model_executor/models/baichuan.py +410 -0
vllm/model_executor/models/bloom.py +327 -0
vllm/model_executor/models/chatglm.py +386 -0
vllm/model_executor/models/commandr.py +373 -0
vllm/model_executor/models/dbrx.py +413 -0
vllm/model_executor/models/decilm.py +122 -0
vllm/model_executor/models/deepseek.py +438 -0
vllm/model_executor/models/falcon.py +444 -0
vllm/model_executor/models/gemma.py +393 -0
vllm/model_executor/models/gpt2.py +266 -0
vllm/model_executor/models/gpt_bigcode.py +274 -0
vllm/model_executor/models/gpt_j.py +281 -0
vllm/model_executor/models/gpt_neox.py +295 -0
vllm/model_executor/models/internlm2.py +323 -0
vllm/model_executor/models/jais.py +333 -0
vllm/model_executor/models/llama.py +442 -0
vllm/model_executor/models/llava.py +239 -0
vllm/model_executor/models/minicpm.py +531 -0
vllm/model_executor/models/mixtral.py +583 -0
vllm/model_executor/models/mixtral_quant.py +404 -0
vllm/model_executor/models/mpt.py +295 -0
vllm/model_executor/models/olmo.py +356 -0
vllm/model_executor/models/opt.py +349 -0
vllm/model_executor/models/orion.py +319 -0
vllm/model_executor/models/phi.py +300 -0
vllm/model_executor/models/qwen.py +284 -0
vllm/model_executor/models/qwen2.py +367 -0
vllm/model_executor/models/qwen2_moe.py +447 -0
vllm/model_executor/models/stablelm.py +301 -0
vllm/model_executor/models/starcoder2.py +302 -0
vllm/model_executor/models/xverse.py +366 -0
vllm/model_executor/sampling_metadata.py +588 -0
vllm/model_executor/utils.py +35 -0
vllm/outputs.py +150 -0
vllm/py.typed +2 -0
vllm/sampling_params.py +340 -0
vllm/sequence.py +766 -0
vllm/spec_decode/__init__.py +0 -0
vllm/spec_decode/batch_expansion.py +397 -0
vllm/spec_decode/interfaces.py +73 -0
vllm/spec_decode/metrics.py +191 -0
vllm/spec_decode/multi_step_worker.py +203 -0
vllm/spec_decode/ngram_worker.py +176 -0
vllm/spec_decode/spec_decode_worker.py +472 -0
vllm/spec_decode/top1_proposer.py +200 -0
vllm/spec_decode/util.py +228 -0
vllm/test_utils.py +41 -0
vllm/transformers_utils/__init__.py +0 -0
vllm/transformers_utils/config.py +58 -0
vllm/transformers_utils/configs/__init__.py +16 -0
vllm/transformers_utils/configs/chatglm.py +68 -0
vllm/transformers_utils/configs/dbrx.py +278 -0
vllm/transformers_utils/configs/falcon.py +87 -0
vllm/transformers_utils/configs/jais.py +236 -0
vllm/transformers_utils/configs/mpt.py +178 -0
vllm/transformers_utils/detokenizer.py +313 -0
vllm/transformers_utils/tokenizer.py +149 -0
vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
vllm/transformers_utils/tokenizers/__init__.py +5 -0
vllm/transformers_utils/tokenizers/baichuan.py +255 -0
vllm/usage/__init__.py +0 -0
vllm/usage/usage_lib.py +209 -0
vllm/utils.py +677 -0
vllm/worker/__init__.py +0 -0
vllm/worker/cache_engine.py +105 -0
vllm/worker/cpu_model_runner.py +346 -0
vllm/worker/cpu_worker.py +321 -0
vllm/worker/model_runner.py +1168 -0
vllm/worker/neuron_model_runner.py +196 -0
vllm/worker/neuron_worker.py +98 -0
vllm/worker/worker.py +345 -0
vllm/worker/worker_base.py +146 -0
vllm_npu-0.4.2.dist-info/LICENSE +201 -0
vllm_npu-0.4.2.dist-info/METADATA +173 -0
vllm_npu-0.4.2.dist-info/RECORD +219 -0
vllm_npu-0.4.2.dist-info/WHEEL +5 -0
vllm_npu-0.4.2.dist-info/top_level.txt +1 -0

vllm/core/block/naive_block.py ADDED Viewed

@@ -0,0 +1,318 @@
+from typing import Dict, FrozenSet, Iterable, List, Optional, Set
+from vllm.core.block.common import (CopyOnWriteTracker, RefCounter,
+                                    get_all_blocks_recursively)
+from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
+Refcount = int  # Alias for int; presumably meant for annotating block reference counts (not used in the visible code) -- TODO confirm
+class NaiveBlockAllocator(BlockAllocator):
+    """A simple block allocator that manages blocks of memory without prefix
+    caching.
+    Args:
+        create_block (Block.Factory): A factory function for creating new
+            blocks. This is used when a NaiveBlockAllocator is composed within
+            a prefix caching allocator -- the naive block allocator must
+            construct prefix caching blocks (but shouldn't know anything else
+            about them).
+        num_blocks (int): The total number of blocks to manage.
+        block_size (int): The size of each block in tokens.
+        block_ids (Optional[Iterable[int]], optional): An optional iterable of
+            block IDs. If not provided, block IDs will be assigned sequentially
+            from 0 to num_blocks - 1.
+    """
+    def __init__(
+        self,
+        create_block: Block.Factory,
+        num_blocks: int,
+        block_size: int,
+        block_ids: Optional[Iterable[int]] = None,
+    ):
+        if block_ids is None:
+            block_ids = range(num_blocks)
+        self._free_block_indices: Set[BlockId] = set(block_ids)
+        self._all_block_indices = frozenset(block_ids)
+        assert len(self._all_block_indices) == num_blocks
+        self._refcounter = RefCounter(
+            all_block_indices=self._free_block_indices)
+        self._create_block = create_block
+        self._block_size = block_size
+        self._cow_tracker = CopyOnWriteTracker(
+            refcounter=self._refcounter.as_readonly(),
+            allocator=self,
+        )
+    def allocate_immutable(self,
+                           prev_block: Optional[Block],
+                           token_ids: List[int],
+                           device: Optional[Device] = None) -> Block:
+        """Allocates a new immutable block with the given token IDs, linked to
+        the previous block.
+        Args:
+            prev_block (Optional[Block]): The previous block in the sequence. If
+                None, then the block to be allocated is the first block in the
+                sequence.
+            token_ids (List[int]): The token IDs to be stored in the new block.
+        Returns:
+            Block: The newly allocated immutable block.
+        """
+        assert device is None
+        block = self.allocate_mutable(prev_block=prev_block)
+        block.append_token_ids(token_ids)
+        return block
+    def allocate_mutable(self,
+                         prev_block: Optional[Block],
+                         device: Optional[Device] = None) -> Block:
+        """Allocates a new mutable block, linked to the previous block.
+        Args:
+            prev_block (Optional[Block]): The previous block in the sequence. If
+                None, then the block to be allocated is the first block in the
+                sequence.
+        Returns:
+            Block: The newly allocated mutable block.
+        """
+        assert device is None
+        block_id = self._allocate_new_block_id()
+        return self._create_block(
+            prev_block=prev_block,
+            token_ids=[],
+            block_id=block_id,
+            block_size=self._block_size,
+            allocator=self,
+        )
+    def free(self, block: Block) -> None:
+        assert block.block_id is not None
+        self._free_block_id(block.block_id)
+        # Mark the block as having no allocation.
+        block.block_id = None
+    def fork(self, last_block: Block) -> List[Block]:
+        """Creates a new sequence of blocks that shares the same underlying
+        memory as the original sequence.
+        Args:
+            last_block (Block): The last block in the original sequence.
+        Returns:
+            List[Block]: The new sequence of blocks that shares the same memory
+                as the original sequence.
+        """
+        source_blocks = get_all_blocks_recursively(last_block)
+        forked_blocks = []
+        prev_block = None
+        for block in source_blocks:
+            # Increment refcount for each block.
+            assert block.block_id is not None
+            refcount = self._refcounter.incr(block.block_id)
+            assert refcount != 1, "can't fork free'd block"
+            forked_blocks.append(
+                self._create_block(
+                    prev_block=prev_block,
+                    token_ids=block.token_ids,
+                    block_id=block.block_id,
+                    block_size=self._block_size,
+                    allocator=self,
+                ))
+            prev_block = forked_blocks[-1]
+        return forked_blocks
+    def get_num_free_blocks(self) -> int:
+        return len(self._free_block_indices)
+    def get_num_total_blocks(self) -> int:
+        return len(self._all_block_indices)
+    def _allocate_new_block_id(self) -> BlockId:
+        if not self._free_block_indices:
+            raise BlockAllocator.NoFreeBlocksError()
+        block_id = next(iter(self._free_block_indices))
+        self._refcounter.incr(block_id)
+        self._free_block_indices.remove(block_id)
+        return block_id
+    def _free_block_id(self, block_id: BlockId) -> None:
+        refcount = self._refcounter.decr(block_id)
+        if refcount == 0:
+            self._free_block_indices.add(block_id)
+    @property
+    def refcounter(self):
+        return self._refcounter
+    @property
+    def all_block_ids(self) -> FrozenSet[int]:
+        return self._all_block_indices
+    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
+        """Performs a copy-on-write operation on the given block if it is not
+        appendable.
+        Args:
+            block (Block): The block to check for copy-on-write.
+        Returns:
+            Optional[BlockId]: The block index of the new block if a copy-on
+                -write operation was performed, or the original block index if
+                no copy-on-write was necessary.
+        """
+        return self._cow_tracker.cow_block_if_not_appendable(block)
+    def clear_copy_on_writes(self) -> Dict[BlockId, List[BlockId]]:
+        """Returns the copy-on-write source->destination mapping and clears it.
+        Returns:
+            Dict[BlockId, List[BlockId]]: A dictionary mapping source
+                block indices to lists of destination block indices.
+        """
+        return self._cow_tracker.clear_cows()
+    def mark_blocks_as_accessed(self, block_ids: List[int],
+                                now: float) -> None:
+        """Mark blocks as accessed, used in prefix caching.
+        Since the naive allocator does not implement prefix caching, we do
+        nothing.
+        """
+        pass
+    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
+        """Mark blocks as computed, used in prefix caching.
+        Since the naive allocator does not implement prefix caching, we do
+        nothing.
+        """
+        pass
+    def get_common_computed_block_ids(
+            self, seq_block_ids: List[List[int]]) -> List[int]:
+        """Determine blocks that can be skipped in prefill.
+        Since the naive allocator does not support prefix caching, always return
+        an empty list.
+        """
+        return []
+    def promote_to_immutable_block(self, block: Block) -> BlockId:
+        raise NotImplementedError
+class NaiveBlock(Block):
+    """An implementation of the Block class that does not support prefix
+    caching.
+    The NaiveBlock class represents a block of token IDs with a fixed size. It
+    provides methods for appending token IDs to the block and manages copy-on
+    -write operations when necessary.
+    Args:
+        prev_block (Block): The previous block in the sequence.
+        token_ids (List[int]): The initial token IDs to be stored in the block.
+        block_size (int): The maximum number of token IDs that can be stored in
+            the block.
+        allocator (BlockAllocator): The block allocator associated with this
+            block.
+        block_id (Optional[int], optional): The physical block index
+            of this block. Defaults to None, which means no allocation has been
+            made.
+        _cow_target (Optional[Block], optional): The copy-on-write target block.
+            If not provided, it defaults to self.
+    """
+    def __init__(self,
+                 prev_block: Optional[Block],
+                 token_ids: List[int],
+                 block_size: int,
+                 allocator: BlockAllocator,
+                 block_id: Optional[int] = None,
+                 _cow_target: Optional[Block] = None):
+        self._token_ids: List[int] = []
+        self._block_size = block_size
+        self._prev_block = prev_block
+        self._block_id = block_id
+        self._allocator = allocator
+        self._cow_target = _cow_target if _cow_target is not None else self
+        self._append_token_ids_no_cow(token_ids)
+    def append_token_ids(self, token_ids: List[int]) -> None:
+        """Appends the given token IDs to the block, instructing the allocator
+        to perform a copy-on-write if necessary.
+        Args:
+            token_ids (List[int]): The token IDs to be appended to the block.
+        """
+        self._append_token_ids_no_cow(token_ids)
+        if self._block_id is not None:
+            self._block_id = (self._allocator.cow_block_if_not_appendable(
+                self._cow_target))
+    def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
+        assert self.num_empty_slots >= len(token_ids)
+        self._token_ids.extend(token_ids)
+    @property
+    def computed(self) -> bool:
+        raise NotImplementedError
+    @computed.setter
+    def computed(self, value) -> None:
+        raise NotImplementedError
+    @property
+    def last_accessed(self) -> float:
+        raise NotImplementedError
+    @last_accessed.setter
+    def last_accessed(self, last_accessed_ts: float):
+        raise NotImplementedError
+    @property
+    def block_id(self) -> Optional[int]:
+        return self._block_id
+    @block_id.setter
+    def block_id(self, value: Optional[int]) -> None:
+        self._block_id = value
+    @property
+    def is_full(self) -> bool:
+        return self.num_empty_slots == 0
+    @property
+    def num_empty_slots(self) -> int:
+        return self._block_size - len(self._token_ids)
+    @property
+    def token_ids(self) -> List[int]:
+        return self._token_ids
+    @property
+    def block_size(self) -> int:
+        return self._block_size
+    @property
+    def prev_block(self) -> Optional["Block"]:
+        return self._prev_block
+    @property
+    def content_hash(self) -> Optional[int]:
+        return None