PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103) hide show

vllm/model_executor/guided_decoding/__init__.py ADDED Viewed

@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from vllm.logger import init_logger
+from vllm.model_executor.guided_decoding.utils import (
+    convert_lark_to_gbnf, grammar_is_likely_lark,
+    has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features)
+from vllm.reasoning import ReasoningParserManager
+if TYPE_CHECKING:
+    from transformers import PreTrainedTokenizer
+    from vllm.config import ModelConfig
+    from vllm.logits_process import LogitsProcessor
+    from vllm.sampling_params import GuidedDecodingParams
+logger = init_logger(__name__)
+def maybe_backend_fallback(
+        guided_params: GuidedDecodingParams) -> GuidedDecodingParams:
+    def fallback_or_error(guided_params: GuidedDecodingParams, message: str,
+                          fallback: str) -> None:
+        """Change the backend to the specified fallback with a warning log,
+        or raise a ValueError if the `no-fallback` option is specified."""
+        if guided_params.no_fallback():
+            raise ValueError(message)
+        logger.warning("%s Falling back to use %s instead.", message, fallback)
+        guided_params.backend = fallback
+    # `auto` was added for V1 to explicitly declare a mode that has fallbacks
+    # in place. If that is specified with V0, treat it as `xgrammar`, as we have
+    # fallbacks enabled for that and it is the V0 default.
+    if guided_params.backend == "auto":
+        guided_params.backend = "xgrammar"
+    # lm-format-enforce doesn't support grammar, fallback to xgrammar
+    if guided_params.backend_name == "lm-format-enforcer":
+        if guided_params.grammar is not None:
+            fallback_or_error(
+                guided_params,
+                "lm-format-enforcer does not support grammar guided decoding.",
+                "xgrammar")
+        # lm-format-enforcer doesn't support some JSON schema features
+        elif (guided_params.json is not None
+              and has_lmf_unsupported_json_features(guided_params.json)):
+            fallback_or_error(
+                guided_params,
+                "lm-format-enforcer does not support advanced JSON schema "
+                "features like patterns or numeric ranges.", "outlines")
+    if guided_params.backend_name == "xgrammar":
+        from vllm.model_executor.guided_decoding.xgrammar_decoding import (
+            xgr_installed)
+        # xgrammar doesn't support some JSON schema features
+        if (guided_params.json is not None and
+                has_xgrammar_unsupported_json_features(guided_params.json)):
+            fallback_or_error(
+                guided_params,
+                "xgrammar does not support advanced JSON schema features like "
+                "string length, item limits, or property bounds.", "outlines")
+        # xgrammar only supports GBNF grammars, so we must convert Lark.
+        # We must check if the grammar is likely Lark and if that
+        # grammar is convertible to GBNF
+        elif (guided_params.grammar is not None
+              and grammar_is_likely_lark(guided_params.grammar)):
+            try:
+                convert_lark_to_gbnf(guided_params.grammar)
+            except Exception:
+                fallback_or_error(
+                    guided_params,
+                    "xgrammar does not support Lark grammars and the "
+                    "grammar failed to convert to GBNF.", "outlines")
+        # If the xgrammar module cannot be imported successfully,
+        # we should still allow users to use guided decoding with a fallback.
+        elif not xgr_installed:
+            fallback_or_error(
+                guided_params,
+                "xgrammar module cannot be imported successfully.", "outlines")
+    if (guided_params.backend_name == "outlines"
+            and guided_params.json_object is not None):
+        # outlines doesn't support json_object, fallback to guidance
+        fallback_or_error(guided_params,
+                          "outlines does not support json_object.", "guidance")
+    return guided_params
+async def get_guided_decoding_logits_processor(
+        guided_params: GuidedDecodingParams,
+        tokenizer: PreTrainedTokenizer,
+        model_config: ModelConfig,
+        reasoning_backend: str | None = None) -> LogitsProcessor | None:
+    reasoner = None
+    if reasoning_backend is not None:
+        reasoner_class = ReasoningParserManager.get_reasoning_parser(
+            reasoning_backend)
+        reasoner = reasoner_class(tokenizer)
+    guided_params = maybe_backend_fallback(guided_params)
+    # CFG grammar not supported by LMFE, so we use outlines instead
+    if guided_params.backend_name == 'outlines':
+        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
+        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
+            get_outlines_guided_decoding_logits_processor)
+        return await get_outlines_guided_decoding_logits_processor(
+            guided_params, tokenizer, reasoner)
+    if guided_params.backend == 'lm-format-enforcer':
+        from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (  # noqa
+            get_local_lm_format_enforcer_guided_decoding_logits_processor)
+        return get_local_lm_format_enforcer_guided_decoding_logits_processor(
+            guided_params, tokenizer)
+    if guided_params.backend_name == 'xgrammar':
+        from vllm.model_executor.guided_decoding.xgrammar_decoding import (  # noqa
+            get_local_xgrammar_guided_decoding_logits_processor)
+        return get_local_xgrammar_guided_decoding_logits_processor(
+            guided_params, tokenizer, model_config, reasoner)
+    if guided_params.backend_name == 'guidance':
+        from vllm.model_executor.guided_decoding.guidance_decoding import (
+            get_local_guidance_guided_decoding_logits_processor)
+        return get_local_guidance_guided_decoding_logits_processor(
+            guided_params, tokenizer)
+    raise ValueError(
+        f"Unknown guided decoding backend '{guided_params.backend}'. "
+        "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar', 'guidance'"
+    )
+def get_local_guided_decoding_logits_processor(
+        guided_params: GuidedDecodingParams,
+        tokenizer: PreTrainedTokenizer,
+        model_config: ModelConfig,
+        reasoning_backend: str | None = None) -> LogitsProcessor | None:
+    guided_params = maybe_backend_fallback(guided_params)
+    reasoner = None
+    if reasoning_backend is not None:
+        reasoner_class = ReasoningParserManager.get_reasoning_parser(
+            reasoning_backend)
+        reasoner = reasoner_class(tokenizer)
+    # CFG grammar not supported by LMFE, so we use outlines instead
+    if guided_params.backend_name == 'outlines':
+        # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193
+        from vllm.model_executor.guided_decoding.outlines_decoding import (  # noqa
+            get_local_outlines_guided_decoding_logits_processor)
+        return get_local_outlines_guided_decoding_logits_processor(
+            guided_params, tokenizer, reasoner)
+    if guided_params.backend_name == 'lm-format-enforcer':
+        from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (  # noqa
+            get_local_lm_format_enforcer_guided_decoding_logits_processor)
+        return get_local_lm_format_enforcer_guided_decoding_logits_processor(
+            guided_params, tokenizer)
+    if guided_params.backend_name == 'xgrammar':
+        from vllm.model_executor.guided_decoding.xgrammar_decoding import (  # noqa
+            get_local_xgrammar_guided_decoding_logits_processor)
+        return get_local_xgrammar_guided_decoding_logits_processor(
+            guided_params, tokenizer, model_config, reasoner)
+    if guided_params.backend_name == 'guidance':
+        from vllm.model_executor.guided_decoding.guidance_decoding import (
+            get_local_guidance_guided_decoding_logits_processor)
+        return get_local_guidance_guided_decoding_logits_processor(
+            guided_params, tokenizer)
+    raise ValueError(
+        f"Unknown guided decoding backend '{guided_params.backend}'. "
+        "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar', 'guidance'"
+    )

vllm/model_executor/guided_decoding/guidance_decoding.py ADDED Viewed

@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+import json
+from re import escape as regex_escape
+import llguidance
+from transformers import PreTrainedTokenizerBase
+from vllm.model_executor.guided_decoding.guidance_logits_processors import (
+    GuidanceLogitsProcessor)
+from vllm.sampling_params import GuidedDecodingParams
+from vllm.v1.structured_output.backend_guidance import (
+    process_for_additional_properties)
+def get_local_guidance_guided_decoding_logits_processor(
+        guided_params: GuidedDecodingParams,
+        tokenizer: PreTrainedTokenizerBase) -> GuidanceLogitsProcessor:
+    """
+    Given an OpenAI-compatible request, check for guided decoding parameters
+    and get the necessary logits processor for the given guide.
+    """
+    grm = ""
+    any_whitespace = 'disable-any-whitespace' not in \
+        guided_params.backend_options()
+    if (guide_json := guided_params.json) is not None:
+        # Optionally set additionalProperties to False at the top-level
+        # By default, other backends do not allow additional top-level
+        # properties, so this makes guidance more similar to other backends
+        if 'no-additional-properties' in guided_params.backend_options():
+            if not isinstance(guide_json, str):
+                guide_json = json.dumps(guide_json)
+            guide_json = process_for_additional_properties(guide_json)
+        grm = llguidance.LLMatcher.grammar_from_json_schema(
+            guide_json,
+            overrides={"whitespace_pattern": guided_params.whitespace_pattern},
+            defaults={
+                "whitespace_flexible": any_whitespace,
+            })
+    elif guided_params.json_object:
+        grm = llguidance.LLMatcher.grammar_from_json_schema(
+            '{"type": "object"}',
+            overrides={"whitespace_pattern": guided_params.whitespace_pattern},
+            defaults={
+                "whitespace_flexible": any_whitespace,
+            })
+    elif guided_params.regex:
+        grm = llguidance.grammar_from("regex", guided_params.regex)
+    elif guided_params.choice:
+        # choice just uses regex
+        choices = (regex_escape(str(choice))
+                   for choice in guided_params.choice)
+        choices_regex = "(" + "|".join(choices) + ")"
+        grm = llguidance.grammar_from("regex", choices_regex)
+    elif guided_params.grammar:
+        # this supports Lark and GBNF
+        grm = llguidance.grammar_from("grammar", guided_params.grammar)
+    if grm:
+        return GuidanceLogitsProcessor(grm, tokenizer)
+    raise ValueError("Unknown guided decoding mode")

vllm/model_executor/guided_decoding/guidance_logits_processors.py ADDED Viewed

@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import Any, List
+import llguidance
+import llguidance.hf
+import llguidance.torch
+import torch
+from transformers import PreTrainedTokenizerBase
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+class GuidanceLogitsProcessor:
+    """Base Guidance Logits Processor"""
+    cached_tokenizers: dict[str, Any] = {}
+    def __init__(
+        self,
+        grammar: str,
+        tokenizer: PreTrainedTokenizerBase,
+    ) -> None:
+        """Base Guidance Logits Processor
+        Args:
+            grammar (str)
+                grammar to guide the generation
+            tokenizer (PreTrainedTokenizerBase)
+                model's tokenizer
+        """
+        self.grammar = grammar
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer.name_or_path
+        self.new_sampling = False
+        self.initialized = False
+    def _initialize(self):
+        if self.initialized:
+            return
+        ll_tokenizer = self.cached_tokenizers.get(self.tokenizer.name_or_path,
+                                                  None)
+        if ll_tokenizer is None:
+            ll_tokenizer = llguidance.hf.from_tokenizer(self.tokenizer, None)
+            self.cached_tokenizers[self.tokenizer.name_or_path] = ll_tokenizer
+        self.ll_tokenizer = ll_tokenizer
+        self.ll_matcher = llguidance.LLMatcher(
+            self.ll_tokenizer,
+            self.grammar,
+            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
+        )
+        # create reusable bitmask
+        self.bitmask = llguidance.torch.allocate_token_bitmask(
+            1, self.ll_tokenizer.vocab_size)
+        self.initialized = True
+    def __call__(
+        self,
+        input_ids: List[int],
+        scores: torch.Tensor,
+    ) -> torch.Tensor:
+        # we initialize the guidance model here
+        # to avoid pickling ll_tokenizer and ll_interpreter
+        self._initialize()
+        if self.new_sampling and len(input_ids) > 0:
+            self.ll_matcher.consume_token(input_ids[-1])
+            err = self.ll_matcher.get_error()
+            if err:
+                logger.warning("Error in LLMatcher: %s", err)
+        llguidance.torch.fill_next_token_bitmask(self.ll_matcher, self.bitmask,
+                                                 0)
+        llguidance.torch.apply_token_bitmask_inplace(
+            scores, self.bitmask.to(scores.device))
+        self.new_sampling = True
+        return scores

vllm/model_executor/guided_decoding/guided_fields.py ADDED Viewed

@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+from dataclasses import dataclass
+from typing import Dict, List, Optional, TypedDict, Union
+from pydantic import BaseModel
+# These classes are deprecated, see SamplingParams
+class LLMGuidedOptions(TypedDict, total=False):
+    """Dictionary form of the guided decoding options.
+    Declared with `total=False`, so every key is optional.
+    """
+    # Accepts a dict, a pydantic `BaseModel`, or a string.
+    guided_json: Union[Dict, BaseModel, str]
+    guided_regex: str
+    guided_choice: List[str]
+    guided_grammar: str
+    guided_decoding_backend: str
+    guided_whitespace_pattern: str
+    guided_json_object: bool
+@dataclass
+class GuidedDecodingRequest:
+    """One of the fields will be used to retrieve the logit processor."""
+    guided_json: Optional[Union[Dict, BaseModel, str]] = None
+    guided_regex: Optional[str] = None
+    guided_choice: Optional[List[str]] = None
+    guided_grammar: Optional[str] = None
+    guided_decoding_backend: Optional[str] = None
+    guided_whitespace_pattern: Optional[str] = None
+    guided_json_object: Optional[bool] = None
+    structural_tag: Optional[str] = None
+    def __post_init__(self):
+        """Validate that some fields are mutually exclusive."""
+        guide_count = sum(x is not None
+                          for x in (self.guided_json, self.guided_regex,
+                                    self.guided_choice, self.guided_grammar,
+                                    self.guided_json_object,
+                                    self.structural_tag))
+        if guide_count > 1:
+            raise ValueError(
+                "You can only use one kind of guided decoding but multiple are "
+                f"specified: {self.__dict__}")

vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py ADDED Viewed

@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+from functools import lru_cache
+from json import loads as json_loads
+from typing import Optional, Union
+from lmformatenforcer import (CharacterLevelParser, JsonSchemaParser,
+                              RegexParser, StringParser,
+                              TokenEnforcerTokenizerData, UnionParser)
+from lmformatenforcer.integrations.vllm import (
+    build_vllm_logits_processor, build_vllm_token_enforcer_tokenizer_data)
+from transformers import PreTrainedTokenizerBase
+from vllm.logits_process import LogitsProcessor
+from vllm.sampling_params import GuidedDecodingParams
+def get_local_lm_format_enforcer_guided_decoding_logits_processor(
+        guided_params: GuidedDecodingParams,
+        tokenizer) -> Optional[LogitsProcessor]:
+    """
+    Given an OpenAI-compatible request, check for guided decoding parameters
+    and get the necessary logits processor for the given guide.
+    We cache logit processors by (guide, tokenizer), and on cache hit
+    we make a shallow copy to reuse the same underlying FSM.
+    """
+    tokenizer_data = _cached_build_vllm_token_enforcer_tokenizer_data(
+        tokenizer)
+    character_level_parser: CharacterLevelParser
+    if guided_params.json:
+        schema_dict = _normalize_json_schema_object(guided_params.json)
+        character_level_parser = JsonSchemaParser(schema_dict)
+    elif guided_params.choice:
+        character_level_parser = UnionParser(
+            [StringParser(choice) for choice in guided_params.choice])
+    elif guided_params.regex:
+        character_level_parser = RegexParser(guided_params.regex)
+    elif guided_params.grammar:
+        # CFG grammar not supported by LMFE
+        raise ValueError("Cannot construct a guided decoding logits processor"
+                         " using the grammar option with the"
+                         " lm_format_enforcer backend.")
+    elif guided_params.json_object:
+        # None means any json object
+        character_level_parser = JsonSchemaParser(None)
+    else:
+        return None
+    logits_processor = build_vllm_logits_processor(tokenizer_data,
+                                                   character_level_parser)
+    return logits_processor
+def _normalize_json_schema_object(schema: Union[str, dict]) -> dict:
+    if isinstance(schema, str):
+        return json_loads(schema)
+    if isinstance(schema, dict):
+        return schema
+    raise AssertionError(f"Unsupported schema type {schema}")
+@lru_cache
+def _cached_build_vllm_token_enforcer_tokenizer_data(
+        tokenizer: PreTrainedTokenizerBase) -> TokenEnforcerTokenizerData:
+    """Return the token-enforcer tokenizer data for `tokenizer`, memoized.
+    `functools.lru_cache` keys the cache on the (hashable) tokenizer
+    argument, so a repeated call with an equal tokenizer reuses the earlier
+    result. The bare decorator uses the default bound of 128 entries.
+    """
+    return build_vllm_token_enforcer_tokenizer_data(tokenizer)

vllm/model_executor/guided_decoding/outlines_decoding.py ADDED Viewed

@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+import concurrent.futures
+import os
+from enum import Enum
+from json import dumps as json_dumps
+from re import escape as regex_escape
+from typing import Optional, Tuple, Union
+from transformers import PreTrainedTokenizerBase
+from vllm.model_executor.guided_decoding.outlines_logits_processors import (
+    CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor)
+from vllm.reasoning import ReasoningParser
+from vllm.sampling_params import GuidedDecodingParams
+class GuidedDecodingMode(Enum):
+    """Kind of guide selected by `_get_guide_and_mode` in this module."""
+    JSON = "json"
+    REGEX = "regex"
+    # Choice guides are turned into a regex alternation by
+    # `_get_guide_and_mode` and are handled by the regex logits processor.
+    CHOICE = "choice"
+    GRAMMAR = "grammar"
+# Lark grammar for JSON, based on
+# https://github.com/outlines-dev/outlines/blob/main/outlines/grammars/json.lark
+# The main difference is that the start rule was changed to
+# `start: object | array`, so scalar values are denied as the root of the
+# JSON. Starting with scalars as the root seems to cause llama to generate
+# without stop.
+JSON_GRAMMAR = r"""
+?start: object | array
+?value: object
+| array
+| UNESCAPED_STRING
+| SIGNED_NUMBER      -> number
+| "true"             -> true
+| "false"            -> false
+| "null"             -> null
+array  : "[" [value ("," value)*] "]"
+object : "{" [pair ("," pair)*] "}"
+pair   : UNESCAPED_STRING ":" value
+%import common.UNESCAPED_STRING
+%import common.SIGNED_NUMBER
+%import common.WS
+%ignore WS
+"""
+# Thread pool used for generating the logits processor FSM. It starts as None
+# and is created on first use by get_outlines_guided_decoding_logits_processor.
+global_thread_pool = None  # used for generating logits processor fsm
+# Upper bound on the number of worker threads in that pool.
+# It's not yet clear that using more provides a benefit, and it could
+# potentially starve other processes on the machine. We'll cap this for now and
+# adjust later if testing proves it to help overcome a bottleneck.
+_MAX_THREADPOOL_WORKERS = 16
+async def get_outlines_guided_decoding_logits_processor(
+    guided_params: GuidedDecodingParams,
+    tokenizer: PreTrainedTokenizerBase,
+    reasoner: Optional[ReasoningParser],
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
+           None]:
+    """
+    Given an OpenAI-compatible request, check for guided decoding parameters
+    and get the necessary logits processor for the given guide.
+    We cache logit processors by (guide, tokenizer), and on cache hit
+    we make a shallow copy to reuse the same underlying FSM.
+    """
+    global global_thread_pool
+    guide, mode = _get_guide_and_mode(guided_params)
+    if not guide or not mode:
+        return None
+    if global_thread_pool is None:
+        max_workers = os.cpu_count() or 2
+        if max_workers > _MAX_THREADPOOL_WORKERS:
+            max_workers = _MAX_THREADPOOL_WORKERS
+        global_thread_pool = concurrent.futures.ThreadPoolExecutor(
+            max_workers=max_workers)
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(global_thread_pool,
+                                      _get_logits_processor, guide, tokenizer,
+                                      mode, guided_params.whitespace_pattern,
+                                      reasoner)
+def get_local_outlines_guided_decoding_logits_processor(
+    guided_params: GuidedDecodingParams,
+    tokenizer: PreTrainedTokenizerBase,
+    reasoner: Optional[ReasoningParser],
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
+           None]:
+    """
+    Given an OpenAI-compatible request, check for guided decoding parameters
+    and get the necessary logits processor for the given guide.
+    We cache logit processors by (guide, tokenizer), and on cache hit
+    we make a shallow copy to reuse the same underlying FSM.
+    """
+    guide, mode = _get_guide_and_mode(guided_params)
+    if not guide or not mode:
+        return None
+    return _get_logits_processor(guide, tokenizer, mode,
+                                 guided_params.whitespace_pattern, reasoner)
+def _get_guide_and_mode(
+    guided_params: GuidedDecodingParams
+) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:
+    if guided_params.json:
+        if isinstance(guided_params.json, dict):
+            # turn dict into hashable string
+            json = json_dumps(guided_params.json)
+        else:
+            json = guided_params.json
+        return json, GuidedDecodingMode.JSON
+    elif guided_params.regex:
+        return guided_params.regex, GuidedDecodingMode.REGEX
+    elif guided_params.choice:
+        # choice just uses regex
+        choices = [
+            regex_escape(str(choice)) for choice in guided_params.choice
+        ]
+        choices_regex = "(" + "|".join(choices) + ")"
+        return choices_regex, GuidedDecodingMode.CHOICE
+    elif guided_params.grammar:
+        return guided_params.grammar, GuidedDecodingMode.GRAMMAR
+    elif guided_params.json_object:
+        return JSON_GRAMMAR, GuidedDecodingMode.GRAMMAR
+    else:
+        return None, None
+def _get_logits_processor(
+    guide: str,
+    tokenizer: PreTrainedTokenizerBase,
+    mode: GuidedDecodingMode,
+    whitespace_pattern: Union[str, None],
+    reasoner: Optional[ReasoningParser],
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]:
+    if mode == GuidedDecodingMode.JSON:
+        return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern,
+                                   reasoner)
+    elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:
+        return RegexLogitsProcessor(guide, tokenizer, reasoner)
+    elif mode == GuidedDecodingMode.GRAMMAR:
+        return CFGLogitsProcessor(guide, tokenizer, reasoner)
+    else:
+        raise ValueError(f"Unknown guided decoding mode {mode}")