xinference 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of xinference has been flagged as possibly problematic.
Files changed (76)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +25 -6
  3. xinference/client/oscar/actor_client.py +4 -3
  4. xinference/client/restful/restful_client.py +8 -2
  5. xinference/core/supervisor.py +16 -0
  6. xinference/model/embedding/core.py +1 -2
  7. xinference/model/llm/__init__.py +0 -6
  8. xinference/model/llm/ggml/llamacpp.py +2 -10
  9. xinference/model/llm/llm_family.json +244 -7
  10. xinference/model/llm/llm_family.py +15 -0
  11. xinference/model/llm/llm_family_modelscope.json +100 -0
  12. xinference/model/llm/pytorch/chatglm.py +2 -0
  13. xinference/model/llm/pytorch/core.py +22 -28
  14. xinference/model/llm/pytorch/internlm2.py +2 -0
  15. xinference/model/llm/pytorch/qwen_vl.py +2 -0
  16. xinference/model/llm/pytorch/yi_vl.py +4 -2
  17. xinference/model/llm/utils.py +42 -4
  18. xinference/model/llm/vllm/core.py +51 -6
  19. xinference/model/rerank/core.py +3 -0
  20. xinference/thirdparty/omnilmm/chat.py +1 -1
  21. xinference/types.py +15 -19
  22. xinference/web/ui/build/asset-manifest.json +3 -3
  23. xinference/web/ui/build/index.html +1 -1
  24. xinference/web/ui/build/static/js/main.76ef2b17.js +3 -0
  25. xinference/web/ui/build/static/js/main.76ef2b17.js.map +1 -0
  26. xinference/web/ui/node_modules/.cache/babel-loader/15e2cf8cd8d0989719b6349428ff576f9009ff4c2dcc52378be0bd938e82495e.json +1 -0
  27. xinference/web/ui/node_modules/.cache/babel-loader/35d0e4a317e5582cbb79d901302e9d706520ac53f8a734c2fd8bfde6eb5a4f02.json +1 -0
  28. xinference/web/ui/node_modules/.cache/babel-loader/3c2f277c93c5f1638e08db38df0d0fb4e58d1c5571aea03241a5c04ff4094704.json +1 -0
  29. xinference/web/ui/node_modules/.cache/babel-loader/3fa1f69162f9c6dc0f6a6e21b64d49d6b8e6fa8dfa59a82cf829931c5f97d99f.json +1 -0
  30. xinference/web/ui/node_modules/.cache/babel-loader/44774c783428f952d8e2e4ad0998a9c5bc16a57cd9c68b7c5ff18aaa5a41d65c.json +1 -0
  31. xinference/web/ui/node_modules/.cache/babel-loader/5393569d846332075b93b55656716a34f50e0a8c970be789502d7e6c49755fd7.json +1 -0
  32. xinference/web/ui/node_modules/.cache/babel-loader/59ce49eae0f486af4c5034d4d2f9ca77c3ec3a32ecc560085caf5ef482b5f4c9.json +1 -0
  33. xinference/web/ui/node_modules/.cache/babel-loader/62e257ed9016471035fa1a7da57c9e2a4250974ed566b4d1295873d747c68eb2.json +1 -0
  34. xinference/web/ui/node_modules/.cache/babel-loader/63a4c48f0326d071c7772c46598215c006ae41fd3d4ff3577fe717de66ad6e89.json +1 -0
  35. xinference/web/ui/node_modules/.cache/babel-loader/b9cbcb6d77ba21b22c6950b6fb5b305d23c19cf747f99f7d48b6b046f8f7b1b0.json +1 -0
  36. xinference/web/ui/node_modules/.cache/babel-loader/d06a96a3c9c32e42689094aa3aaad41c8125894e956b8f84a70fadce6e3f65b3.json +1 -0
  37. xinference/web/ui/node_modules/.cache/babel-loader/d076fd56cf3b15ed2433e3744b98c6b4e4410a19903d1db4de5bba0e1a1b3347.json +1 -0
  38. xinference/web/ui/node_modules/.cache/babel-loader/daad8131d91134f6d7aef895a0c9c32e1cb928277cb5aa66c01028126d215be0.json +1 -0
  39. xinference/web/ui/node_modules/.cache/babel-loader/de0299226173b0662b573f49e3992220f6611947073bd66ac079728a8bc8837d.json +1 -0
  40. xinference/web/ui/node_modules/.cache/babel-loader/e606671420d2937102c3c34b4b04056c11736408c1d3347b8cf42dfe61fb394b.json +1 -0
  41. xinference/web/ui/node_modules/.cache/babel-loader/e6eccc9aa641e7da833492e27846dc965f9750281420977dc84654ca6ed221e4.json +1 -0
  42. xinference/web/ui/node_modules/.cache/babel-loader/e9b52d171223bb59fb918316297a051cdfd42dd453e8260fd918e90bc0a4ebdf.json +1 -0
  43. xinference/web/ui/node_modules/.cache/babel-loader/f16aec63602a77bd561d0e67fa00b76469ac54b8033754bba114ec5eb3257964.json +1 -0
  44. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/METADATA +10 -10
  45. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/RECORD +50 -56
  46. xinference/model/llm/ggml/ctransformers.py +0 -281
  47. xinference/model/llm/ggml/ctransformers_util.py +0 -161
  48. xinference/web/ui/build/static/js/main.98516614.js +0 -3
  49. xinference/web/ui/build/static/js/main.98516614.js.map +0 -1
  50. xinference/web/ui/node_modules/.cache/babel-loader/0bd70b1ecf307e2681318e864f4692305b6350c8683863007f4caf2f9ac33b6e.json +0 -1
  51. xinference/web/ui/node_modules/.cache/babel-loader/0db651c046ef908f45cde73af0dbea0a797d3e35bb57f4a0863b481502103a64.json +0 -1
  52. xinference/web/ui/node_modules/.cache/babel-loader/139969fd25258eb7decc9505f30b779089bba50c402bb5c663008477c7bff73b.json +0 -1
  53. xinference/web/ui/node_modules/.cache/babel-loader/18e5d5422e2464abf4a3e6d38164570e2e426e0a921e9a2628bbae81b18da353.json +0 -1
  54. xinference/web/ui/node_modules/.cache/babel-loader/3d93bd9a74a1ab0cec85af40f9baa5f6a8e7384b9e18c409b95a81a7b45bb7e2.json +0 -1
  55. xinference/web/ui/node_modules/.cache/babel-loader/3e055de705e397e1d413d7f429589b1a98dd78ef378b97f0cdb462c5f2487d5e.json +0 -1
  56. xinference/web/ui/node_modules/.cache/babel-loader/3f357ab57b8e7fade54c667f0e0ebf2787566f72bfdca0fea14e395b5c203753.json +0 -1
  57. xinference/web/ui/node_modules/.cache/babel-loader/4fd24800544873512b540544ae54601240a5bfefd9105ff647855c64f8ad828f.json +0 -1
  58. xinference/web/ui/node_modules/.cache/babel-loader/52aa27272b4b9968f62666262b47661cb1992336a2aff3b13994cc36877b3ec3.json +0 -1
  59. xinference/web/ui/node_modules/.cache/babel-loader/60c4b98d8ea7479fb0c94cfd19c8128f17bd7e27a1e73e6dd9adf6e9d88d18eb.json +0 -1
  60. xinference/web/ui/node_modules/.cache/babel-loader/7e094845f611802b024b57439cbf911038169d06cdf6c34a72a7277f35aa71a4.json +0 -1
  61. xinference/web/ui/node_modules/.cache/babel-loader/95c8cc049fadd23085d8623e1d43d70b614a4e52217676f186a417dca894aa09.json +0 -1
  62. xinference/web/ui/node_modules/.cache/babel-loader/98b7ef307f436affe13d75a4f265b27e828ccc2b10ffae6513abe2681bc11971.json +0 -1
  63. xinference/web/ui/node_modules/.cache/babel-loader/9d7c49815d97539207e5aab2fb967591b5fed7791218a0762539efc9491f36af.json +0 -1
  64. xinference/web/ui/node_modules/.cache/babel-loader/a8070ce4b780b4a044218536e158a9e7192a6c80ff593fdc126fee43f46296b5.json +0 -1
  65. xinference/web/ui/node_modules/.cache/babel-loader/b400cfc9db57fa6c70cd2bad055b73c5079fde0ed37974009d898083f6af8cd8.json +0 -1
  66. xinference/web/ui/node_modules/.cache/babel-loader/bd04667474fd9cac2983b03725c218908a6cc0ee9128a5953cd00d26d4877f60.json +0 -1
  67. xinference/web/ui/node_modules/.cache/babel-loader/c230a727b8f68f0e62616a75e14a3d33026dc4164f2e325a9a8072d733850edb.json +0 -1
  68. xinference/web/ui/node_modules/.cache/babel-loader/d0d0b591d9adaf42b83ad6633f8b7c118541a4b80ea957c303d3bf9b86fbad0a.json +0 -1
  69. xinference/web/ui/node_modules/.cache/babel-loader/d44a6eb6106e09082b691a315c9f6ce17fcfe25beb7547810e0d271ce3301cd2.json +0 -1
  70. xinference/web/ui/node_modules/.cache/babel-loader/e1d9b2ae4e1248658704bc6bfc5d6160dcd1a9e771ea4ae8c1fed0aaddeedd29.json +0 -1
  71. xinference/web/ui/node_modules/.cache/babel-loader/fe5db70859503a54cbe71f9637e5a314cda88b1f0eecb733b6e6f837697db1ef.json +0 -1
  72. /xinference/web/ui/build/static/js/{main.98516614.js.LICENSE.txt → main.76ef2b17.js.LICENSE.txt} +0 -0
  73. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/LICENSE +0 -0
  74. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/WHEEL +0 -0
  75. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/entry_points.txt +0 -0
  76. {xinference-0.10.0.dist-info → xinference-0.10.1.dist-info}/top_level.txt +0 -0
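
The headline change in this release is the removal of the ctransformers backend (files 46 and 47, reproduced in full below); GGML/GGUF models continue to be served by the remaining llama.cpp backend (file 8). As a rough sketch of the replacement path, something like the following should work against a 0.10.1 server; the host, model name, size, and quantization are illustrative placeholders, not values taken from this diff.

from xinference.client import RESTfulClient

# A minimal sketch, assuming a local Xinference 0.10.1 server on the
# default port; the model name and quantization are placeholders.
client = RESTfulClient("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="llama-2-chat",
    model_format="ggufv2",  # served via llama.cpp now that ctransformers is gone
    model_size_in_billions=7,
    quantization="Q4_K_M",
)
model = client.get_model(model_uid)
print(model.generate("Hello,", generate_config={"max_tokens": 32}))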
xinference/model/llm/ggml/ctransformers.py (deleted)
@@ -1,281 +0,0 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import logging
- import os
- from typing import TYPE_CHECKING, Iterator, Optional, Sequence, TypedDict, Union
-
- if TYPE_CHECKING:
-     from ctransformers import AutoConfig
-
- from ....types import Completion, CompletionChunk, CreateCompletionCTransformers
- from ..core import LLM
- from ..llm_family import LLMFamilyV1, LLMSpecV1
- from .ctransformers_util import generate_stream
-
- logger = logging.getLogger(__name__)
-
- # all supported models for Ctransformers with their model type.
- # Please Strictly follows this name format when inputting new model to model_family.
- MODEL_TYPE_FOR_CTRANSFORMERS = {
-     "gpt-2": "gpt2",
-     "gpt-j": "gptj",
-     "gpt4all-j": "gptj",
-     "gpt-neox": "gpt_neox",
-     "stablelm": "gpt_neox",
-     "llama": "llama",
-     "llama-2": "llama",
-     "mpt": "mpt",
-     "dolly-v2": "dolly-v2",
-     "replit": "replit",
-     "starcoder": "starcoder",
-     "starchat": "starcoder",
-     "falcon": "falcon",
- }
-
- # these two constants subjects to change for future development and ctransformers updates.
- CTRANSFORMERS_SUPPORTED_MODEL = ["starcoder", "gpt-2"]
-
- CTRANSFORMERS_GPU_SUPPORT = ["llama", "llama-2", "mpt", "falcon"]
-
- SIZE_TO_GPU_LAYERS = {
-     3: 26,
-     7: 32,
-     13: 40,
-     30: 60,
-     65: 80,
- }
-
-
- class CtransformersModelConfig(TypedDict, total=False):
-     n_ctx: int
-     n_gpu_layers: int
-
-
- class CtransformersGenerateConfig(TypedDict, total=False):
-     max_tokens: Optional[int]
-     top_k: Optional[int]
-     top_p: Optional[float]
-     temperature: Optional[float]
-     repetition_penalty: Optional[float]
-     last_n_tokens: Optional[int]
-     seed: Optional[int]
-     batch_size: Optional[int]
-     threads: Optional[int]
-     stop: Optional[Sequence[str]]
-     stream: Optional[bool]
-     reset: Optional[bool]
-
-
- class CtransformersModel(LLM):
-     def __init__(
-         self,
-         model_uid: str,
-         model_family: "LLMFamilyV1",
-         model_spec: "LLMSpecV1",
-         quantization: str,
-         model_path: str,
-         ctransformers_model_config: Optional[CtransformersModelConfig],
-     ):
-         super().__init__(model_uid, model_family, model_spec, quantization, model_path)
-
-         self._model_type = None
-         closest_size = min(
-             SIZE_TO_GPU_LAYERS.keys(),
-             key=lambda x: abs(
-                 x - self.handle_model_size(model_spec.model_size_in_billions)
-             ),
-         )
-
-         self._model_family = model_family
-         self._model_uid = model_uid
-         self._llm = None
-
-         self._gpu_layers = SIZE_TO_GPU_LAYERS[closest_size]
-         self._ctransformer_model_config = self._sanitize_model_config(
-             model_path, ctransformers_model_config
-         )
-
-     def _sanitize_model_config(
-         self, model_path, ctransformers_model_config: Optional[CtransformersModelConfig]
-     ) -> "AutoConfig":
-         try:
-             from ctransformers import AutoConfig, Config
-         except ImportError:
-             error_message = (
-                 "Failed to import module 'ctransformers - AutoConfig and Config'"
-             )
-
-             installation_guide = [
-                 f"Please make sure 'ctransformers' is installed.",
-                 f"You can install it by checking out the repository for command:"
-                 f"https://github.com/marella/ctransformers",
-             ]
-
-             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-         # if the model have customized config, we update it.
-         model_config_ret = Config()
-         potential_gpu_layers = None
-         if ctransformers_model_config:
-             potential_context_length = ctransformers_model_config.pop("n_ctx", None)
-             potential_gpu_layers = ctransformers_model_config.pop("n_gpu_layers", None)
-
-             model_config_ret.context_length = potential_context_length
-             model_config_ret.gpu_layers = potential_gpu_layers
-
-         # if user does not define gpu layers, we have to set it with our system if applicable.
-         if potential_gpu_layers is None:
-             if self._model_family.model_name not in CTRANSFORMERS_GPU_SUPPORT:
-                 model_config_ret.gpu_layers = -1
-             elif self._is_darwin_and_apple_silicon():
-                 model_config_ret.gpu_layers = 1
-             elif self._has_cuda_device():
-                 model_config_ret.gpu_layers = self._gpu_layers
-
-         return AutoConfig(model_config_ret)
-
-     def _sanitize_generate_config(
-         self,
-         generate_config: Optional[CtransformersGenerateConfig],
-     ) -> CtransformersGenerateConfig:
-         # if the input config is not None, we try to copy the selected attributes to the ctransformersGenerateConfig.
-         if generate_config is None:
-             generate_config = CtransformersGenerateConfig(
-                 **CreateCompletionCTransformers().dict()
-             )
-         else:
-             # Validate generate_config and fill default values to the generate config.
-             generate_config = CtransformersGenerateConfig(
-                 **CreateCompletionCTransformers(**generate_config).dict()
-             )
-
-         # for our system, the threads will have to be set to 4
-         # all other parameters, if not specified, will be set to default when generate.
-         generate_config.setdefault("threads", 4)
-
-         return generate_config
-
-     def load(self):
-         try:
-             from ctransformers import AutoModelForCausalLM
-         except ImportError:
-             error_message = "Failed to import module 'ctransformers'"
-
-             installation_guide = [
-                 f"Please make sure 'ctransformers' is installed.",
-                 f"You can install it by checking out the repository for command."
-                 f"https://github.com/marella/ctransformers",
-             ]
-
-             raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-         model_path = os.path.join(
-             self.model_path,
-             self.model_spec.model_file_name_template.format(
-                 quantization=self.quantization
-             ),
-         )
-
-         self._model_type = self._determine_model_type()
-         self._llm = AutoModelForCausalLM.from_pretrained(
-             model_path_or_repo_id=model_path,
-             model_type=self._model_type,
-             config=self._ctransformer_model_config,
-         )
-
-     @classmethod
-     def match(
-         cls, llm_family: LLMFamilyV1, llm_spec: LLMSpecV1, quantization: str
-     ) -> bool:
-         if llm_spec.model_format != "ggmlv3" and llm_spec.model_format != "ggufv2":
-             return False
-         if llm_family.model_name not in CTRANSFORMERS_SUPPORTED_MODEL:
-             return False
-         if "generate" not in llm_family.model_ability:
-             return False
-         return True
-
-     def _determine_model_type(self):
-         if self._model_family.model_name not in MODEL_TYPE_FOR_CTRANSFORMERS:
-             raise ValueError(
-                 f"The current model {self._model_family.model_name} is not supported, check your model name. "
-             )
-         return MODEL_TYPE_FOR_CTRANSFORMERS[self._model_family.model_name]
-
-     def generate(
-         self, prompt: str, generate_config_raw: CtransformersGenerateConfig
-     ) -> Union[Completion, Iterator[CompletionChunk]]:
-         def generator_wrapper(
-             _prompt: str,
-             _max_new_tokens: Union[int, None],
-             _generate_config: CtransformersGenerateConfig,
-         ) -> Iterator[CompletionChunk]:
-             assert self._model_uid is not None
-             for _completion_chunk, _ in generate_stream(
-                 model=self._model_uid,
-                 model_ref=self._llm,
-                 prompt=_prompt,
-                 max_new_tokens=_max_new_tokens,
-                 **_generate_config,
-             ):
-                 yield _completion_chunk
-
-         generate_config = self._sanitize_generate_config(generate_config_raw)
-
-         logger.debug(
-             "Enter generate, prompt: %s, generate config: %s", prompt, generate_config
-         )
-
-         max_new_tokens = generate_config.pop("max_tokens", None)
-
-         stream_or_not = generate_config.get("stream", False)
-         if stream_or_not:
-             return generator_wrapper(
-                 _prompt=prompt,
-                 _max_new_tokens=max_new_tokens,
-                 _generate_config=generate_config,
-             )
-         else:
-             assert self.model_uid is not None
-             completion_chunk = None
-             completion_usage = None
-             for completion_chunk, completion_usage in generate_stream(
-                 model=self.model_uid,
-                 model_ref=self._llm,
-                 prompt=prompt,
-                 max_new_tokens=max_new_tokens,
-                 **generate_config,
-             ):
-                 pass
-
-             assert completion_chunk is not None
-             assert completion_usage is not None
-
-             completion = Completion(
-                 id=completion_chunk["id"],
-                 object=completion_chunk["object"],
-                 created=completion_chunk["created"],
-                 model=completion_chunk["model"],
-                 choices=completion_chunk["choices"],
-                 usage=completion_usage,
-             )
-
-             logger.debug(
-                 "Generated, completion: %s, generate config: %s",
-                 completion,
-                 generate_config,
-             )
-
-             return completion
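
One non-obvious piece of the deleted CtransformersModel is its GPU offload heuristic: it snaps the model size to the nearest key of SIZE_TO_GPU_LAYERS and offloads that many layers. A standalone restatement of that logic (the helper name pick_gpu_layers is ours, not from the original file):

# Choose the entry of SIZE_TO_GPU_LAYERS whose key (model size in
# billions) is closest to the requested size, and offload that many
# layers to the GPU.
SIZE_TO_GPU_LAYERS = {3: 26, 7: 32, 13: 40, 30: 60, 65: 80}

def pick_gpu_layers(model_size_in_billions: float) -> int:
    closest_size = min(
        SIZE_TO_GPU_LAYERS, key=lambda size: abs(size - model_size_in_billions)
    )
    return SIZE_TO_GPU_LAYERS[closest_size]

assert pick_gpu_layers(7) == 32
assert pick_gpu_layers(34) == 60  # 34B snaps to the 30B entry, not 65B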
xinference/model/llm/ggml/ctransformers_util.py (deleted)
@@ -1,161 +0,0 @@
- # Copyright 2022-2023 XProbe Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import logging
- import re
- import time
- import uuid
- from typing import Iterator, Optional, Sequence, Tuple
-
- from ....types import CompletionChoice, CompletionChunk, CompletionUsage
-
- logger = logging.getLogger(__name__)
-
-
- def generate_stream(
-     model,
-     model_ref,
-     prompt: str,
-     *,
-     max_new_tokens: Optional[int] = None,
-     top_k: Optional[int] = None,
-     top_p: Optional[float] = None,
-     temperature: Optional[float] = None,
-     repetition_penalty: Optional[float] = None,
-     last_n_tokens: Optional[int] = None,
-     seed: Optional[int] = None,
-     batch_size: Optional[int] = None,
-     stream: Optional[bool] = False,
-     threads: Optional[int] = None,
-     stop: Optional[Sequence[str]] = None,
-     reset: Optional[bool] = None,
-     **kwargs,
- ) -> Iterator[Tuple[CompletionChunk, CompletionUsage]]:
-     stop = stop or []
-     if isinstance(stop, str):
-         stop = [stop]
-
-     tokens = model_ref.tokenize(prompt)
-
-     stop_regex = re.compile("|".join(map(re.escape, stop)))
-     count = 0
-     text = ""
-     total_text = ""
-     incomplete = b""
-
-     # parameters needed for Xinference.
-     finish_reason = None
-
-     try:
-         from ctransformers.utils import utf8_split_incomplete
-     except ImportError:
-         error_message = (
-             "Failed to import module 'ctransformers - utf8_split_incomplete'"
-         )
-
-         installation_guide = [
-             "Please make sure 'ctransformers' is installed. You can install it by checking out the repository: "
-             "https://github.com/marella/ctransformers",
-         ]
-
-         raise ImportError(f"{error_message}\n\n{''.join(installation_guide)}")
-
-     for token in model_ref.generate(
-         tokens,
-         top_k=top_k,
-         top_p=top_p,
-         temperature=temperature,
-         repetition_penalty=repetition_penalty,
-         last_n_tokens=last_n_tokens,
-         seed=seed,
-         batch_size=batch_size,
-         threads=threads,
-         reset=reset,
-     ):
-         # Handle incomplete UTF-8 multi-byte characters.
-         incomplete += model_ref.detokenize([token], decode=False)
-         complete, incomplete = utf8_split_incomplete(incomplete)
-         output = complete.decode(errors="ignore")
-         text += output
-         total_text += output
-
-         # https://github.com/abetlen/llama-cpp-python/blob/1a13d76c487df1c8560132d10bda62d6e2f4fa93/llama_cpp/llama.py#L686-L706
-         # Check if one of the stop sequences is part of the text.
-         # Note that the stop sequence may not always be at the end of text.
-         if stop:
-             match = stop_regex.search(text)
-             if match:
-                 text = text[: match.start()]
-                 finish_reason = "stop"
-                 break
-
-         # Avoid sending the longest suffix of text which is also a prefix
-         # of a stop sequence, as it can form a stop sequence with the text
-         # generated later.
-         longest = 0
-         for s in stop:
-             for i in range(len(s), 0, -1):
-                 if text.endswith(s[:i]):
-                     longest = max(i, longest)
-                     break
-
-         end = len(text) - longest
-         if end > 0:
-             output = text[:end]
-             completion_choice = CompletionChoice(
-                 text=output, index=0, logprobs=None, finish_reason=None
-             )
-             completion_chunk = CompletionChunk(
-                 id=str(uuid.uuid1()),
-                 object="text_completion",
-                 created=int(time.time()),
-                 model=model,
-                 choices=[completion_choice],
-             )
-             completion_usage = CompletionUsage(
-                 prompt_tokens=len(tokens),
-                 completion_tokens=count + 1,
-                 total_tokens=count + 1 + len(tokens),
-             )
-
-             yield completion_chunk, completion_usage
-             text = text[end:]
-
-         count += 1
-         if max_new_tokens is not None and count >= max_new_tokens:
-             finish_reason = "length"
-             break
-
-     if stream is False:
-         completion_choice = CompletionChoice(
-             text=total_text, index=0, logprobs=None, finish_reason=finish_reason
-         )
-     else:
-         completion_choice = CompletionChoice(
-             text=text, index=0, logprobs=None, finish_reason=finish_reason
-         )
-
-     completion_chunk = CompletionChunk(
-         id=str(uuid.uuid1()),
-         object="text_completion",
-         created=int(time.time()),
-         model=model,
-         choices=[completion_choice],
-     )
-     completion_usage = CompletionUsage(
-         prompt_tokens=len(tokens),
-         completion_tokens=count,
-         total_tokens=count + len(tokens),
-     )
-
-     yield completion_chunk, completion_usage
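
Two details of the deleted generate_stream are easy to miss: it buffers incomplete UTF-8 byte sequences across tokens (via ctransformers' utf8_split_incomplete) so multi-byte characters are never split mid-stream, and it withholds any trailing text that could still grow into a stop sequence. The withholding logic, extracted here into a self-contained helper for illustration (the function name is ours):

def withheld_suffix_length(text: str, stop: list) -> int:
    # Length of the longest suffix of `text` that is also a prefix of
    # some stop sequence; those characters are held back because tokens
    # generated later could complete the stop sequence.
    longest = 0
    for s in stop:
        for i in range(len(s), 0, -1):
            if text.endswith(s[:i]):
                longest = max(longest, i)
                break
    return longest

# "Hello <" might be the start of the stop sequence "</s>", so one
# character is held back; "Hello" overlaps no stop prefix at all.
assert withheld_suffix_length("Hello <", ["</s>"]) == 1
assert withheld_suffix_length("Hello", ["</s>"]) == 0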