llama-cpp-python-win 0.3.16__cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bin/convert_hf_to_gguf.py +8751 -0
- bin/ggml-base.dll +0 -0
- bin/ggml-cpu.dll +0 -0
- bin/ggml.dll +0 -0
- bin/llama-mtmd-cli.exe +0 -0
- bin/llama.dll +0 -0
- bin/mtmd.dll +0 -0
- include/ggml-alloc.h +76 -0
- include/ggml-backend.h +354 -0
- include/ggml-blas.h +25 -0
- include/ggml-cann.h +123 -0
- include/ggml-cpp.h +39 -0
- include/ggml-cpu.h +145 -0
- include/ggml-cuda.h +47 -0
- include/ggml-metal.h +66 -0
- include/ggml-opt.h +256 -0
- include/ggml-rpc.h +33 -0
- include/ggml-sycl.h +49 -0
- include/ggml-vulkan.h +29 -0
- include/ggml-webgpu.h +19 -0
- include/ggml.h +2467 -0
- include/gguf.h +202 -0
- include/llama-cpp.h +30 -0
- include/llama.h +1482 -0
- include/mtmd-helper.h +91 -0
- include/mtmd.h +298 -0
- lib/cmake/ggml/ggml-config.cmake +328 -0
- lib/cmake/ggml/ggml-version.cmake +65 -0
- lib/cmake/llama/llama-config.cmake +54 -0
- lib/cmake/llama/llama-version.cmake +65 -0
- lib/ggml-base.lib +0 -0
- lib/ggml-cpu.lib +0 -0
- lib/ggml.lib +0 -0
- lib/llama.lib +0 -0
- lib/mtmd.lib +0 -0
- lib/pkgconfig/llama.pc +10 -0
- llama_cpp/__init__.py +4 -0
- llama_cpp/_ctypes_extensions.py +131 -0
- llama_cpp/_ggml.py +12 -0
- llama_cpp/_internals.py +856 -0
- llama_cpp/_logger.py +47 -0
- llama_cpp/_utils.py +78 -0
- llama_cpp/lib/ggml-base.dll +0 -0
- llama_cpp/lib/ggml-base.lib +0 -0
- llama_cpp/lib/ggml-cpu.dll +0 -0
- llama_cpp/lib/ggml-cpu.lib +0 -0
- llama_cpp/lib/ggml.dll +0 -0
- llama_cpp/lib/ggml.lib +0 -0
- llama_cpp/lib/llama.dll +0 -0
- llama_cpp/lib/llama.lib +0 -0
- llama_cpp/lib/mtmd.dll +0 -0
- llama_cpp/lib/mtmd.lib +0 -0
- llama_cpp/llama.py +2422 -0
- llama_cpp/llama_cache.py +155 -0
- llama_cpp/llama_chat_format.py +3962 -0
- llama_cpp/llama_cpp.py +4374 -0
- llama_cpp/llama_grammar.py +953 -0
- llama_cpp/llama_speculative.py +64 -0
- llama_cpp/llama_tokenizer.py +120 -0
- llama_cpp/llama_types.py +316 -0
- llama_cpp/llava_cpp.py +158 -0
- llama_cpp/mtmd_cpp.py +280 -0
- llama_cpp/py.typed +0 -0
- llama_cpp/server/__init__.py +0 -0
- llama_cpp/server/__main__.py +100 -0
- llama_cpp/server/app.py +597 -0
- llama_cpp/server/cli.py +97 -0
- llama_cpp/server/errors.py +212 -0
- llama_cpp/server/model.py +312 -0
- llama_cpp/server/settings.py +240 -0
- llama_cpp/server/types.py +316 -0
- llama_cpp_python_win-0.3.16.dist-info/METADATA +856 -0
- llama_cpp_python_win-0.3.16.dist-info/RECORD +75 -0
- llama_cpp_python_win-0.3.16.dist-info/WHEEL +5 -0
- llama_cpp_python_win-0.3.16.dist-info/licenses/LICENSE.md +9 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import multiprocessing
|
|
4
|
+
|
|
5
|
+
from typing import Optional, List, Literal, Union, Dict, cast
|
|
6
|
+
from typing_extensions import Self
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, model_validator
|
|
9
|
+
from pydantic_settings import BaseSettings
|
|
10
|
+
|
|
11
|
+
import llama_cpp
|
|
12
|
+
|
|
13
|
+
# Disable warning for model and model_alias settings
|
|
14
|
+
BaseSettings.model_config["protected_namespaces"] = ()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ModelSettings(BaseSettings):
    """Model settings used to load a Llama model."""

    model: str = Field(
        description="The path to the model to use for generating completions."
    )
    model_alias: Optional[str] = Field(
        default=None,
        description="The alias of the model to use for generating completions.",
    )
    # Model Params
    n_gpu_layers: int = Field(
        default=0,
        ge=-1,
        description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
    )
    split_mode: int = Field(
        default=llama_cpp.LLAMA_SPLIT_MODE_LAYER,
        description="The split mode to use.",
    )
    main_gpu: int = Field(
        default=0,
        ge=0,
        description="Main GPU to use.",
    )
    tensor_split: Optional[List[float]] = Field(
        default=None,
        description="Split layers across multiple GPUs in proportion.",
    )
    vocab_only: bool = Field(
        default=False, description="Whether to only return the vocabulary."
    )
    # NOTE: these two defaults are evaluated once at import time via the
    # native library's capability probes.
    use_mmap: bool = Field(
        default=llama_cpp.llama_supports_mmap(),
        description="Use mmap.",
    )
    use_mlock: bool = Field(
        default=llama_cpp.llama_supports_mlock(),
        description="Use mlock.",
    )
    kv_overrides: Optional[List[str]] = Field(
        default=None,
        description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
    )
    rpc_servers: Optional[str] = Field(
        default=None,
        description="comma separated list of rpc servers for offloading",
    )
    # Context Params
    seed: int = Field(
        default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
    )
    n_ctx: int = Field(default=2048, ge=0, description="The context size.")
    n_batch: int = Field(
        default=512, ge=1, description="The batch size to use per eval."
    )
    n_ubatch: int = Field(
        default=512, ge=1, description="The physical batch size used by llama.cpp"
    )
    n_threads: int = Field(
        default=max(multiprocessing.cpu_count() // 2, 1),
        ge=1,
        description="The number of threads to use. Use -1 for max cpu threads",
    )
    n_threads_batch: int = Field(
        default=max(multiprocessing.cpu_count(), 1),
        ge=0,
        description="The number of threads to use when batch processing. Use -1 for max cpu threads",
    )
    rope_scaling_type: int = Field(
        default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
    )
    rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
    rope_freq_scale: float = Field(
        default=0.0, description="RoPE frequency scaling factor"
    )
    yarn_ext_factor: float = Field(default=-1.0)
    yarn_attn_factor: float = Field(default=1.0)
    yarn_beta_fast: float = Field(default=32.0)
    yarn_beta_slow: float = Field(default=1.0)
    yarn_orig_ctx: int = Field(default=0)
    mul_mat_q: bool = Field(
        default=True, description="if true, use experimental mul_mat_q kernels"
    )
    logits_all: bool = Field(default=True, description="Whether to return logits.")
    embedding: bool = Field(default=False, description="Whether to use embeddings.")
    offload_kqv: bool = Field(
        default=True, description="Whether to offload kqv to the GPU."
    )
    flash_attn: bool = Field(
        default=False, description="Whether to use flash attention."
    )
    # Sampling Params
    last_n_tokens_size: int = Field(
        default=64,
        ge=0,
        description="Last n tokens to keep for repeat penalty calculation.",
    )
    # LoRA Params
    lora_base: Optional[str] = Field(
        default=None,
        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.",
    )
    lora_path: Optional[str] = Field(
        default=None,
        description="Path to a LoRA file to apply to the model.",
    )
    # Backend Params
    numa: Union[bool, int] = Field(
        default=False,
        description="Enable NUMA support.",
    )
    # Chat Format Params
    chat_format: Optional[str] = Field(
        default=None,
        description="Chat format to use.",
    )
    clip_model_path: Optional[str] = Field(
        default=None,
        description="Path to a CLIP model to use for multi-modal chat completion.",
    )
    # Cache Params
    cache: bool = Field(
        default=False,
        description="Use a cache to reduce processing times for evaluated prompts.",
    )
    cache_type: Literal["ram", "disk"] = Field(
        default="ram",
        description="The type of cache to use. Only used if cache is True.",
    )
    cache_size: int = Field(
        default=2 << 30,
        description="The size of the cache in bytes. Only used if cache is True.",
    )
    # Tokenizer Options
    hf_tokenizer_config_path: Optional[str] = Field(
        default=None,
        description="The path to a HuggingFace tokenizer_config.json file.",
    )
    hf_pretrained_model_name_or_path: Optional[str] = Field(
        default=None,
        description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
    )
    # Loading from HuggingFace Model Hub
    hf_model_repo_id: Optional[str] = Field(
        default=None,
        description="The model repo id to use for the HuggingFace tokenizer model.",
    )
    # Speculative Decoding
    draft_model: Optional[str] = Field(
        default=None,
        description="Method to use for speculative decoding. One of (prompt-lookup-decoding).",
    )
    draft_model_num_pred_tokens: int = Field(
        default=10,
        description="Number of tokens to predict using the draft model.",
    )
    # KV Cache Quantization
    type_k: Optional[int] = Field(
        default=None,
        description="Type of the key cache quantization.",
    )
    type_v: Optional[int] = Field(
        default=None,
        description="Type of the value cache quantization.",
    )
    # Misc
    verbose: bool = Field(
        default=True, description="Whether to print debug information."
    )

    @model_validator(mode="before")
    @classmethod
    def set_dynamic_defaults(cls, values: Dict[str, int]) -> Dict[str, int]:
        """Resolve ``-1`` thread counts to the machine's CPU count.

        Runs in "before" mode, i.e. on the raw input mapping prior to field
        validation, so the resolved values still satisfy the ``ge``
        constraints on ``n_threads`` / ``n_threads_batch``.
        """
        cpu_count = multiprocessing.cpu_count()
        # "before" validators can also receive non-dict input (e.g. an
        # existing model instance); only rewrite plain mappings.
        if isinstance(values, dict):
            if values.get("n_threads", 0) == -1:
                values["n_threads"] = cpu_count
            if values.get("n_threads_batch", 0) == -1:
                values["n_threads_batch"] = cpu_count
        return values
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class ServerSettings(BaseSettings):
    """Server settings used to configure the FastAPI and Uvicorn server."""

    # Uvicorn Settings
    host: str = Field(default="localhost", description="Listen address")
    port: int = Field(default=8000, description="Listen port")
    # Both SSL fields must be provided together for HTTPS to be enabled —
    # TODO confirm; enforcement (if any) happens where the server is started.
    ssl_keyfile: Optional[str] = Field(
        default=None, description="SSL key file for HTTPS"
    )
    ssl_certfile: Optional[str] = Field(
        default=None, description="SSL certificate file for HTTPS"
    )
    # FastAPI Settings
    api_key: Optional[str] = Field(
        default=None,
        description="API key for authentication. If set all requests need to be authenticated.",
    )
    interrupt_requests: bool = Field(
        default=True,
        description="Whether to interrupt requests when a new request is received.",
    )
    disable_ping_events: bool = Field(
        default=False,
        description="Disable EventSource pings (may be needed for some clients).",
    )
    root_path: str = Field(
        default="",
        description="The root path for the server. Useful when running behind a reverse proxy.",
    )
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class Settings(ServerSettings, ModelSettings):
    """Combined server and (single) model settings, flattened into one namespace."""

    pass
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
class ConfigFileSettings(ServerSettings):
    """Configuration file format settings."""

    # default_factory gives each instance its own fresh list; a literal []
    # as the declared default is the classic mutable-default pitfall and
    # default_factory is the idiomatic pydantic form.
    models: List[ModelSettings] = Field(
        default_factory=list, description="Model configs"
    )
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional, Union, Dict
|
|
4
|
+
from typing_extensions import TypedDict, Literal
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
import llama_cpp
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Reusable pydantic Field definitions shared by the request models below.
# Each one centralizes the default, constraints, and OpenAPI description for
# a parameter that appears in more than one request schema.

model_field = Field(
    description="The model to use for generating completions.", default=None
)

max_tokens_field = Field(
    default=16, ge=1, description="The maximum number of tokens to generate."
)

min_tokens_field = Field(
    default=0,
    ge=0,
    description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).",
)

temperature_field = Field(
    default=0.8,
    description="Adjust the randomness of the generated text.\n\n"
    + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
)

top_p_field = Field(
    default=0.95,
    ge=0.0,
    le=1.0,
    description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n"
    + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.",
)

min_p_field = Field(
    default=0.05,
    ge=0.0,
    le=1.0,
    description="Sets a minimum base probability threshold for token selection.\n\n"
    + "The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter min_p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with min_p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.",
)

stop_field = Field(
    default=None,
    description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
)

stream_field = Field(
    default=False,
    description="Whether to stream the results as they are generated. Useful for chatbots.",
)

top_k_field = Field(
    default=40,
    ge=0,
    description="Limit the next token selection to the K most probable tokens.\n\n"
    + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.",
)

repeat_penalty_field = Field(
    default=1.1,
    ge=0.0,
    description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n"
    + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
)

presence_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
)

frequency_penalty_field = Field(
    default=0.0,
    ge=-2.0,
    le=2.0,
    description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
)

mirostat_mode_field = Field(
    default=0,
    ge=0,
    le=2,
    description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)",
)

mirostat_tau_field = Field(
    default=5.0,
    ge=0.0,
    le=10.0,
    description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text",
)

mirostat_eta_field = Field(
    default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate"
)

# NOTE(review): unlike the other constants this one lacks a `_field` suffix
# and does not appear to be referenced by the request models below (they
# declare `grammar: Optional[str] = None` inline) — possibly dead, but kept
# since other modules may import it.
grammar = Field(
    default=None,
    description="A CBNF grammar (as string) to be used for formatting the model's output.",
)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class CreateCompletionRequest(BaseModel):
    """Request body for the (OpenAI-compatible) text completion endpoint."""

    prompt: Union[str, List[str]] = Field(
        default="", description="The prompt to generate completions for."
    )
    suffix: Optional[str] = Field(
        default=None,
        description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.",
    )
    # Declared inline (not via max_tokens_field) so that ge=0 permits 0 —
    # presumably meaning "no explicit limit"; confirm against the handler.
    max_tokens: Optional[int] = Field(
        default=16, ge=0, description="The maximum number of tokens to generate."
    )
    min_tokens: int = min_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
    min_p: float = min_p_field
    echo: bool = Field(
        default=False,
        description="Whether to echo the prompt in the generated text. Useful for chatbots.",
    )
    stop: Optional[Union[str, List[str]]] = stop_field
    stream: bool = stream_field
    logprobs: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of logprobs to generate. If None, no logprobs are generated.",
    )
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)
    seed: Optional[int] = Field(None)

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
    best_of: Optional[int] = 1
    user: Optional[str] = Field(default=None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field
    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
    mirostat_mode: int = mirostat_mode_field
    mirostat_tau: float = mirostat_tau_field
    mirostat_eta: float = mirostat_eta_field
    grammar: Optional[str] = None

    # Example payload surfaced in the generated OpenAPI schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
                    "stop": ["\n", "###"],
                }
            ]
        }
    }
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class CreateEmbeddingRequest(BaseModel):
    """Request body for the embeddings endpoint."""

    model: Optional[str] = model_field
    # A single string or a batch of strings to embed.
    input: Union[str, List[str]] = Field(description="The input to embed.")
    user: Optional[str] = Field(default=None)

    # Example payload surfaced in the generated OpenAPI schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "input": "The food was delicious and the waiter...",
                }
            ]
        }
    }
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class ChatCompletionRequestMessage(BaseModel):
    """A single chat message (role + content).

    NOTE(review): CreateChatCompletionRequest types its `messages` with
    llama_cpp.ChatCompletionRequestMessage, not this class; here it is only
    used to build the schema example payloads below — confirm before reuse.
    """

    role: Literal["system", "user", "assistant", "function"] = Field(
        default="user", description="The role of the message."
    )
    content: Optional[str] = Field(
        default="", description="The content of the message."
    )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class CreateChatCompletionRequest(BaseModel):
    """Request body for the (OpenAI-compatible) chat completion endpoint."""

    # default_factory avoids declaring a shared mutable list as the default.
    messages: List[llama_cpp.ChatCompletionRequestMessage] = Field(
        default_factory=list,
        description="A list of messages to generate completions for.",
    )
    functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field(
        default=None,
        description="A list of functions to apply to the generated completions.",
    )
    function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field(
        default=None,
        description="A function to apply to the generated completions.",
    )
    tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field(
        default=None,
        description="A list of tools to apply to the generated completions.",
    )
    tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field(
        default=None,
        description="A tool to apply to the generated completions.",
    )  # TODO: verify
    max_tokens: Optional[int] = Field(
        default=None,
        description="The maximum number of tokens to generate. Defaults to inf",
    )
    min_tokens: int = min_tokens_field
    # Description corrected: the field default is False, but the original
    # text claimed "Default is True".
    logprobs: Optional[bool] = Field(
        default=False,
        description="Whether to output the logprobs or not. Default is False",
    )
    top_logprobs: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of logprobs to generate. If None, no logprobs are generated. logprobs need to set to True.",
    )
    temperature: float = temperature_field
    top_p: float = top_p_field
    min_p: float = min_p_field
    stop: Optional[Union[str, List[str]]] = stop_field
    stream: bool = stream_field
    presence_penalty: Optional[float] = presence_penalty_field
    frequency_penalty: Optional[float] = frequency_penalty_field
    logit_bias: Optional[Dict[str, float]] = Field(None)
    seed: Optional[int] = Field(None)
    response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field(
        default=None,
    )

    # ignored or currently unsupported
    model: Optional[str] = model_field
    n: Optional[int] = 1
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
    top_k: int = top_k_field
    repeat_penalty: float = repeat_penalty_field
    logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None)
    mirostat_mode: int = mirostat_mode_field
    mirostat_tau: float = mirostat_tau_field
    mirostat_eta: float = mirostat_eta_field
    grammar: Optional[str] = None

    # Example payload surfaced in the generated OpenAPI schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "messages": [
                        ChatCompletionRequestMessage(
                            role="system", content="You are a helpful assistant."
                        ).model_dump(),
                        ChatCompletionRequestMessage(
                            role="user", content="What is the capital of France?"
                        ).model_dump(),
                    ]
                }
            ]
        }
    }
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
class ModelData(TypedDict):
    """Shape of one entry in the /v1/models listing (OpenAI model object)."""

    id: str
    object: Literal["model"]
    owned_by: str
    permissions: List[str]
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
class ModelList(TypedDict):
    """Shape of the /v1/models response: a list wrapper around ModelData."""

    object: Literal["list"]
    data: List[ModelData]
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
class TokenizeInputRequest(BaseModel):
    """Request body for the tokenize endpoint."""

    model: Optional[str] = model_field
    input: str = Field(description="The input to tokenize.")

    # Example payload surfaced in the generated OpenAPI schema.
    model_config = {
        "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
    }
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
class TokenizeInputResponse(BaseModel):
    """Response body for the tokenize endpoint."""

    tokens: List[int] = Field(description="A list of tokens.")

    model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}}
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
class TokenizeInputCountResponse(BaseModel):
    """Response body for the tokenize count endpoint."""

    count: int = Field(description="The number of tokens in the input.")

    model_config = {"json_schema_extra": {"example": {"count": 5}}}
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
class DetokenizeInputRequest(BaseModel):
    """Request body for the detokenize endpoint."""

    model: Optional[str] = model_field
    # Fixed typo in the user-facing description ("toekns" -> "tokens").
    tokens: List[int] = Field(description="A list of tokens to detokenize.")

    # Example payload surfaced in the generated OpenAPI schema.
    model_config = {"json_schema_extra": {"example": [{"tokens": [123, 321, 222]}]}}
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
class DetokenizeInputResponse(BaseModel):
    """Response body for the detokenize endpoint."""

    text: str = Field(description="The detokenized text.")

    model_config = {
        "json_schema_extra": {"example": {"text": "How many tokens in this query?"}}
    }
|