PyPI - llama-cpp-python - Versions diffs - 0.2.23__tar.gz → 0.2.25__tar.gz - Mend

llama-cpp-python 0.2.23__tar.gz → 0.2.25__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (578) hide show

llama_cpp_python-0.2.25/.git/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 37556bf9c4f83f51e76682316ff4ea3aed58a279 '37556bf9c4f83f51e76682316ff4ea3aed58a279' of https://github.com/abetlen/llama-cpp-python

llama_cpp_python-0.2.25/.git/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 37556bf9c4f83f51e76682316ff4ea3aed58a279

{llama_cpp_python-0.2.23 → llama_cpp_python-0.2.25}/.git/config RENAMED Viewed

@@ -9,7 +9,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX0JaZkpXV1hTeUJSUmU1UmRRNGJhWVhVaHh6WlJLZDJtR0pYVQ==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzXzMzOEY0bUxVSzA1UlkwTkhqNDgzaXlqZW8xRVBxMDFBQ1JITQ==
 [submodule "vendor/llama.cpp"]
 	active = true
 	url = https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.25/.git/index ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/logs/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 37556bf9c4f83f51e76682316ff4ea3aed58a279 runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276562 +0000 checkout: moving from master to refs/tags/v0.2.25

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c

{llama_cpp_python-0.2.23 → llama_cpp_python-0.2.25}/.git/modules/vendor/llama.cpp/config RENAMED Viewed

@@ -13,7 +13,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX0JaZkpXV1hTeUJSUmU1UmRRNGJhWVhVaHh6WlJLZDJtR0pYVQ==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzXzMzOEY0bUxVSzA1UlkwTkhqNDgzaXlqZW8xRVBxMDFBQ1JITQ==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/index ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 7082d24cec35e9ce9147535a2224dfc67ee0a78c runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276563 +0000 clone: from https://github.com/ggerganov/llama.cpp.git
2	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c 7082d24cec35e9ce9147535a2224dfc67ee0a78c runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276563 +0000 checkout: moving from master to 7082d24cec35e9ce9147535a2224dfc67ee0a78c

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 7082d24cec35e9ce9147535a2224dfc67ee0a78c runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276563 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 7082d24cec35e9ce9147535a2224dfc67ee0a78c runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276563 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/objects/pack/pack-53054ebe4aff53f3127333b245dfe71db566f164.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.23/.git/modules/vendor/llama.cpp/objects/pack/pack-12e6daaf60fd1afc320675169bfed78e447d3fb5.pack → llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/objects/pack/pack-53054ebe4aff53f3127333b245dfe71db566f164.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/objects/pack/pack-53054ebe4aff53f3127333b245dfe71db566f164.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/packed-refs ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # pack-refs with: peeled fully-peeled sorted
2	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c refs/remotes/origin/master

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/refs/tags/b1691 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c

llama_cpp_python-0.2.23/.git/objects/bf/9eb24da4238dafc5bef7eabea39b0fcecf5fac → llama_cpp_python-0.2.25/.git/objects/0e/f132b07175867c07ad06fa22ca6b95eca67b59 RENAMED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/13/454a3a6bea90892a42064c32f7a1a60deb0806 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/1c/5efea21fad700ef81acb5682eb71efa64c7453 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/2b/14bc6783798c56c71db248c5a834c30fbbce21 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/34/6b4631ebd1f4af85e9988d4a528e00edba6375 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/37/556bf9c4f83f51e76682316ff4ea3aed58a279 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/64/b567b4f3142efeae284deeab2342122d7e62bd ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/78/8732bd3ba7ed71b0e49fb2dfe42d4ed781c0eb ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/7c/1dececc933fdfba00ee95b5bed81f447a21333 ADDED Viewed

Binary file

llama_cpp_python-0.2.23/.git/objects/3d/07614e35e03d55630abf4e92857441fdcaf91f → llama_cpp_python-0.2.25/.git/objects/80/7b0f57a8a873e58ade0ff0f5b0bcf0ff66b7f9 RENAMED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/85/21e7721390edb971bb04098cba2d50446b3d8f ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/8e/32d2c0edce725a47b5845463133919cd766a61 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/97/fe6e180b574c24eb4f07ef229981a3ac478bb7 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/9d/3d3559849603efda6f3c8181684e4d19e0ec79 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/af/f397f476fb7773d0e89b0e8913c8b1f97ca3e4 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/b5/affaa9d6087f3888dd9eedea209bb214b6e135 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/b9/373b7ac641e6e9c8d8cc64951139205d91d8bc ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/c5/4e4eb5ce2636abd78df46a7616cfe9196a1198 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/c8/4fd04498c2fb188ff7c2a59473035fc90eb990 ADDED Viewed

Binary file

llama_cpp_python-0.2.23/.git/objects/37/670106e61a8a77daff1cc7852e228b07f99293 → llama_cpp_python-0.2.25/.git/objects/ca/e7ebb7a833dafcd402a96bea3a9574f74f0ed5 RENAMED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/cd/351ba33849dcf6af35b493f7405962fa1625d4 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/da/c33b74dddf06fcfc01244044eebb102cfcea37 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/ef/1b2c0162e8edd321e2b9c1ce375d96f1f1d048 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/f0/827d762e852a21f6406c469300899d5f509b8f ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/fa/dfc5fb4fe6f5eb6d5d98b62519e374a5202b00 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/refs/tags/v0.2.25 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 37556bf9c4f83f51e76682316ff4ea3aed58a279

llama_cpp_python-0.2.25/.git/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 37556bf9c4f83f51e76682316ff4ea3aed58a279

{llama_cpp_python-0.2.23 → llama_cpp_python-0.2.25}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.25]
+- feat(server): Multi model support by @D4ve-R in #931
+- feat(server): Support none defaulting to infinity for completions by @swg in #111
+- feat(server): Implement openai api compatible authentication by @docmeth2 in #1010
+- fix: text_offset of multi-token characters by @twaka in #1037
+- fix: ctypes bindings for kv override by @phiharri in #1011
+- fix: ctypes definitions of llama_kv_cache_view_update and llama_kv_cache_view_free. by @e-c-d in #1028
+## [0.2.24]
+- feat: Update llama.cpp to ggerganov/llama.cpp@0e18b2e7d0b5c0a509ea40098def234b8d4a938a
+- feat: Add offload_kqv option to llama and server by @abetlen in 095c65000642a3cf73055d7428232fb18b73c6f3
+- feat: n_ctx=0 now uses the n_ctx_train of the model by @DanieleMorotti in #1015
+- feat: logits_to_logprobs supports both 2-D and 3-D logits arrays by @kddubey in #1002
+- fix: Remove f16_kv, add offload_kqv fields in low level and llama apis by @brandonrobertz in #1019
+- perf: Don't convert logprobs arrays to lists by @kddubey in #1021
+- docs: Fix README.md functionary demo typo by @evelynmitchell in #996
+- examples: Update low_level_api_llama_cpp.py to match current API by @jsoma in #1023
 ## [0.2.23]
 - Update llama.cpp to ggerganov/llama.cpp@948ff137ec37f1ec74c02905917fa0afc9b97514

{llama_cpp_python-0.2.23 → llama_cpp_python-0.2.25}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.2.23
+Version: 0.2.25
 Summary: Python bindings for the llama.cpp library
 Author-Email: Andrei Betlen <abetlen@gmail.com>
 License: MIT
@@ -25,6 +25,7 @@ Requires-Dist: sse-starlette>=1.6.1; extra == "server"
 Requires-Dist: starlette-context<0.4,>=0.3.6; extra == "server"
 Requires-Dist: pytest>=7.4.0; extra == "test"
 Requires-Dist: httpx>=0.24.1; extra == "test"
+Requires-Dist: scipy>=1.10; extra == "test"
 Requires-Dist: black>=23.3.0; extra == "dev"
 Requires-Dist: twine>=4.0.2; extra == "dev"
 Requires-Dist: mkdocs>=1.4.3; extra == "dev"
@@ -59,11 +60,13 @@ This package provides:
     - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
     - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
     - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+    - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
 ## Installation
 `llama-cpp-python` can be installed directly from PyPI as a source distribution by running:
@@ -261,7 +264,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
         "function": {
           "name": "UserDetail",
           "parameters": {
-            "type": "object"
+            "type": "object",
             "title": "UserDetail",
             "properties": {
               "name": {
@@ -373,6 +376,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
 - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
 - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+- [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 ## Docker image

{llama_cpp_python-0.2.23 → llama_cpp_python-0.2.25}/README.md RENAMED Viewed

@@ -18,11 +18,13 @@ This package provides:
     - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
     - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
     - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+    - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
 ## Installation
 `llama-cpp-python` can be installed directly from PyPI as a source distribution by running:
@@ -220,7 +222,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
         "function": {
           "name": "UserDetail",
           "parameters": {
-            "type": "object"
+            "type": "object",
             "title": "UserDetail",
             "properties": {
               "name": {
@@ -332,6 +334,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
 - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
 - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+- [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 ## Docker image

{llama_cpp_python-0.2.23 → llama_cpp_python-0.2.25}/docs/server.md RENAMED Viewed

@@ -32,6 +32,12 @@ python3 -m llama_cpp.server --help
 NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.
+Check out the server options reference below for more information on the available options.
+CLI arguments and environment variables are available for all of the fields defined in [`ServerSettings`](#llama_cpp.server.settings.ServerSettings) and [`ModelSettings`](#llama_cpp.server.settings.ModelSettings)
+Additionally, the server supports configuration via a config file; check out the [configuration section](#configuration-and-multi-model-support) for more information and examples.
 ## Guides
 ### Code Completion
@@ -121,4 +127,92 @@ response = client.chat.completions.create(
     ],
 )
 print(response)
-```
+```
+## Configuration and Multi-Model Support
+The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable.
+```bash
+python3 -m llama_cpp.server --config_file <config_file>
+```
+Config files support all of the server and model options supported by the CLI and environment variables; however, instead of only a single model, the config file can specify multiple models.
+The server supports routing requests to multiple models based on the `model` parameter in the request which matches against the `model_alias` in the config file.
+At the moment only a single model is loaded into memory at a time; the server will automatically load and unload models as needed.
+```json
+{
+    "host": "0.0.0.0",
+    "port": 8080,
+    "models": [
+        {
+            "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
+            "model_alias": "gpt-3.5-turbo",
+            "chat_format": "chatml",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
+            "model_alias": "gpt-4",
+            "chat_format": "chatml",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf",
+            "model_alias": "gpt-4-vision-preview",
+            "chat_format": "llava-1-5",
+            "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf",
+            "model_alias": "text-davinci-003",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf",
+            "model_alias": "copilot-codex",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 1024,
+            "n_ctx": 9216
+        }
+    ]
+}
+```
+The config file format is defined by the [`ConfigFileSettings`](#llama_cpp.server.settings.ConfigFileSettings) class.
+## Server Options Reference
+::: llama_cpp.server.settings.ConfigFileSettings
+    options:
+        show_if_no_docstring: true
+::: llama_cpp.server.settings.ServerSettings
+    options:
+        show_if_no_docstring: true
+::: llama_cpp.server.settings.ModelSettings
+    options:
+        show_if_no_docstring: true

{llama_cpp_python-0.2.23 → llama_cpp_python-0.2.25}/examples/low_level_api/low_level_api_llama_cpp.py RENAMED Viewed

@@ -73,7 +73,7 @@ while remaining_tokens > 0:
     embd = []
     if len(embd_inp) <= input_consumed:
         logits = llama_cpp.llama_get_logits(ctx)
-        n_vocab = llama_cpp.llama_n_vocab(ctx)
+        n_vocab = llama_cpp.llama_n_vocab(model)
         _arr = (llama_cpp.llama_token_data * n_vocab)(*[
             llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
@@ -83,12 +83,12 @@ while remaining_tokens > 0:
             llama_cpp.llama_token_data_array(_arr, len(_arr), False))
         _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data)
-        llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p,
+        llama_cpp.llama_sample_repetition_penalties(ctx, candidates_p,
             _arr,
-            last_n_repeat, repeat_penalty)
-        llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p,
-            _arr,
-            last_n_repeat, frequency_penalty, presence_penalty)
+            penalty_last_n=last_n_repeat,
+            penalty_repeat=repeat_penalty,
+            penalty_freq=frequency_penalty,
+            penalty_present=presence_penalty)
         llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1)
         llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1)
@@ -126,4 +126,4 @@ print()
 llama_cpp.llama_print_timings(ctx)
-llama_cpp.llama_free(ctx)
+llama_cpp.llama_free(ctx)

{llama_cpp_python-0.2.23 → llama_cpp_python-0.2.25}/llama_cpp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.23"
+__version__ = "0.2.25"

{llama_cpp_python-0.2.23 → llama_cpp_python-0.2.25}/llama_cpp/llama.py RENAMED Viewed

@@ -2,7 +2,6 @@ import os
 import sys
 import uuid
 import time
-import math
 import multiprocessing
 from abc import ABC, abstractmethod
 from typing import (
@@ -751,9 +750,9 @@ class Llama:
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
         mul_mat_q: bool = True,
-        f16_kv: bool = True,
         logits_all: bool = False,
         embedding: bool = False,
+        offload_kqv: bool = False,
         # Sampling Params
         last_n_tokens_size: int = 64,
         # LoRA Params
@@ -771,7 +770,7 @@ class Llama:
         **kwargs,  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
         Examples:
             Basic usage
@@ -817,9 +816,9 @@ class Llama:
             yarn_beta_fast: YaRN low correction dim
             yarn_beta_slow: YaRN high correction dim
             yarn_orig_ctx: YaRN original context size
-            f16_kv: Use fp16 for KV cache, fp32 otherwise
             logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
+            offload_kqv: Offload K, Q, V to GPU.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
@@ -904,9 +903,9 @@ class Llama:
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
         self.context_params.mul_mat_q = mul_mat_q
-        # self.context_params.f16_kv = f16_kv
         self.context_params.logits_all = logits_all
         self.context_params.embedding = embedding
+        self.context_params.offload_kqv = offload_kqv
         # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
@@ -923,6 +922,12 @@ class Llama:
         self._model = _LlamaModel(
             path_model=self.model_path, params=self.model_params, verbose=self.verbose
         )
+        # Set the default value for the context and correct the batch
+        if n_ctx == 0:
+            n_ctx = self._model.n_ctx_train()
+            self.n_batch = min(n_ctx, n_batch)
+            self.context_params.n_ctx = self._model.n_ctx_train()
+            self.context_params.n_batch = self.n_batch
         self._ctx = _LlamaContext(
             model=self._model,
@@ -1546,11 +1551,13 @@ class Llama:
                             "utf-8", errors="ignore"
                         )
                         text_offset = len(prompt) + len(
-                            self.detokenize(completion_tokens[:returned_tokens])
+                            self.detokenize(completion_tokens[:returned_tokens]).decode(
+                                "utf-8", errors="ignore"
+                            )
                         )
                         token_offset = len(prompt_tokens) + returned_tokens
-                        logits = self._scores[token_offset - 1, :].tolist()
-                        current_logprobs = Llama.logits_to_logprobs(logits)
+                        logits = self._scores[token_offset - 1, :]
+                        current_logprobs = Llama.logits_to_logprobs(logits).tolist()
                         sorted_logprobs = list(
                             sorted(
                                 zip(current_logprobs, range(len(current_logprobs))),
@@ -1668,8 +1675,8 @@ class Llama:
                         self.detokenize(completion_tokens[:returned_tokens])
                     )
                     token_offset = len(prompt_tokens) + returned_tokens - 1
-                    logits = self._scores[token_offset, :].tolist()
-                    current_logprobs = Llama.logits_to_logprobs(logits)
+                    logits = self._scores[token_offset, :]
+                    current_logprobs = Llama.logits_to_logprobs(logits).tolist()
                     sorted_logprobs = list(
                         sorted(
                             zip(current_logprobs, range(len(current_logprobs))),
@@ -1782,16 +1789,21 @@ class Llama:
                 self.detokenize([token]).decode("utf-8", errors="ignore")
                 for token in all_tokens
             ]
-            all_logprobs = [
-                Llama.logits_to_logprobs(row.tolist()) for row in self._scores
-            ][token_offset:]
-            for token, token_str, logprobs_token in zip(
-                all_tokens, all_token_strs, all_logprobs
+            all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:]
+            # TODO: may be able to change this loop to use np.take_along_dim
+            for idx, (token, token_str, logprobs_token) in enumerate(
+                zip(all_tokens, all_token_strs, all_logprobs)
             ):
                 if token == self.token_bos():
                     continue
-                text_offsets.append(text_offset)
-                text_offset += len(token_str)
+                text_offsets.append(
+                    text_offset
+                    + len(
+                        self.detokenize(all_tokens[:idx]).decode(
+                            "utf-8", errors="ignore"
+                        )
+                    )
+                )
                 tokens.append(token_str)
                 sorted_logprobs = list(
                     sorted(
@@ -1905,7 +1917,7 @@ class Llama:
         completion_or_chunks = self._create_completion(
             prompt=prompt,
             suffix=suffix,
-            max_tokens=max_tokens,
+            max_tokens=-1 if max_tokens is None else max_tokens,
             temperature=temperature,
             top_p=top_p,
             min_p=min_p,
@@ -1939,7 +1951,7 @@ class Llama:
         self,
         prompt: str,
         suffix: Optional[str] = None,
-        max_tokens: int = 128,
+        max_tokens: Optional[int] = 16,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,
@@ -2149,7 +2161,6 @@ class Llama:
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             mul_mat_q=self.context_params.mul_mat_q,
-            f16_kv=self.context_params.f16_kv,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embedding,
             # Sampling Params
@@ -2192,7 +2203,6 @@ class Llama:
             yarn_beta_slow=state["yarn_beta_slow"],
             yarn_orig_ctx=state["yarn_orig_ctx"],
             mul_mat_q=state["mul_mat_q"],
-            f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
             embedding=state["embedding"],
             # Sampling Params
@@ -2280,14 +2290,22 @@ class Llama:
         return self._model.token_nl()
     @staticmethod
-    def logits_to_logprobs(logits: npt.NDArray[np.single]) -> npt.NDArray[np.single]:
-        maximum = np.max(logits)
-        tmp = np.subtract(logits, maximum, dtype=np.single)
-        np.exp(tmp, out=tmp)
-        normalizer = 1.0 / np.sum(tmp)
-        np.multiply(normalizer, tmp, out=tmp)
-        np.log(tmp, out=tmp)
-        return tmp
+    def logits_to_logprobs(
+        logits: Union[npt.NDArray[np.single], List], axis: int = -1
+    ) -> npt.NDArray[np.single]:
+        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.log_softmax.html
+        logits_maxs: np.ndarray = np.amax(logits, axis=axis, keepdims=True)
+        if logits_maxs.ndim > 0:
+            logits_maxs[~np.isfinite(logits_maxs)] = 0
+        elif not np.isfinite(logits_maxs):
+            logits_maxs = 0
+        subtract_maxs = np.subtract(logits, logits_maxs, dtype=np.single)
+        exp = np.exp(subtract_maxs)
+        # Suppress warnings about log of zero
+        with np.errstate(divide="ignore"):
+            summed = np.sum(exp, axis=axis, keepdims=True)
+            out = np.log(summed)
+        return subtract_maxs - out
     @staticmethod
     def longest_token_prefix(a: Sequence[int], b: Sequence[int]):

llama-cpp-python 0.2.23__tar.gz → 0.2.25__tar.gz

llama-cpp-python 0.2.23__tar.gz → 0.2.25__tar.gz