PyPI - llama-cpp-python - Versions diffs - 0.2.28__tar.gz → 0.2.29__tar.gz - Mend

llama-cpp-python 0.2.28__tar.gz → 0.2.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (612) hide show

llama_cpp_python-0.2.29/.git/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 4b11fa83c00a3c04cfb47775ffcd226167d52044 '4b11fa83c00a3c04cfb47775ffcd226167d52044' of https://github.com/abetlen/llama-cpp-python

llama_cpp_python-0.2.29/.git/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 4b11fa83c00a3c04cfb47775ffcd226167d52044

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/.git/config RENAMED Viewed

@@ -9,7 +9,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX3o0cnV1YlBtbkRCRjlFeGplTTNOREdVajlMSUVudjE4NXhveA==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzXzg0QXZkVFUxdjRsWmlJSlFjZHVISHVlZnBtUnA3STFDcUxkeA==
 [submodule "vendor/llama.cpp"]
 	active = true
 	url = https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.29/.git/index ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/logs/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 4b11fa83c00a3c04cfb47775ffcd226167d52044 runner <runner@fv-az651-417.mi4nuvdbnkburlvweydx3iqepe.ex.internal.cloudapp.net> 1705341384 +0000 checkout: moving from master to refs/tags/v0.2.29

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 4483396751c79dea540808b9cb9238245d06da2b

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/.git/modules/vendor/llama.cpp/config RENAMED Viewed

@@ -13,7 +13,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX3o0cnV1YlBtbkRCRjlFeGplTTNOREdVajlMSUVudjE4NXhveA==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzXzg0QXZkVFUxdjRsWmlJSlFjZHVISHVlZnBtUnA3STFDcUxkeA==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/index ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 4483396751c79dea540808b9cb9238245d06da2b runner <runner@fv-az651-417.mi4nuvdbnkburlvweydx3iqepe.ex.internal.cloudapp.net> 1705341385 +0000 clone: from https://github.com/ggerganov/llama.cpp.git
2	+ 4483396751c79dea540808b9cb9238245d06da2b 4483396751c79dea540808b9cb9238245d06da2b runner <runner@fv-az651-417.mi4nuvdbnkburlvweydx3iqepe.ex.internal.cloudapp.net> 1705341385 +0000 checkout: moving from master to 4483396751c79dea540808b9cb9238245d06da2b

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 4483396751c79dea540808b9cb9238245d06da2b runner <runner@fv-az651-417.mi4nuvdbnkburlvweydx3iqepe.ex.internal.cloudapp.net> 1705341385 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 4483396751c79dea540808b9cb9238245d06da2b runner <runner@fv-az651-417.mi4nuvdbnkburlvweydx3iqepe.ex.internal.cloudapp.net> 1705341385 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/objects/pack/pack-089f43df9dfbcbef5693213d630b739a6bd96e17.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.28/.git/modules/vendor/llama.cpp/objects/pack/pack-51c0c02c987a3a975ed4edb03bbd1999de104e16.pack → llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/objects/pack/pack-089f43df9dfbcbef5693213d630b739a6bd96e17.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/objects/pack/pack-089f43df9dfbcbef5693213d630b739a6bd96e17.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/packed-refs ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # pack-refs with: peeled fully-peeled sorted
2	+ 4483396751c79dea540808b9cb9238245d06da2b refs/remotes/origin/master

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 4483396751c79dea540808b9cb9238245d06da2b

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/refs/tags/b1878 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 4483396751c79dea540808b9cb9238245d06da2b

llama_cpp_python-0.2.29/.git/modules/vendor/llama.cpp/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 4483396751c79dea540808b9cb9238245d06da2b

llama_cpp_python-0.2.29/.git/objects/02/c09afb0bf5559d3fe64ce67f4ff82af32ff50f ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/03/6b2ddd1f34297fc03f715b82fd66945e9147ea ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/1a/5152530cfbde487c928b60269a29fa5219f617 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/1b/78703a3fcd25c369a170cc7c94ca5a0a0e3baf ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/4b/11fa83c00a3c04cfb47775ffcd226167d52044 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/50/6ab1f7ef5b87b110a38090b844b19b63c5bb8c ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/65/206bf28d4bcaebea79c68bbd4e526aed5da6b3 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/6e/7ace3b7938f7c58a9ef2bf593c13691c03cf45 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/90/2a43919c5d79d1d418aee2d1512c2e92e8a00b ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/9e/8e3cec752c06b0fd8cebdebbc6dcf7cade1a5a ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/a4/5e5d77363eb85144d4aff1a3cbe86ce94d5c92 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/ba/14c5172dd87bbfcee083a5eac8c0511bb0c633 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/c0/2e656426ce672792fd20cb55c9616067974520 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/c0/748ee14e43d553bccbf402a4ed6c13b563b453 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/d1/ae9b564d3ab02c6b91162e52d822d36524edb3 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/e4/be9d1c98b789ed81b2d0fbdd28a441f267f489 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/ef/9392b7a3d1f6802cef539eea18185eb50c61af ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/f7/b6ba6b1950c0c0a796d996f980e763bd9641cb ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/objects/f9/be3237d374ae1fa7e6d7e9587f17636b0a4f35 ADDED Viewed

Binary file

llama_cpp_python-0.2.29/.git/refs/tags/v0.2.29 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 4b11fa83c00a3c04cfb47775ffcd226167d52044

llama_cpp_python-0.2.29/.git/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 4b11fa83c00a3c04cfb47775ffcd226167d52044

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.29]
+- feat: Update llama.cpp to ggerganov/llama.cpp@4483396751c79dea540808b9cb9238245d06da2b
+- feat: Add split_mode option by @abetlen in 84615adbc6855c8384807c42f0130f9a1763f99d
+- feat: Implement GGUF metadata KV overrides by @phiharri in #1011
+- fix: Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor by @yieldthought in #1012
+- fix: Fix low_level_api_chat_cpp example to match current API by @aniljava in #1086
+- fix: Fix Pydantic model parsing by @DeNeutoy in #1087
 ## [0.2.28]
 - feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.2.28
+Version: 0.2.29
 Summary: Python bindings for the llama.cpp library
 Author-Email: Andrei Betlen <abetlen@gmail.com>
 License: MIT

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/examples/low_level_api/common.py RENAMED Viewed

@@ -106,7 +106,7 @@ def gpt_params_parse(argv = None):
     parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")
     parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
-    parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
+    parser.add_argument("-p", "--prompt", type=str, default=None, help="initial prompt",dest="prompt")
     parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
     parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session")
     parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/examples/low_level_api/low_level_api_chat_cpp.py RENAMED Viewed

@@ -62,7 +62,7 @@ specified) expect poor results""", file=sys.stderr)
 		self.multibyte_fix = []
 		# model load
-		self.lparams = llama_cpp.llama_context_default_params()
+		self.lparams = llama_cpp.llama_model_default_params()
 		self.lparams.n_ctx = self.params.n_ctx
 		self.lparams.n_parts = self.params.n_parts
 		self.lparams.seed = self.params.seed
@@ -72,7 +72,11 @@ specified) expect poor results""", file=sys.stderr)
 		self.model = llama_cpp.llama_load_model_from_file(
 			self.params.model.encode("utf8"), self.lparams)
-		self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.lparams)
+		# Context Params.
+		self.cparams = llama_cpp.llama_context_default_params()
+		self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
 		if (not self.ctx):
 			raise RuntimeError(f"error: failed to load model '{self.params.model}'")
@@ -244,7 +248,7 @@ n_keep = {self.params.n_keep}
 	# tokenize a prompt
 	def _tokenize(self, prompt, bos=True):
 		_arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
-		_n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos)
+		_n = llama_cpp.llama_tokenize(self.model, prompt.encode("utf8", errors="ignore"), len(prompt), _arr, len(_arr), bos, False)
 		return _arr[:_n]
 	def set_color(self, c):
@@ -304,7 +308,7 @@ n_keep = {self.params.n_keep}
 					self.n_past += n_eval"""
 				if (llama_cpp.llama_eval(
-					self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
+					self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past
 				) != 0):
 					raise Exception("Failed to llama_eval!")
@@ -332,7 +336,7 @@ n_keep = {self.params.n_keep}
 				id = 0
 				logits = llama_cpp.llama_get_logits(self.ctx)
-				n_vocab = llama_cpp.llama_n_vocab(self.ctx)
+				n_vocab = llama_cpp.llama_n_vocab(self.model)
 				# Apply params.logit_bias map
 				for key, value in self.params.logit_bias.items():
@@ -349,12 +353,20 @@ n_keep = {self.params.n_keep}
 				last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
 				_arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
-				llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
-					_arr,
-					last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
-				llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
-					_arr,
-					last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
+				llama_cpp.llama_sample_repetition_penalties(
+					ctx=self.ctx,
+					candidates=candidates_p,
+					last_tokens_data = _arr,
+					penalty_last_n = last_n_repeat,
+					penalty_repeat = llama_cpp.c_float(self.params.repeat_penalty),
+					penalty_freq = llama_cpp.c_float(self.params.frequency_penalty),
+					penalty_present = llama_cpp.c_float(self.params.presence_penalty),
+				)
+				# NOT PRESENT IN CURRENT VERSION ?
+				# llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p,
+				# 	_arr,
+				# 	last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
 				if not self.params.penalize_nl:
 					logits[llama_cpp.llama_token_nl()] = nl_logit
@@ -473,7 +485,7 @@ n_keep = {self.params.n_keep}
 	def token_to_str(self, token_id: int) -> bytes:
 		size = 32
 		buffer = (ctypes.c_char * size)()
-		n = llama_cpp.llama_token_to_piece_with_model(
+		n = llama_cpp.llama_token_to_piece(
 			self.model, llama_cpp.llama_token(token_id), buffer, size)
 		assert n <= size
 		return bytes(buffer[:n])
@@ -532,6 +544,9 @@ n_keep = {self.params.n_keep}
 			print(i,end="",flush=True)
 		self.params.input_echo = False
+        # Using string instead of tokens to check for antiprompt,
+		# It is more reliable than tokens for interactive mode.
+		generated_str = ""
 		while self.params.interactive:
 			self.set_color(util.CONSOLE_COLOR_USER_INPUT)
 			if (self.params.instruct):
@@ -546,6 +561,10 @@ n_keep = {self.params.n_keep}
 			try:
 				for i in self.output():
 					print(i,end="",flush=True)
+					generated_str += i
+					for ap in self.params.antiprompt:
+						if generated_str.endswith(ap):
+							raise KeyboardInterrupt
 			except KeyboardInterrupt:
 				self.set_color(util.CONSOLE_COLOR_DEFAULT)
 				if not self.params.instruct:
@@ -561,7 +580,7 @@ if __name__ == "__main__":
 	time_now = datetime.now()
 	prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
 {AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision.
-There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
+Transcript below contains only the recorded dialog between two, without any annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
 The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
 The transcript only includes text, it does not include markup like HTML and Markdown.
@@ -575,8 +594,11 @@ The transcript only includes text, it does not include markup like HTML and Mark
 {AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
 {USER_NAME}: Name a color.
 {AI_NAME}: Blue
-{USER_NAME}:"""
+{USER_NAME}:   """
 	params = gpt_params_parse()
+	if params.prompt is None and params.file is None:
+		params.prompt = prompt
 	with LLaMAInteract(params) as m:
 		m.interact()

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/llama_cpp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.28"
+__version__ = "0.2.29"

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/llama_cpp/_utils.py RENAMED Viewed

@@ -1,11 +1,15 @@
 import os
 import sys
+import sys, traceback
+# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
+outnull_file = open(os.devnull, "w")
+errnull_file = open(os.devnull, "w")
 class suppress_stdout_stderr(object):
     # NOTE: these must be "saved" here to avoid exceptions when using
     #       this context manager inside of a __del__ method
-    open = open
     sys = sys
     os = os
@@ -21,9 +25,6 @@ class suppress_stdout_stderr(object):
         if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
             return self  # Return the instance without making changes
-        self.outnull_file = self.open(self.os.devnull, "w")
-        self.errnull_file = self.open(self.os.devnull, "w")
         self.old_stdout_fileno_undup = self.sys.stdout.fileno()
         self.old_stderr_fileno_undup = self.sys.stderr.fileno()
@@ -33,11 +34,11 @@ class suppress_stdout_stderr(object):
         self.old_stdout = self.sys.stdout
         self.old_stderr = self.sys.stderr
-        self.os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
-        self.os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
+        self.os.dup2(outnull_file.fileno(), self.old_stdout_fileno_undup)
+        self.os.dup2(errnull_file.fileno(), self.old_stderr_fileno_undup)
-        self.sys.stdout = self.outnull_file
-        self.sys.stderr = self.errnull_file
+        self.sys.stdout = outnull_file
+        self.sys.stderr = errnull_file
         return self
     def __exit__(self, *_):
@@ -54,6 +55,3 @@ class suppress_stdout_stderr(object):
             self.os.close(self.old_stdout_fileno)
             self.os.close(self.old_stderr_fileno)
-            self.outnull_file.close()
-            self.errnull_file.close()

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/llama_cpp/llama.py RENAMED Viewed

@@ -730,11 +730,13 @@ class Llama:
         *,
         # Model Params
         n_gpu_layers: int = 0,
+        split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
+        kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None,
         # Context Params
         seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
@@ -798,11 +800,13 @@ class Llama:
         Args:
             model_path: Path to the model.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
-            main_gpu: The GPU that is used for scratch and small tensors.
+            split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
+            main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
+            kv_overrides: Key-value overrides for the model.
             seed: RNG seed, -1 for random
             n_ctx: Text context, 0 = from model
             n_batch: Prompt processing maximum batch size
@@ -848,6 +852,7 @@ class Llama:
         self.model_params.n_gpu_layers = (
             0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
+        self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
         self.tensor_split = tensor_split
         self._c_tensor_split = None
@@ -866,6 +871,34 @@ class Llama:
         self.model_params.use_mmap = use_mmap if lora_path is None else False
         self.model_params.use_mlock = use_mlock
+        self.kv_overrides = kv_overrides
+        if kv_overrides is not None:
+            n_overrides = len(kv_overrides)
+            self._kv_overrides_array = llama_cpp.llama_model_kv_override * (n_overrides + 1)
+            self._kv_overrides_array_keys = []
+            for k, v in kv_overrides.items():
+                key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
+                self._kv_overrides_array_keys.append(key_buf)
+                self._kv_overrides_array[i].key = key_buf
+                if isinstance(v, int):
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
+                    self._kv_overrides_array[i].value.int_value = v
+                elif isinstance(v, float):
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
+                    self._kv_overrides_array[i].value.float_value = v
+                elif isinstance(v, bool):
+                    self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
+                    self._kv_overrides_array[i].value.bool_value = v
+                else:
+                    raise ValueError(f"Unknown value type for {k}: {v}")
+            self._kv_overrides_array_sentinel_key = b'\0'
+            # null array sentinel
+            self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
+            self.model_params.kv_overrides = self._kv_overrides_array
         self.n_batch = min(n_ctx, n_batch)  # ???
         self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
         self.n_threads_batch = n_threads_batch or max(
@@ -2143,11 +2176,13 @@ class Llama:
             model_path=self.model_path,
             # Model Params
             n_gpu_layers=self.model_params.n_gpu_layers,
+            split_mode=self.model_params.split_mode,
             main_gpu=self.model_params.main_gpu,
             tensor_split=self.tensor_split,
             vocab_only=self.model_params.vocab_only,
             use_mmap=self.model_params.use_mmap,
             use_mlock=self.model_params.use_mlock,
+            kv_overrides=self.kv_overrides,
             # Context Params
             seed=self.context_params.seed,
             n_ctx=self.context_params.n_ctx,
@@ -2185,11 +2220,13 @@ class Llama:
             model_path=state["model_path"],
             # Model Params
             n_gpu_layers=state["n_gpu_layers"],
+            split_mode=state["split_mode"],
             main_gpu=state["main_gpu"],
             tensor_split=state["tensor_split"],
             vocab_only=state["vocab_only"],
             use_mmap=state["use_mmap"],
             use_mlock=state["use_mlock"],
+            kv_overrides=state["kv_overrides"],
             # Context Params
             seed=state["seed"],
             n_ctx=state["n_ctx"],

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/llama_cpp/llama_cpp.py RENAMED Viewed

@@ -112,8 +112,8 @@ LLAMA_FILE_MAGIC_GGSN = 0x6767736E
 # define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 3
-LLAMA_SESSION_VERSION = 3
+# define LLAMA_SESSION_VERSION 4
+LLAMA_SESSION_VERSION = 4
 # struct llama_model;
@@ -180,6 +180,8 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 #     LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
 #     LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
 #     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -200,6 +202,9 @@ LLAMA_FTYPE_MOSTLY_Q4_K_M = 15
 LLAMA_FTYPE_MOSTLY_Q5_K_S = 16
 LLAMA_FTYPE_MOSTLY_Q5_K_M = 17
 LLAMA_FTYPE_MOSTLY_Q6_K = 18
+LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
+LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
+LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
 LLAMA_FTYPE_GUESSED = 1024
 # enum llama_rope_scaling_type {
@@ -215,6 +220,15 @@ LLAMA_ROPE_SCALING_LINEAR = 1
 LLAMA_ROPE_SCALING_YARN = 2
 LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN
+# enum llama_split_mode {
+#     LLAMA_SPLIT_NONE    = 0, // single GPU
+#     LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs
+#     LLAMA_SPLIT_ROW     = 2, // split rows across GPUs
+# };
+LLAMA_SPLIT_NONE = 0
+LLAMA_SPLIT_LAYER = 1
+LLAMA_SPLIT_ROW = 2
 # typedef struct llama_token_data {
 #     llama_token id; // token id
@@ -360,13 +374,22 @@ class llama_model_kv_override(Structure):
 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
-#     int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-#     const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+#     enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+#     // main_gpu interpretation depends on split_mode:
+#     // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+#     // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+#     // LLAMA_SPLIT_LAYER: ignored
+#     int32_t main_gpu;
+#     // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+#     const float * tensor_split;
 #     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 #     // If the provided progress_callback returns true, model loading continues.
 #     // If it returns false, model loading is immediately aborted.
 #     llama_progress_callback progress_callback;
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
@@ -384,8 +407,9 @@ class llama_model_params(Structure):
     Attributes:
         n_gpu_layers (int): number of layers to store in VRAM
-        main_gpu (int): the GPU that is used for scratch and small tensors
-        tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        split_mode (int): how to split the model across multiple GPUs
+        main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -395,6 +419,7 @@ class llama_model_params(Structure):
     _fields_ = [
         ("n_gpu_layers", c_int32),
+        ("split_mode", c_int),
         ("main_gpu", c_int32),
         ("tensor_split", c_float_p),
         ("progress_callback", llama_progress_callback),
@@ -503,6 +528,7 @@ It might not exist for progress report where '.' is output repeatedly."""
 #     bool quantize_output_tensor; // quantize output.weight
 #     bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 #     bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+#     void * imatrix;              // pointer to importance matrix data
 # } llama_model_quantize_params;
 class llama_model_quantize_params(Structure):
     """Parameters for llama_model_quantize
@@ -514,6 +540,7 @@ class llama_model_quantize_params(Structure):
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): disable k-quant mixtures and quantize all tensors to the same type
+        imatrix (ctypes.c_void_p): pointer to importance matrix data
     """
     _fields_ = [
@@ -522,6 +549,8 @@ class llama_model_quantize_params(Structure):
         ("allow_requantize", c_bool),
         ("quantize_output_tensor", c_bool),
         ("only_copy", c_bool),
+        ("pure", c_bool),
+        ("imatrix", c_void_p),
     ]
@@ -1933,14 +1962,39 @@ _lib.llama_sample_repetition_penalties.restype = None
 # /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-# LLAMA_API void llama_sample_classifier_free_guidance(
-#             struct llama_context * ctx,
+# /// @param logits Logits extracted from the original generation context.
+# /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# LLAMA_API void llama_sample_apply_guidance(
+#           struct llama_context * ctx,
+#                          float * logits,
+#                          float * logits_guidance,
+#                          float   scale);
+def llama_sample_apply_guidance(
+    ctx: llama_context_p,
+    logits,  # type: _Pointer[c_float]
+    logits_guidance,  # type: _Pointer[c_float]
+    scale: Union[c_float, float],
+):
+    """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806"""
+    return _lib.llama_sample_apply_guidance(ctx, logits, logits_guidance, scale)
+_lib.llama_sample_apply_guidance.argtypes = [
+    llama_context_p,
+    c_float_p,
+    c_float_p,
+    c_float,
+]
+_lib.llama_sample_apply_guidance.restype = None
+# LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
+#           struct llama_context * ctx,
 #         llama_token_data_array * candidates,
-#             struct llama_context * guidance_ctx,
-#                             float   scale);
+#           struct llama_context * guidance_ctx,
+#                          float   scale),
+#           "use llama_sample_apply_guidance() instead");
 def llama_sample_classifier_free_guidance(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]

{llama_cpp_python-0.2.28 → llama_cpp_python-0.2.29}/llama_cpp/llama_grammar.py RENAMED Viewed

@@ -1433,7 +1433,6 @@ class SchemaConverter:
     def visit(self, schema: Dict[str, Any], name: str) -> str:
         schema_type: Optional[str] = schema.get("type") # type: ignore
-        assert isinstance(schema_type, str), f"Unrecognized schema: {schema}"
         rule_name = name or "root"
         if "$defs" in schema:

llama-cpp-python 0.2.28__tar.gz → 0.2.29__tar.gz

llama-cpp-python 0.2.28__tar.gz → 0.2.29__tar.gz