llama-cpp-python 0.1.14__tar.gz → 0.1.16__tar.gz

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/PKG-INFO +1 -1
  2. llama_cpp_python-0.1.16/_skbuild/linux-x86_64-3.8/cmake-install/llama_cpp/libllama.so +0 -0
  3. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp/llama.py +21 -16
  4. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp/llama_cpp.py +31 -1
  5. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp_python.egg-info/PKG-INFO +1 -1
  6. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/pyproject.toml +1 -1
  7. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/setup.py +1 -1
  8. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/Makefile +2 -89
  9. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/README.md +4 -2
  10. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/ggml.c +346 -345
  11. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/ggml.h +26 -26
  12. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/llama.cpp +31 -4
  13. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/llama.h +17 -0
  14. llama_cpp_python-0.1.14/_skbuild/linux-x86_64-3.8/cmake-install/llama_cpp/libllama.so +0 -0
  15. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/.gitignore +0 -0
  16. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/.gitmodules +0 -0
  17. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/CMakeLists.txt +0 -0
  18. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/LICENSE.md +0 -0
  19. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/README.md +0 -0
  20. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp/__init__.py +0 -0
  21. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp/llama_types.py +0 -0
  22. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp_python.egg-info/SOURCES.txt +0 -0
  23. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp_python.egg-info/dependency_links.txt +0 -0
  24. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp_python.egg-info/requires.txt +0 -0
  25. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp_python.egg-info/top_level.txt +0 -0
  26. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/poetry.lock +0 -0
  27. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/setup.cfg +0 -0
  28. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/.devops/full.Dockerfile +0 -0
  29. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/.devops/main.Dockerfile +0 -0
  30. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/.devops/tools.sh +0 -0
  31. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/.dockerignore +0 -0
  32. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md +0 -0
  33. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/.github/workflows/build.yml +0 -0
  34. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/.github/workflows/docker.yml +0 -0
  35. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/.gitignore +0 -0
  36. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/CMakeLists.txt +0 -0
  37. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/LICENSE +0 -0
  38. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/SHA256SUMS +0 -0
  39. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/convert-gptq-to-ggml.py +0 -0
  40. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/convert-pth-to-ggml.py +0 -0
  41. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/flake.lock +0 -0
  42. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/flake.nix +0 -0
  43. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/models/ggml-vocab.bin +0 -0
  44. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/prompts/alpaca.txt +0 -0
  45. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/prompts/chat-with-bob.txt +0 -0
  46. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/tests/CMakeLists.txt +0 -0
  47. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/tests/test-quantize.c +0 -0
  48. {llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/tests/test-tokenizer-0.cpp +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama_cpp_python
3
- Version: 0.1.14
3
+ Version: 0.1.16
4
4
  Summary: A Python wrapper for llama.cpp
5
5
  Author: Andrei Betlen
6
6
  Author-email: abetlen@gmail.com
@@ -286,6 +286,7 @@ class Llama:
286
286
  # Add blank space to start of prompt to match OG llama tokenizer
287
287
  prompt_tokens = self.tokenize(b" " + prompt.encode("utf-8"))
288
288
  text = b""
289
+ returned_characters = 0
289
290
 
290
291
  if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
291
292
  raise ValueError(
@@ -293,9 +294,9 @@ class Llama:
293
294
  )
294
295
 
295
296
  if stop != []:
296
- stop_bytes = [s.encode("utf-8") for s in stop]
297
+ stop_sequences = [s.encode("utf-8") for s in stop]
297
298
  else:
298
- stop_bytes = []
299
+ stop_sequences = []
299
300
 
300
301
  finish_reason = None
301
302
  for token in self.generate(
@@ -306,28 +307,33 @@ class Llama:
306
307
  repeat_penalty=repeat_penalty,
307
308
  ):
308
309
  if token == llama_cpp.llama_token_eos():
310
+ text = self.detokenize(completion_tokens)
309
311
  finish_reason = "stop"
310
312
  break
311
313
  completion_tokens.append(token)
312
314
 
313
- text = self.detokenize(completion_tokens)
314
- any_stop = [s for s in stop_bytes if s in text]
315
+ all_text = self.detokenize(completion_tokens)
316
+ any_stop = [s for s in stop_sequences if s in all_text]
315
317
  if len(any_stop) > 0:
316
318
  first_stop = any_stop[0]
317
- text = text[: text.index(first_stop)]
319
+ text = all_text[: all_text.index(first_stop)]
318
320
  finish_reason = "stop"
319
321
  break
320
322
 
321
323
  if stream:
322
- start = len(self.detokenize(completion_tokens[:-1]))
324
+ start = returned_characters
323
325
  longest = 0
324
- # TODO: Clean up this mess
325
- for s in stop_bytes:
326
+ # We want to avoid yielding any characters from
327
+ # the generated text if they are part of a stop
328
+ # sequence.
329
+ for s in stop_sequences:
326
330
  for i in range(len(s), 0, -1):
327
- if s[-i:] == text[-i:]:
331
+ if all_text.endswith(s[:i]):
328
332
  if i > longest:
329
333
  longest = i
330
334
  break
335
+ text = all_text[: len(all_text) - longest]
336
+ returned_characters += len(text[start:])
331
337
  yield {
332
338
  "id": completion_id,
333
339
  "object": "text_completion",
@@ -335,23 +341,22 @@ class Llama:
335
341
  "model": self.model_path,
336
342
  "choices": [
337
343
  {
338
- "text": text[start : len(text) - longest].decode("utf-8"),
344
+ "text": text[start :].decode("utf-8"),
339
345
  "index": 0,
340
346
  "logprobs": None,
341
347
  "finish_reason": None,
342
348
  }
343
349
  ],
344
350
  }
351
+ if len(completion_tokens) >= max_tokens:
352
+ text = self.detokenize(completion_tokens)
353
+ finish_reason = "length"
354
+ break
345
355
 
346
356
  if finish_reason is None:
347
357
  finish_reason = "length"
348
358
 
349
359
  if stream:
350
- if finish_reason == "stop":
351
- start = len(self.detokenize(completion_tokens[:-1]))
352
- text = text[start:].decode("utf-8")
353
- else:
354
- text = ""
355
360
  yield {
356
361
  "id": completion_id,
357
362
  "object": "text_completion",
@@ -359,7 +364,7 @@ class Llama:
359
364
  "model": self.model_path,
360
365
  "choices": [
361
366
  {
362
- "text": text,
367
+ "text": text[returned_characters:].decode("utf-8"),
363
368
  "index": 0,
364
369
  "logprobs": None,
365
370
  "finish_reason": finish_reason,
@@ -1,6 +1,6 @@
1
1
  import ctypes
2
2
 
3
- from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array
3
+ from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array, c_uint8, c_size_t
4
4
 
5
5
  import pathlib
6
6
  from itertools import chain
@@ -101,6 +101,36 @@ def llama_model_quantize(
101
101
  _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]
102
102
  _lib.llama_model_quantize.restype = c_int
103
103
 
104
+ # Returns the KV cache that will contain the context for the
105
+ # ongoing prediction with the model.
106
+ def llama_get_kv_cache(ctx: llama_context_p):
107
+ return _lib.llama_get_kv_cache(ctx)
108
+
109
+ _lib.llama_get_kv_cache.argtypes = [llama_context_p]
110
+ _lib.llama_get_kv_cache.restype = POINTER(c_uint8)
111
+
112
+ # Returns the size of the KV cache
113
+ def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t:
114
+ return _lib.llama_get_kv_cache_size(ctx)
115
+
116
+ _lib.llama_get_kv_cache_size.argtypes = [llama_context_p]
117
+ _lib.llama_get_kv_cache_size.restype = c_size_t
118
+
119
+ # Returns the number of tokens in the KV cache
120
+ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int:
121
+ return _lib.llama_get_kv_cache_token_count(ctx)
122
+
123
+ _lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
124
+ _lib.llama_get_kv_cache_token_count.restype = c_int
125
+
126
+
127
+ # Sets the KV cache containing the current context for the model
128
+ def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int):
129
+ return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count)
130
+
131
+ _lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int]
132
+ _lib.llama_set_kv_cache.restype = None
133
+
104
134
 
105
135
  # Run the llama inference to obtain the logits and probabilities for the next token.
106
136
  # tokens + n_tokens is the provided batch of new tokens to process
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama-cpp-python
3
- Version: 0.1.14
3
+ Version: 0.1.16
4
4
  Summary: A Python wrapper for llama.cpp
5
5
  Author: Andrei Betlen
6
6
  Author-email: abetlen@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "llama_cpp"
3
- version = "0.1.14"
3
+ version = "0.1.16"
4
4
  description = "Python bindings for the llama.cpp library"
5
5
  authors = ["Andrei Betlen <abetlen@gmail.com>"]
6
6
  license = "MIT"
@@ -10,7 +10,7 @@ setup(
10
10
  description="A Python wrapper for llama.cpp",
11
11
  long_description=long_description,
12
12
  long_description_content_type="text/markdown",
13
- version="0.1.14",
13
+ version="0.1.16",
14
14
  author="Andrei Betlen",
15
15
  author_email="abetlen@gmail.com",
16
16
  license="MIT",
@@ -70,95 +70,8 @@ endif
70
70
  # TODO: probably these flags need to be tweaked on some architectures
71
71
  # feel free to update the Makefile for your architecture and send a pull request or issue
72
72
  ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
73
- ifeq ($(UNAME_S),Darwin)
74
- F16C_M := $(shell sysctl machdep.cpu.features)
75
- ifneq (,$(findstring F16C,$(F16C_M)))
76
- CFLAGS += -mf16c
77
- endif
78
- AVX1_M := $(shell sysctl machdep.cpu.features)
79
- ifneq (,$(findstring FMA,$(AVX1_M)))
80
- CFLAGS += -mfma
81
- endif
82
- ifneq (,$(findstring AVX1.0,$(AVX1_M)))
83
- CFLAGS += -mavx
84
- endif
85
- AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
86
- ifneq (,$(findstring AVX2,$(AVX2_M)))
87
- CFLAGS += -mavx2
88
- endif
89
- else ifeq ($(UNAME_S),Linux)
90
- AVX1_M := $(shell grep "avx " /proc/cpuinfo)
91
- ifneq (,$(findstring avx,$(AVX1_M)))
92
- CFLAGS += -mavx
93
- endif
94
- AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
95
- ifneq (,$(findstring avx2,$(AVX2_M)))
96
- CFLAGS += -mavx2
97
- endif
98
- FMA_M := $(shell grep "fma " /proc/cpuinfo)
99
- ifneq (,$(findstring fma,$(FMA_M)))
100
- CFLAGS += -mfma
101
- endif
102
- F16C_M := $(shell grep "f16c " /proc/cpuinfo)
103
- ifneq (,$(findstring f16c,$(F16C_M)))
104
- CFLAGS += -mf16c
105
- endif
106
- SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
107
- ifneq (,$(findstring sse3,$(SSE3_M)))
108
- CFLAGS += -msse3
109
- endif
110
- AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
111
- ifneq (,$(findstring avx512f,$(AVX512F_M)))
112
- CFLAGS += -mavx512f
113
- endif
114
- AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
115
- ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
116
- CFLAGS += -mavx512bw
117
- endif
118
- AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
119
- ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
120
- CFLAGS += -mavx512dq
121
- endif
122
- AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
123
- ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
124
- CFLAGS += -mavx512vl
125
- endif
126
- AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
127
- ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
128
- CFLAGS += -mavx512cd
129
- endif
130
- AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
131
- ifneq (,$(findstring avx512er,$(AVX512ER_M)))
132
- CFLAGS += -mavx512er
133
- endif
134
- AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
135
- ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
136
- CFLAGS += -mavx512ifma
137
- endif
138
- AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
139
- ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
140
- CFLAGS += -mavx512pf
141
- endif
142
- else ifeq ($(UNAME_S),Haiku)
143
- AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
144
- ifneq (,$(findstring AVX,$(AVX1_M)))
145
- CFLAGS += -mavx
146
- endif
147
- AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
148
- ifneq (,$(findstring AVX2,$(AVX2_M)))
149
- CFLAGS += -mavx2
150
- endif
151
- FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
152
- ifneq (,$(findstring FMA,$(FMA_M)))
153
- CFLAGS += -mfma
154
- endif
155
- F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
156
- ifneq (,$(findstring F16C,$(F16C_M)))
157
- CFLAGS += -mf16c
158
- endif
159
- else
160
- CFLAGS += -mfma -mf16c -mavx -mavx2
161
- endif
73
+ # Use all CPU extensions that are available:
74
+ CFLAGS += -march=native -mtune=native
162
75
  endif
163
76
  ifneq ($(filter ppc64%,$(UNAME_M)),)
164
77
  POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
@@ -232,13 +232,15 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
232
232
 
233
233
  - Obtain the `gpt4all-lora-quantized.bin` model
234
234
  - It is distributed in the old `ggml` format which is now obsoleted
235
- - You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
235
+ - You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
236
+ convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
236
237
 
237
238
  ```bash
238
239
  python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
240
+ python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
239
241
  ```
240
242
 
241
- - You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models
243
+ - You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
242
244
  - The original model is saved in the same folder with a suffix `.orig`
243
245
 
244
246
  ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data