llama-cpp-python 0.1.7__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/PKG-INFO +1 -1
  2. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/README.md +1 -0
  3. llama_cpp_python-0.1.9/llama_cpp/llama.py +343 -0
  4. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/llama_cpp_python.egg-info/PKG-INFO +1 -1
  5. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/pyproject.toml +1 -1
  6. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/setup.py +1 -1
  7. llama_cpp_python-0.1.7/llama_cpp/llama.py +0 -216
  8. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/.gitignore +0 -0
  9. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/.gitmodules +0 -0
  10. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/CMakeLists.txt +0 -0
  11. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/LICENSE.md +0 -0
  12. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/_skbuild/linux-x86_64-3.8/cmake-install/llama_cpp/libllama.so +0 -0
  13. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/llama_cpp/__init__.py +0 -0
  14. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/llama_cpp/llama_cpp.py +0 -0
  15. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/llama_cpp_python.egg-info/SOURCES.txt +0 -0
  16. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/llama_cpp_python.egg-info/dependency_links.txt +0 -0
  17. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/llama_cpp_python.egg-info/top_level.txt +0 -0
  18. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/poetry.lock +0 -0
  19. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/setup.cfg +0 -0
  20. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/.devops/full.Dockerfile +0 -0
  21. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/.devops/main.Dockerfile +0 -0
  22. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/.devops/tools.sh +0 -0
  23. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/.dockerignore +0 -0
  24. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md +0 -0
  25. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/.github/workflows/build.yml +0 -0
  26. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/.github/workflows/docker.yml +0 -0
  27. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/.gitignore +0 -0
  28. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/CMakeLists.txt +0 -0
  29. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/LICENSE +0 -0
  30. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/Makefile +0 -0
  31. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/README.md +0 -0
  32. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/SHA256SUMS +0 -0
  33. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/convert-gptq-to-ggml.py +0 -0
  34. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/convert-pth-to-ggml.py +0 -0
  35. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/flake.lock +0 -0
  36. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/flake.nix +0 -0
  37. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/ggml.c +0 -0
  38. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/ggml.h +0 -0
  39. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/llama.cpp +0 -0
  40. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/llama.h +0 -0
  41. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/models/ggml-vocab.bin +0 -0
  42. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/prompts/alpaca.txt +0 -0
  43. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/prompts/chat-with-bob.txt +0 -0
  44. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/quantize.py +0 -0
  45. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/tests/CMakeLists.txt +0 -0
  46. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/tests/test-quantize.c +0 -0
  47. {llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/vendor/llama.cpp/tests/test-tokenizer-0.cpp +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama_cpp_python
3
- Version: 0.1.7
3
+ Version: 0.1.9
4
4
  Summary: A Python wrapper for llama.cpp
5
5
  Author: Andrei Betlen
6
6
  Author-email: abetlen@gmail.com
@@ -1,5 +1,6 @@
1
1
  # 🦙 Python Bindings for `llama.cpp`
2
2
 
3
+ [![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python)
3
4
  [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
4
5
  [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
5
6
  [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
@@ -0,0 +1,343 @@
1
+ import os
2
+ import uuid
3
+ import time
4
+ import multiprocessing
5
+ from typing import List, Optional
6
+ from collections import deque
7
+
8
+ from . import llama_cpp
9
+
10
+
11
+ class Llama:
12
+ """High-level Python wrapper for a llama.cpp model."""
13
+
14
+ def __init__(
15
+ self,
16
+ model_path: str,
17
+ # NOTE: The following parameters are likely to change in the future.
18
+ n_ctx: int = 512,
19
+ n_parts: int = -1,
20
+ seed: int = 1337,
21
+ f16_kv: bool = False,
22
+ logits_all: bool = False,
23
+ vocab_only: bool = False,
24
+ use_mlock: bool = False,
25
+ embedding: bool = False,
26
+ n_threads: Optional[int] = None,
27
+ ) -> "Llama":
28
+ """Load a llama.cpp model from `model_path`.
29
+
30
+ Args:
31
+ model_path: Path to the model.
32
+ n_ctx: Maximum context size.
33
+ n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
34
+ seed: Random seed. 0 for random.
35
+ f16_kv: Use half-precision for key/value cache.
36
+ logits_all: Return logits for all tokens, not just the last token.
37
+ vocab_only: Only load the vocabulary no weights.
38
+ use_mlock: Force the system to keep the model in RAM.
39
+ embedding: Embedding mode only.
40
+ n_threads: Number of threads to use. If None, the number of threads is automatically determined.
41
+
42
+ Raises:
43
+ ValueError: If the model path does not exist.
44
+
45
+ Returns:
46
+ A Llama instance.
47
+ """
48
+ self.model_path = model_path
49
+
50
+ self.params = llama_cpp.llama_context_default_params()
51
+ self.params.n_ctx = n_ctx
52
+ self.params.n_parts = n_parts
53
+ self.params.seed = seed
54
+ self.params.f16_kv = f16_kv
55
+ self.params.logits_all = logits_all
56
+ self.params.vocab_only = vocab_only
57
+ self.params.use_mlock = use_mlock
58
+ self.params.embedding = embedding
59
+
60
+ self.last_n = 64
61
+ self.max_chunk_size = n_ctx
62
+
63
+ self.n_threads = n_threads or multiprocessing.cpu_count()
64
+
65
+ if not os.path.exists(model_path):
66
+ raise ValueError(f"Model path does not exist: {model_path}")
67
+
68
+ self.ctx = llama_cpp.llama_init_from_file(
69
+ self.model_path.encode("utf-8"), self.params
70
+ )
71
+
72
+ def tokenize(self, text: bytes) -> List[int]:
73
+ """Tokenize a string.
74
+
75
+ Args:
76
+ text: The utf-8 encoded string to tokenize.
77
+
78
+ Returns:
79
+ A list of tokens.
80
+ """
81
+ n_ctx = llama_cpp.llama_n_ctx(self.ctx)
82
+ tokens = (llama_cpp.llama_token * n_ctx)()
83
+ n_tokens = llama_cpp.llama_tokenize(
84
+ self.ctx,
85
+ text,
86
+ tokens,
87
+ n_ctx,
88
+ True,
89
+ )
90
+ if n_tokens < 0:
91
+ raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
92
+ return list(tokens[:n_tokens])
93
+
94
+ def detokenize(self, tokens: List[int]) -> bytes:
95
+ """Detokenize a list of tokens.
96
+
97
+ Args:
98
+ tokens: The list of tokens to detokenize.
99
+
100
+ Returns:
101
+ The detokenized string.
102
+ """
103
+ output = b""
104
+ for token in tokens:
105
+ output += llama_cpp.llama_token_to_str(self.ctx, token)
106
+ return output
107
+
108
+ def embed(self, text: str):
109
+ """Embed a string.
110
+
111
+ Args:
112
+ text: The utf-8 encoded string to embed.
113
+
114
+ Returns:
115
+ A list of embeddings.
116
+ """
117
+ tokens = self.tokenize(text.encode("utf-8"))
118
+ self._eval(tokens, 0)
119
+ embeddings = llama_cpp.llama_get_embeddings(self.ctx)
120
+ return embeddings[:llama_cpp.llama_n_embd(self.ctx)]
121
+
122
+ def _eval(self, tokens: List[int], n_past):
123
+ rc = llama_cpp.llama_eval(
124
+ self.ctx,
125
+ (llama_cpp.llama_token * len(tokens))(*tokens),
126
+ len(tokens),
127
+ n_past,
128
+ self.n_threads,
129
+ )
130
+ if rc != 0:
131
+ raise RuntimeError(f"Failed to evaluate: {rc}")
132
+
133
+ def _sample(self, last_n_tokens, top_p, top_k, temp, repeat_penalty):
134
+ return llama_cpp.llama_sample_top_p_top_k(
135
+ self.ctx,
136
+ (llama_cpp.llama_token * len(last_n_tokens))(*last_n_tokens),
137
+ len(last_n_tokens),
138
+ top_k=top_k,
139
+ top_p=top_p,
140
+ temp=temp,
141
+ repeat_penalty=repeat_penalty,
142
+ )
143
+
144
+ def _generate(self, past_tokens, max_tokens, top_p, top_k, temp, repeat_penalty):
145
+ last_n_tokens = deque([0] * self.last_n, maxlen=self.last_n)
146
+ last_n_tokens.extend(past_tokens)
147
+ for i in range(max_tokens):
148
+ token = self._sample(
149
+ last_n_tokens,
150
+ top_p=top_p,
151
+ top_k=top_k,
152
+ temp=temp,
153
+ repeat_penalty=repeat_penalty,
154
+ )
155
+ yield token
156
+ self._eval([token], len(past_tokens) + i)
157
+
158
+ def _call(
159
+ self,
160
+ prompt: str,
161
+ suffix: Optional[str] = None,
162
+ max_tokens: int = 16,
163
+ temperature: float = 0.8,
164
+ top_p: float = 0.95,
165
+ logprobs: Optional[int] = None,
166
+ echo: bool = False,
167
+ stop: List[str] = [],
168
+ repeat_penalty: float = 1.1,
169
+ top_k: int = 40,
170
+ stream: bool = False,
171
+ ):
172
+ completion_id = f"cmpl-{str(uuid.uuid4())}"
173
+ created = int(time.time())
174
+ completion_tokens = []
175
+ prompt_tokens = self.tokenize(prompt.encode("utf-8"))
176
+
177
+ if len(prompt_tokens) + max_tokens > llama_cpp.llama_n_ctx(self.ctx):
178
+ raise ValueError(
179
+ f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
180
+ )
181
+
182
+ # Process prompt in chunks to avoid running out of memory
183
+ for i in range(0, len(prompt_tokens), self.max_chunk_size):
184
+ chunk = prompt_tokens[i : min(len(prompt_tokens), i + self.max_chunk_size)]
185
+ self._eval(chunk, n_past=i)
186
+
187
+ if stop is not None:
188
+ stop = [s.encode("utf-8") for s in stop]
189
+
190
+ finish_reason = None
191
+ for token in self._generate(
192
+ prompt_tokens, max_tokens, top_p, top_k, temperature, repeat_penalty
193
+ ):
194
+ if token == llama_cpp.llama_token_eos():
195
+ finish_reason = "stop"
196
+ break
197
+ completion_tokens.append(token)
198
+
199
+ text = self.detokenize(completion_tokens)
200
+ any_stop = [s for s in stop if s in text]
201
+ if len(any_stop) > 0:
202
+ first_stop = any_stop[0]
203
+ text = text[: text.index(first_stop)]
204
+ finish_reason = "stop"
205
+ break
206
+
207
+ if stream:
208
+ start = len(self.detokenize(completion_tokens[:-1]))
209
+ longest = 0
210
+ for s in stop:
211
+ for i in range(len(s), 0, -1):
212
+ if s[-i:] == text[-i:]:
213
+ if i > longest:
214
+ longest = i
215
+ break
216
+ yield {
217
+ "id": completion_id,
218
+ "object": "text_completion",
219
+ "created": created,
220
+ "model": self.model_path,
221
+ "choices": [
222
+ {
223
+ "text": text[start : len(text) - longest].decode("utf-8"),
224
+ "index": 0,
225
+ "logprobs": None,
226
+ "finish_reason": None,
227
+ }
228
+ ],
229
+ }
230
+
231
+ if finish_reason is None:
232
+ finish_reason = "length"
233
+
234
+ if stream:
235
+ if finish_reason == "stop":
236
+ start = len(self.detokenize(completion_tokens[:-1]))
237
+ text = text[start:].decode("utf-8")
238
+ else:
239
+ text = ""
240
+ yield {
241
+ "id": completion_id,
242
+ "object": "text_completion",
243
+ "created": created,
244
+ "model": self.model_path,
245
+ "choices": [
246
+ {
247
+ "text": text,
248
+ "index": 0,
249
+ "logprobs": None,
250
+ "finish_reason": finish_reason,
251
+ }
252
+ ],
253
+ }
254
+ return
255
+
256
+ text = text.decode("utf-8")
257
+
258
+ if echo:
259
+ text = prompt + text
260
+
261
+ if suffix is not None:
262
+ text = text + suffix
263
+
264
+ if logprobs is not None:
265
+ logprobs = llama_cpp.llama_get_logits(
266
+ self.ctx,
267
+ )[:logprobs]
268
+
269
+ yield {
270
+ "id": completion_id,
271
+ "object": "text_completion",
272
+ "created": created,
273
+ "model": self.model_path,
274
+ "choices": [
275
+ {
276
+ "text": text,
277
+ "index": 0,
278
+ "logprobs": logprobs,
279
+ "finish_reason": finish_reason,
280
+ }
281
+ ],
282
+ "usage": {
283
+ "prompt_tokens": len(prompt_tokens),
284
+ "completion_tokens": len(completion_tokens),
285
+ "total_tokens": len(prompt_tokens) + len(completion_tokens),
286
+ },
287
+ }
288
+
289
+ def __call__(
290
+ self,
291
+ prompt: str,
292
+ suffix: Optional[str] = None,
293
+ max_tokens: int = 16,
294
+ temperature: float = 0.8,
295
+ top_p: float = 0.95,
296
+ logprobs: Optional[int] = None,
297
+ echo: bool = False,
298
+ stop: List[str] = [],
299
+ repeat_penalty: float = 1.1,
300
+ top_k: int = 40,
301
+ stream: bool = False,
302
+ ):
303
+ """Generate text from a prompt.
304
+
305
+ Args:
306
+ prompt: The prompt to generate text from.
307
+ suffix: A suffix to append to the generated text. If None, no suffix is appended.
308
+ max_tokens: The maximum number of tokens to generate.
309
+ temperature: The temperature to use for sampling.
310
+ top_p: The top-p value to use for sampling.
311
+ logprobs: The number of logprobs to return. If None, no logprobs are returned.
312
+ echo: Whether to echo the prompt.
313
+ stop: A list of strings to stop generation when encountered.
314
+ repeat_penalty: The penalty to apply to repeated tokens.
315
+ top_k: The top-k value to use for sampling.
316
+ stream: Whether to stream the results.
317
+
318
+ Raises:
319
+ ValueError: If the requested tokens exceed the context window.
320
+ RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
321
+
322
+ Returns:
323
+ Response object containing the generated text.
324
+ """
325
+ call = self._call(
326
+ prompt=prompt,
327
+ suffix=suffix,
328
+ max_tokens=max_tokens,
329
+ temperature=temperature,
330
+ top_p=top_p,
331
+ logprobs=logprobs,
332
+ echo=echo,
333
+ stop=stop,
334
+ repeat_penalty=repeat_penalty,
335
+ top_k=top_k,
336
+ stream=stream,
337
+ )
338
+ if stream:
339
+ return call
340
+ return next(call)
341
+
342
+ def __del__(self):
343
+ llama_cpp.llama_free(self.ctx)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama-cpp-python
3
- Version: 0.1.7
3
+ Version: 0.1.9
4
4
  Summary: A Python wrapper for llama.cpp
5
5
  Author: Andrei Betlen
6
6
  Author-email: abetlen@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "llama_cpp"
3
- version = "0.1.7"
3
+ version = "0.1.9"
4
4
  description = "Python bindings for the llama.cpp library"
5
5
  authors = ["Andrei Betlen <abetlen@gmail.com>"]
6
6
  license = "MIT"
@@ -3,7 +3,7 @@ from skbuild import setup
3
3
  setup(
4
4
  name="llama_cpp_python",
5
5
  description="A Python wrapper for llama.cpp",
6
- version="0.1.7",
6
+ version="0.1.9",
7
7
  author="Andrei Betlen",
8
8
  author_email="abetlen@gmail.com",
9
9
  license="MIT",
@@ -1,216 +0,0 @@
1
- import os
2
- import uuid
3
- import time
4
- import multiprocessing
5
- from typing import List, Optional
6
-
7
- from . import llama_cpp
8
-
9
-
10
- class Llama:
11
- """High-level Python wrapper for a llama.cpp model."""
12
-
13
- def __init__(
14
- self,
15
- model_path: str,
16
- # NOTE: The following parameters are likely to change in the future.
17
- n_ctx: int = 512,
18
- n_parts: int = -1,
19
- seed: int = 1337,
20
- f16_kv: bool = False,
21
- logits_all: bool = False,
22
- vocab_only: bool = False,
23
- use_mlock: bool = False,
24
- embedding: bool = False,
25
- n_threads: Optional[int] = None,
26
- ) -> "Llama":
27
- """Load a llama.cpp model from `model_path`.
28
-
29
- Args:
30
- model_path: Path to the model.
31
- n_ctx: Maximum context size.
32
- n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
33
- seed: Random seed. 0 for random.
34
- f16_kv: Use half-precision for key/value cache.
35
- logits_all: Return logits for all tokens, not just the last token.
36
- vocab_only: Only load the vocabulary no weights.
37
- use_mlock: Force the system to keep the model in RAM.
38
- embedding: Embedding mode only.
39
- n_threads: Number of threads to use. If None, the number of threads is automatically determined.
40
-
41
- Raises:
42
- ValueError: If the model path does not exist.
43
-
44
- Returns:
45
- A Llama instance.
46
- """
47
- self.model_path = model_path
48
-
49
- self.last_n = 64
50
- self.max_chunk_size = 32
51
-
52
- self.params = llama_cpp.llama_context_default_params()
53
- self.params.n_ctx = n_ctx
54
- self.params.n_parts = n_parts
55
- self.params.seed = seed
56
- self.params.f16_kv = f16_kv
57
- self.params.logits_all = logits_all
58
- self.params.vocab_only = vocab_only
59
- self.params.use_mlock = use_mlock
60
- self.params.embedding = embedding
61
-
62
- self.n_threads = n_threads or multiprocessing.cpu_count()
63
-
64
- self.tokens = (llama_cpp.llama_token * self.params.n_ctx)()
65
-
66
- if not os.path.exists(model_path):
67
- raise ValueError(f"Model path does not exist: {model_path}")
68
-
69
- self.ctx = llama_cpp.llama_init_from_file(
70
- self.model_path.encode("utf-8"), self.params
71
- )
72
-
73
- def __call__(
74
- self,
75
- prompt: str,
76
- suffix: Optional[str] = None,
77
- max_tokens: int = 16,
78
- temperature: float = 0.8,
79
- top_p: float = 0.95,
80
- logprobs: Optional[int] = None,
81
- echo: bool = False,
82
- stop: List[str] = [],
83
- repeat_penalty: float = 1.1,
84
- top_k: int = 40,
85
- ):
86
- """Generate text from a prompt.
87
-
88
- Args:
89
- prompt: The prompt to generate text from.
90
- suffix: A suffix to append to the generated text. If None, no suffix is appended.
91
- max_tokens: The maximum number of tokens to generate.
92
- temperature: The temperature to use for sampling.
93
- top_p: The top-p value to use for sampling.
94
- logprobs: The number of logprobs to return. If None, no logprobs are returned.
95
- echo: Whether to echo the prompt.
96
- stop: A list of strings to stop generation when encountered.
97
- repeat_penalty: The penalty to apply to repeated tokens.
98
- top_k: The top-k value to use for sampling.
99
-
100
- Raises:
101
- ValueError: If the requested tokens exceed the context window.
102
- RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
103
-
104
- Returns:
105
- Response object containing the generated text.
106
- """
107
- text = b""
108
- finish_reason = "length"
109
- completion_tokens = 0
110
-
111
- if stop is not None:
112
- stop = [s.encode("utf-8") for s in stop]
113
-
114
- prompt_tokens = llama_cpp.llama_tokenize(
115
- self.ctx,
116
- prompt.encode("utf-8"),
117
- self.tokens,
118
- llama_cpp.llama_n_ctx(self.ctx),
119
- True,
120
- )
121
- if prompt_tokens < 0:
122
- raise RuntimeError(f"Failed to tokenize prompt: {prompt_tokens}")
123
-
124
- if prompt_tokens + max_tokens > self.params.n_ctx:
125
- raise ValueError(
126
- f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
127
- )
128
-
129
- # Process prompt in chunks to avoid running out of memory
130
- for i in range(0, prompt_tokens, self.max_chunk_size):
131
- chunk = self.tokens[i : min(prompt_tokens, i + self.max_chunk_size)]
132
- rc = llama_cpp.llama_eval(
133
- self.ctx,
134
- (llama_cpp.llama_token * len(chunk))(*chunk),
135
- len(chunk),
136
- max(0, i - 1),
137
- self.n_threads,
138
- )
139
- if rc != 0:
140
- raise RuntimeError(f"Failed to evaluate prompt: {rc}")
141
-
142
- for i in range(max_tokens):
143
- tokens_seen = prompt_tokens + completion_tokens
144
- last_n_tokens = [0] * max(0, self.last_n - tokens_seen) + [
145
- self.tokens[j]
146
- for j in range(max(tokens_seen - self.last_n, 0), tokens_seen)
147
- ]
148
-
149
- token = llama_cpp.llama_sample_top_p_top_k(
150
- self.ctx,
151
- (llama_cpp.llama_token * len(last_n_tokens))(*last_n_tokens),
152
- len(last_n_tokens),
153
- top_k=top_k,
154
- top_p=top_p,
155
- temp=temperature,
156
- repeat_penalty=repeat_penalty,
157
- )
158
- if token == llama_cpp.llama_token_eos():
159
- finish_reason = "stop"
160
- break
161
- text += llama_cpp.llama_token_to_str(self.ctx, token)
162
- self.tokens[prompt_tokens + i] = token
163
- completion_tokens += 1
164
-
165
- any_stop = [s for s in stop if s in text]
166
- if len(any_stop) > 0:
167
- first_stop = any_stop[0]
168
- text = text[: text.index(first_stop)]
169
- finish_reason = "stop"
170
- break
171
-
172
- rc = llama_cpp.llama_eval(
173
- self.ctx,
174
- (llama_cpp.llama_token * 1)(self.tokens[prompt_tokens + i]),
175
- 1,
176
- prompt_tokens + completion_tokens,
177
- self.n_threads,
178
- )
179
- if rc != 0:
180
- raise RuntimeError(f"Failed to evaluate next token: {rc}")
181
-
182
- text = text.decode("utf-8")
183
-
184
- if echo:
185
- text = prompt + text
186
-
187
- if suffix is not None:
188
- text = text + suffix
189
-
190
- if logprobs is not None:
191
- logprobs = llama_cpp.llama_get_logits(
192
- self.ctx,
193
- )[:logprobs]
194
-
195
- return {
196
- "id": f"cmpl-{str(uuid.uuid4())}", # Likely to change
197
- "object": "text_completion",
198
- "created": int(time.time()),
199
- "model": self.model_path,
200
- "choices": [
201
- {
202
- "text": text,
203
- "index": 0,
204
- "logprobs": logprobs,
205
- "finish_reason": finish_reason,
206
- }
207
- ],
208
- "usage": {
209
- "prompt_tokens": prompt_tokens,
210
- "completion_tokens": completion_tokens,
211
- "total_tokens": prompt_tokens + completion_tokens,
212
- },
213
- }
214
-
215
- def __del__(self):
216
- llama_cpp.llama_free(self.ctx)