llama_cpp 0.12.5 → 0.12.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -61,6 +61,7 @@ extern "C" {
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
         LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
     };
 
     enum llama_token_type {
@@ -99,6 +100,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
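The two added file types extend the quantization choices reachable from the C API. A minimal sketch of requantizing a model to the new IQ4_NL type, assuming the llama_model_quantize() and llama_model_quantize_default_params() entry points declared elsewhere in this header; the file paths are placeholders:

    #include "llama.h"

    int main() {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; // new in this release

        // llama_model_quantize returns 0 on success.
        const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq4_nl.gguf", &qparams);
        return rc == 0 ? 0 : 1;
    }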
@@ -111,6 +114,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_pooling_type {
+        LLAMA_POOLING_NONE = 0,
+        LLAMA_POOLING_MEAN = 1,
+        LLAMA_POOLING_CLS = 2,
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_NONE = 0, // single GPU
         LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
@@ -235,6 +244,7 @@ extern "C" {
         bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embedding;   // embedding mode only
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
     };
 
     // model quantization parameters
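Together with the llama_pooling_type enum added above, this flag turns on per-sequence pooling of embedding results. A hedged sketch of a context configured for pooled embeddings, assuming the model-loading entry points declared elsewhere in this header (note the new no-argument llama_backend_init(), covered below); the model path is a placeholder:

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params());

        llama_context_params cparams = llama_context_default_params();
        cparams.embedding  = true; // embedding mode only
        cparams.do_pooling = true; // new: pool embedding results by sequence id

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... decode a batch here, then read the pooled vectors via
        // llama_get_embeddings_ith() (added further down in this diff) ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }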
@@ -296,6 +306,12 @@ extern "C" {
         int32_t n_eval;
     };
 
+    // used in chat template
+    typedef struct llama_chat_message {
+        const char * role;
+        const char * content;
+    } llama_chat_message;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -304,7 +320,10 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_backend_init(bool numa);
+    LLAMA_API void llama_backend_init(void);
+
+    //optional:
+    LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
 
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
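For callers this is a breaking change: the NUMA flag moves out of llama_backend_init() into the separate llama_numa_init(). A migration sketch; GGML_NUMA_STRATEGY_DISTRIBUTE is an assumed value of the ggml_numa_strategy enum from the bundled ggml header (not shown in this diff), so substitute whichever strategy applies:

    #include "llama.h"

    static void init_backend(void) {
        // 0.12.5 era: llama_backend_init(true); // numa = true
        llama_backend_init(); // no longer takes a flag

        // Optional, only when NUMA optimizations are wanted; the enum value
        // here is assumed from ggml.h rather than shown in this diff.
        llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);
    }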
@@ -627,6 +646,10 @@ extern "C" {
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
+    // Get the embeddings for the ith sequence
+    // llama_get_embeddings(ctx) + i*n_embd
+    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
+
     //
     // Vocab
     //
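A hedged sketch of reading the per-sequence vectors after decoding with do_pooling enabled; n_seq is a placeholder for however many sequences the batch contained, and the pointer arithmetic is exactly the equivalence stated in the header comment:

    #include "llama.h"

    #include <cstdio>

    // Print the first component of each sequence's pooled embedding.
    // Assumes ctx was created with embedding = true and do_pooling = true.
    static void print_pooled(llama_context * ctx, int32_t n_seq) {
        const int32_t n_embd = llama_n_embd(llama_get_model(ctx));
        for (int32_t i = 0; i < n_seq; ++i) {
            const float * emb = llama_get_embeddings_ith(ctx, i);
            // Equivalent, per the header: llama_get_embeddings(ctx) + i*n_embd
            std::printf("seq %d: emb[0] = %f (n_embd = %d)\n", i, emb[0], n_embd);
        }
    }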
@@ -683,6 +706,25 @@ extern "C" {
                                 char * buf,
                              int32_t   length);
 
+    /// Apply chat template. Inspired by hf apply_chat_template() on python.
+    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
+    /// @param chat Pointer to a list of multiple llama_chat_message
+    /// @param n_msg Number of llama_chat_message in this chat
+    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
+    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
+    /// @param length The size of the allocated buffer
+    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
+    LLAMA_API int32_t llama_chat_apply_template(
+              const struct llama_model * model,
+                            const char * tmpl,
+       const struct llama_chat_message * chat,
+                                  size_t   n_msg,
+                                    bool   add_ass,
+                                    char * buf,
+                                 int32_t   length);
+
     //
     // Grammar
     //
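A hedged usage sketch following the doc comment's re-alloc guidance; the messages are placeholders, and passing nullptr as tmpl selects the model's built-in template:

    #include "llama.h"

    #include <string>
    #include <vector>

    static std::string format_chat(const llama_model * model) {
        const llama_chat_message chat[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };
        const size_t n_msg = sizeof(chat) / sizeof(chat[0]);

        std::vector<char> buf(256);
        int32_t n = llama_chat_apply_template(model, nullptr, chat, n_msg,
                                              /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n > (int32_t) buf.size()) {
            // Per the @return note: grow the buffer and re-apply the template.
            buf.resize(n);
            n = llama_chat_apply_template(model, nullptr, chat, n_msg,
                                          true, buf.data(), (int32_t) buf.size());
        }
        return n > 0 ? std::string(buf.data(), (size_t) n) : std::string();
    }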
@@ -1,6 +1,6 @@
 ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
 GF_CC_IS_GCC = 1
-GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }')
 else
 GF_CC_IS_CLANG = 1
 ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
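The point of this rewrite, as far as the shown code tells: with the old `||` chain, a compiler whose -dumpfullversion exits successfully but prints nothing left GF_CC_VER empty, because the -dumpversion fallback never ran. The new recipe runs both probes unconditionally, the interposed echo guarantees their outputs land on separate lines, and awk's `/./ { ...; exit }` formats the first non-empty line, so the fallback output is always available when needed.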
@@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
         offset += 1;
         return result;
     }
-    else if (!(utf8[offset + 0] & 0x40)) {
+    if (!(utf8[offset + 0] & 0x40)) {
         throw std::invalid_argument("invalid character");
     }
-    else if (!(utf8[offset + 0] & 0x20)) {
-        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x20)) {
+        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
             throw std::invalid_argument("invalid character");
+        }
         auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
         offset += 2;
         return result;
     }
-    else if (!(utf8[offset + 0] & 0x10)) {
-        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x10)) {
+        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
             throw std::invalid_argument("invalid character");
+        }
         auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
         offset += 3;
         return result;
     }
-    else if (!(utf8[offset + 0] & 0x08)) {
-        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x08)) {
+        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
+        }
         auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
         offset += 4;
         return result;
@@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
         offset += 1;
         return result;
     }
-    else {
-        if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
-            throw std::invalid_argument("invalid character");
-        auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
-        offset += 2;
-        return result;
+
+    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
+        throw std::invalid_argument("invalid character");
     }
-    throw std::invalid_argument("invalid string");
+
+    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
+    offset += 2;
+    return result;
 }
 
 static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
     std::vector<uint32_t> result;
     size_t offset = 0;
-    while (offset < utf16.size())
+    while (offset < utf16.size()) {
         result.push_back(codepoint_from_utf16(utf16, offset));
+    }
     return result;
 }
 
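Beyond the brace cleanups, this hunk flattens an if/else whose else branch always returned or threw, which lets the dead "invalid string" throw after it be dropped. The surrogate-pair arithmetic itself is unchanged; a small standalone check of that formula (the emoji codepoint is just an illustrative value):

    #include <cassert>
    #include <cstdint>

    // U+1F600 is encoded in UTF-16 as the surrogate pair 0xD83D 0xDE00.
    // Recombine it exactly as codepoint_from_utf16 does above.
    int main() {
        const uint16_t hi = 0xD83D; // high surrogate
        const uint16_t lo = 0xDE00; // low surrogate
        const uint32_t cp = 0x10000 + (((hi & 0x03ff) << 10) | (lo & 0x03ff));
        assert(cp == 0x1F600);
        return 0;
    }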
@@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
 static std::unordered_map<uint32_t, int> codepoint_type_map() {
     std::unordered_map<uint32_t, int> codepoint_types;
     for (auto p : digit_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
+        }
     }
-    for(auto p : letter_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : letter_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_LETTER;
+        }
     }
-    for(auto p : whitespace_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : whitespace_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
+        }
     }
-    for(auto p : accent_mark_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : accent_mark_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
+        }
     }
-    for(auto p : punctuation_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : punctuation_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
+        }
     }
-    for (auto p : symbol_ranges) {
-        for (auto i = p.first; i <= p.second; ++i)
+    for (auto p : symbol_ranges) {
+        for (auto i = p.first; i <= p.second; ++i) {
             codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
+        }
     }
-    for(auto p : control_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : control_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
+        }
     }
     return codepoint_types;
 }
 
 static int codepoint_type(uint32_t cp) {
     static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
-    return codepoint_types[cp];
+    return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
 }
 
 static int codepoint_type(const std::string & utf8) {
-    if (utf8.length() == 0)
+    if (utf8.length() == 0) {
         return CODEPOINT_TYPE_UNIDENTIFIED;
+    }
     size_t offset = 0;
     return codepoint_type(codepoint_from_utf8(utf8, offset));
 }
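The codepoint_type(uint32_t) change is a real fix, not just style: std::unordered_map::operator[] default-inserts a missing key, so every lookup of an unmapped codepoint used to grow (and mutate) the static table, while the new find()-based lookup is read-only and makes the CODEPOINT_TYPE_UNIDENTIFIED fallback explicit. A standalone illustration of that operator[] behavior:

    #include <cassert>
    #include <cstdint>
    #include <unordered_map>

    int main() {
        std::unordered_map<uint32_t, int> m;

        int v = m[0x10FFFF];               // operator[] inserts {0x10FFFF, 0} as a side effect
        assert(v == 0 && m.size() == 1);

        assert(m.find(0xBEEF) == m.end()); // find() is a pure lookup: no insertion
        assert(m.size() == 1);
        return 0;
    }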
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.5
+  version: 0.12.7
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-02-09 00:00:00.000000000 Z
+date: 2024-02-24 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: