llama_cpp 0.10.3 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/LICENSE.txt +1 -1
- data/ext/llama_cpp/extconf.rb +35 -110
- data/ext/llama_cpp/llama_cpp.cpp +52 -28
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -1
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/Makefile +758 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.c +6 -2
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.cu +73 -63
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-impl.h +1 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.m +43 -20
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.metal +464 -245
- data/vendor/tmp/llama.cpp/ggml-opencl.h +25 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.c +61 -57
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.c +171 -5
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml.h +1 -0
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.cpp +222 -105
- data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.h +31 -32
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +38 -0
- metadata +30 -27
- data/ext/llama_cpp/src/ggml-opencl.h +0 -25
- data/ext/llama_cpp/src/llama-util.h +0 -546
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/LICENSE +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.c +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-alloc.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend-impl.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-backend.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-cuda.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-metal.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.c +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-mpi.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-opencl.cpp +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/ggml-quants.h +0 -0
- /data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/unicode.h +0 -0
data/{ext/llama_cpp/src → vendor/tmp/llama.cpp}/llama.h
RENAMED
@@ -226,7 +226,7 @@ extern "C" {
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-    int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+    int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
     enum llama_ftype ftype;      // quantize to this llama_ftype
     bool allow_requantize;       // allow quantizing non-f32/f16 tensors
     bool quantize_output_tensor; // quantize output.weight
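As the comment above notes, nthread falls back to std::thread::hardware_concurrency() when it is zero or negative. The following is a minimal C sketch (not part of the gem or of llama.cpp) of driving this entry point with the default parameters; the GGUF file names are placeholders.

#include <stdio.h>
#include "llama.h"

// Sketch only: quantize a GGUF model with the default quantization parameters.
// The input/output file names below are illustrative, not taken from the diff.
int main(void) {
    llama_backend_init(/*numa*/ false);

    struct llama_model_quantize_params params = llama_model_quantize_default_params();
    params.nthread = 0;                       // <= 0: use std::thread::hardware_concurrency()
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_0; // quantize to 4-bit

    // Per this release, llama_model_quantize now returns uint32_t (0 on success).
    uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_0.gguf", &params);
    if (rc != 0) {
        fprintf(stderr, "quantization failed (%u)\n", rc);
    }

    llama_backend_free();
    return rc == 0 ? 0 : 1;
}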
@@ -310,21 +310,20 @@
 
     LLAMA_API int64_t llama_time_us(void);
 
-    LLAMA_API int llama_max_devices(void);
+    LLAMA_API int32_t llama_max_devices(void);
     LLAMA_API bool llama_mmap_supported (void);
     LLAMA_API bool llama_mlock_supported(void);
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
-    // TODO: become more consistent with returned int types across the API
     LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 
-    LLAMA_API int llama_n_vocab (const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int llama_n_embd (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -335,19 +334,19 @@
     // - GGUF array values are not supported by these functions
 
     // Get metadata value as a string by key name
-    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+    LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
 
     // Get the number of metadata key/value pairs
-    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
 
     // Get metadata key name by index
-    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+    LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 
     // Get metadata value as a string by index
-    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+    LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
 
     // Get a string describing the model type
-    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
@@ -359,7 +358,7 @@
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
 
     // Returns 0 on success
-    LLAMA_API int llama_model_quantize(
+    LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
             const llama_model_quantize_params * params);
@@ -370,20 +369,20 @@
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
             struct llama_context * ctx,
             const char * path_lora,
             float scale,
            const char * path_base_model,
-            int n_threads),
+            int32_t n_threads),
            "use llama_model_apply_lora_from_file instead");
 
-    LLAMA_API int llama_model_apply_lora_from_file(
+    LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
             const char * path_lora,
             float scale,
             const char * path_base_model,
-            int n_threads);
+            int32_t n_threads);
 
     //
     // KV cache
@@ -439,10 +438,10 @@
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
     // Clear the KV cache
     LLAMA_API void llama_kv_cache_clear(
@@ -533,7 +532,7 @@
             struct llama_context * ctx,
             llama_token * tokens,
             int32_t n_tokens,
-            int n_past),
+            int32_t n_past),
            "use llama_decode() instead");
 
     // Same as llama_eval, but use float matrix input directly.
@@ -542,7 +541,7 @@
             struct llama_context * ctx,
             float * embd,
             int32_t n_tokens,
-            int n_past),
+            int32_t n_past),
            "use llama_decode() instead");
 
     // Return batch for single sequence of tokens starting at pos_0
@@ -574,7 +573,7 @@
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
     // < 0 - error
-    LLAMA_API int llama_decode(
+    LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
             struct llama_batch batch);
 
@@ -614,10 +613,10 @@
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
     // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
 
     // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
 
     // codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@@ -635,12 +634,12 @@
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     ///                Does not insert a leading space.
-    LLAMA_API int llama_tokenize(
+    LLAMA_API int32_t llama_tokenize(
             const struct llama_model * model,
             const char * text,
-            int text_len,
+            int32_t text_len,
             llama_token * tokens,
-            int n_max_tokens,
+            int32_t n_max_tokens,
             bool add_bos,
             bool special);
 
@@ -648,11 +647,11 @@
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    LLAMA_API int llama_token_to_piece(
+    LLAMA_API int32_t llama_token_to_piece(
             const struct llama_model * model,
             llama_token token,
             char * buf,
-            int length);
+            int32_t length);
 
     //
     // Grammar
@@ -704,7 +703,7 @@
     LLAMA_API void llama_sample_top_k(
             struct llama_context * ctx,
             llama_token_data_array * candidates,
-            int k,
+            int32_t k,
             size_t min_keep);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@@ -763,7 +762,7 @@
             llama_token_data_array * candidates,
             float tau,
             float eta,
-            int m,
+            int32_t m,
             float * mu);
 
     /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@@ -836,8 +835,8 @@
             llama_beam_search_callback_fn_t callback,
             void * callback_data,
             size_t n_beams,
-            int n_past,
-            int n_predict);
+            int32_t n_past,
+            int32_t n_predict);
 
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
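Taken together, these hunks track the vendored llama.cpp's switch from plain int to fixed-width int32_t across the public C API. A minimal sketch, assuming a struct llama_model * obtained elsewhere (for example with llama_load_model_from_file); the buffer sizes and the helper name dump_tokens are illustrative only, not part of the library:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include "llama.h"

// Sketch only: tokenize a prompt with the int32_t-based signatures shown above,
// then print each token id and its text piece.
static void dump_tokens(const struct llama_model * model, const char * text) {
    llama_token tokens[512];
    int32_t n = llama_tokenize(model, text, (int32_t) strlen(text),
                               tokens, 512, /*add_bos*/ true, /*special*/ false);
    if (n < 0) {
        // Negative return: the buffer was too small; -n tokens would have been needed.
        fprintf(stderr, "need %d tokens\n", -n);
        return;
    }
    for (int32_t i = 0; i < n; i++) {
        char piece[64];
        int32_t len = llama_token_to_piece(model, tokens[i], piece, (int32_t) sizeof(piece));
        if (len > 0) {
            printf("%d: %.*s\n", tokens[i], len, piece);
        }
    }
}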
data/vendor/tmp/llama.cpp/scripts/get-flags.mk
ADDED
@@ -0,0 +1,38 @@
+ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
+	GF_CC_IS_GCC = 1
+	GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+else
+	GF_CC_IS_CLANG = 1
+	ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
+		GF_CC_IS_LLVM_CLANG = 1
+	else
+		GF_CC_IS_APPLE_CLANG = 1
+	endif
+	GF_CC_VER := \
+		$(shell $(GF_CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
+			| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+endif
+
+ifeq ($(GF_CC_IS_CLANG), 1)
+	# clang options
+	GF_CFLAGS   = -Wunreachable-code-break -Wunreachable-code-return
+	GF_CXXFLAGS = -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
+
+	ifneq '' '$(and $(GF_CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 030800)))'
+		GF_CFLAGS += -Wdouble-promotion
+	endif
+	ifneq '' '$(and $(GF_CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 070300)))'
+		GF_CFLAGS += -Wdouble-promotion
+	endif
+else
+	# gcc options
+	GF_CFLAGS   = -Wdouble-promotion
+	GF_CXXFLAGS = -Wno-array-bounds
+
+	ifeq ($(shell expr $(GF_CC_VER) \>= 070100), 1)
+		GF_CXXFLAGS += -Wno-format-truncation
+	endif
+	ifeq ($(shell expr $(GF_CC_VER) \>= 080100), 1)
+		GF_CXXFLAGS += -Wextra-semi
+	endif
+endif
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.10.3
+  version: 0.11.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2024-01-07 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -29,33 +29,36 @@ files:
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
-- ext/llama_cpp/src/LICENSE
-- ext/llama_cpp/src/ggml-alloc.c
-- ext/llama_cpp/src/ggml-alloc.h
-- ext/llama_cpp/src/ggml-backend-impl.h
-- ext/llama_cpp/src/ggml-backend.c
-- ext/llama_cpp/src/ggml-backend.h
-- ext/llama_cpp/src/ggml-cuda.cu
-- ext/llama_cpp/src/ggml-cuda.h
-- ext/llama_cpp/src/ggml-impl.h
-- ext/llama_cpp/src/ggml-metal.h
-- ext/llama_cpp/src/ggml-metal.m
-- ext/llama_cpp/src/ggml-metal.metal
-- ext/llama_cpp/src/ggml-mpi.c
-- ext/llama_cpp/src/ggml-mpi.h
-- ext/llama_cpp/src/ggml-opencl.cpp
-- ext/llama_cpp/src/ggml-opencl.h
-- ext/llama_cpp/src/ggml-quants.c
-- ext/llama_cpp/src/ggml-quants.h
-- ext/llama_cpp/src/ggml.c
-- ext/llama_cpp/src/ggml.h
-- ext/llama_cpp/src/llama-util.h
-- ext/llama_cpp/src/llama.cpp
-- ext/llama_cpp/src/llama.h
-- ext/llama_cpp/src/unicode.h
 - lib/llama_cpp.rb
 - lib/llama_cpp/version.rb
 - sig/llama_cpp.rbs
+- vendor/include/.gitkeep
+- vendor/lib/.gitkeep
+- vendor/tmp/llama.cpp/LICENSE
+- vendor/tmp/llama.cpp/Makefile
+- vendor/tmp/llama.cpp/ggml-alloc.c
+- vendor/tmp/llama.cpp/ggml-alloc.h
+- vendor/tmp/llama.cpp/ggml-backend-impl.h
+- vendor/tmp/llama.cpp/ggml-backend.c
+- vendor/tmp/llama.cpp/ggml-backend.h
+- vendor/tmp/llama.cpp/ggml-cuda.cu
+- vendor/tmp/llama.cpp/ggml-cuda.h
+- vendor/tmp/llama.cpp/ggml-impl.h
+- vendor/tmp/llama.cpp/ggml-metal.h
+- vendor/tmp/llama.cpp/ggml-metal.m
+- vendor/tmp/llama.cpp/ggml-metal.metal
+- vendor/tmp/llama.cpp/ggml-mpi.c
+- vendor/tmp/llama.cpp/ggml-mpi.h
+- vendor/tmp/llama.cpp/ggml-opencl.cpp
+- vendor/tmp/llama.cpp/ggml-opencl.h
+- vendor/tmp/llama.cpp/ggml-quants.c
+- vendor/tmp/llama.cpp/ggml-quants.h
+- vendor/tmp/llama.cpp/ggml.c
+- vendor/tmp/llama.cpp/ggml.h
+- vendor/tmp/llama.cpp/llama.cpp
+- vendor/tmp/llama.cpp/llama.h
+- vendor/tmp/llama.cpp/scripts/get-flags.mk
+- vendor/tmp/llama.cpp/unicode.h
 homepage: https://github.com/yoshoku/llama_cpp.rb
 licenses:
 - MIT
@@ -80,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.5.3
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.
data/ext/llama_cpp/src/ggml-opencl.h
DELETED
@@ -1,25 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-void ggml_cl_init(void);
-
-void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
-
-void * ggml_cl_host_malloc(size_t size);
-void   ggml_cl_host_free(void * ptr);
-
-void ggml_cl_free_data(const struct ggml_tensor* tensor);
-
-void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
-
-#ifdef  __cplusplus
-}
-#endif