cui-llama.rn 1.4.3 → 1.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/README.md +93 -114
  2. package/android/src/main/CMakeLists.txt +5 -0
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +91 -17
  4. package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
  5. package/android/src/main/jni-utils.h +6 -0
  6. package/android/src/main/jni.cpp +289 -31
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  15. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
  16. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
  17. package/cpp/chat-template.hpp +529 -0
  18. package/cpp/chat.cpp +1779 -0
  19. package/cpp/chat.h +135 -0
  20. package/cpp/common.cpp +2064 -1873
  21. package/cpp/common.h +700 -699
  22. package/cpp/ggml-alloc.c +1039 -1042
  23. package/cpp/ggml-alloc.h +1 -1
  24. package/cpp/ggml-backend-impl.h +255 -255
  25. package/cpp/ggml-backend-reg.cpp +586 -582
  26. package/cpp/ggml-backend.cpp +2004 -2002
  27. package/cpp/ggml-backend.h +354 -354
  28. package/cpp/ggml-common.h +1851 -1853
  29. package/cpp/ggml-cpp.h +39 -39
  30. package/cpp/ggml-cpu-aarch64.cpp +4248 -4247
  31. package/cpp/ggml-cpu-aarch64.h +8 -8
  32. package/cpp/ggml-cpu-impl.h +531 -386
  33. package/cpp/ggml-cpu-quants.c +12527 -10920
  34. package/cpp/ggml-cpu-traits.cpp +36 -36
  35. package/cpp/ggml-cpu-traits.h +38 -38
  36. package/cpp/ggml-cpu.c +15766 -14391
  37. package/cpp/ggml-cpu.cpp +655 -635
  38. package/cpp/ggml-cpu.h +138 -135
  39. package/cpp/ggml-impl.h +567 -567
  40. package/cpp/ggml-metal-impl.h +235 -0
  41. package/cpp/ggml-metal.h +1 -1
  42. package/cpp/ggml-metal.m +5146 -4884
  43. package/cpp/ggml-opt.cpp +854 -854
  44. package/cpp/ggml-opt.h +216 -216
  45. package/cpp/ggml-quants.c +5238 -5238
  46. package/cpp/ggml-threading.h +14 -14
  47. package/cpp/ggml.c +6529 -6514
  48. package/cpp/ggml.h +2198 -2194
  49. package/cpp/gguf.cpp +1329 -1329
  50. package/cpp/gguf.h +202 -202
  51. package/cpp/json-schema-to-grammar.cpp +1024 -1045
  52. package/cpp/json-schema-to-grammar.h +21 -8
  53. package/cpp/json.hpp +24766 -24766
  54. package/cpp/llama-adapter.cpp +347 -347
  55. package/cpp/llama-adapter.h +74 -74
  56. package/cpp/llama-arch.cpp +1513 -1487
  57. package/cpp/llama-arch.h +403 -400
  58. package/cpp/llama-batch.cpp +368 -368
  59. package/cpp/llama-batch.h +88 -88
  60. package/cpp/llama-chat.cpp +588 -578
  61. package/cpp/llama-chat.h +53 -52
  62. package/cpp/llama-context.cpp +1775 -1775
  63. package/cpp/llama-context.h +128 -128
  64. package/cpp/llama-cparams.cpp +1 -1
  65. package/cpp/llama-cparams.h +37 -37
  66. package/cpp/llama-cpp.h +30 -30
  67. package/cpp/llama-grammar.cpp +1219 -1139
  68. package/cpp/llama-grammar.h +173 -143
  69. package/cpp/llama-hparams.cpp +71 -71
  70. package/cpp/llama-hparams.h +139 -139
  71. package/cpp/llama-impl.cpp +167 -167
  72. package/cpp/llama-impl.h +61 -61
  73. package/cpp/llama-kv-cache.cpp +718 -718
  74. package/cpp/llama-kv-cache.h +219 -218
  75. package/cpp/llama-mmap.cpp +600 -590
  76. package/cpp/llama-mmap.h +68 -67
  77. package/cpp/llama-model-loader.cpp +1124 -1124
  78. package/cpp/llama-model-loader.h +167 -167
  79. package/cpp/llama-model.cpp +4087 -3997
  80. package/cpp/llama-model.h +370 -370
  81. package/cpp/llama-sampling.cpp +2558 -2408
  82. package/cpp/llama-sampling.h +32 -32
  83. package/cpp/llama-vocab.cpp +3264 -3247
  84. package/cpp/llama-vocab.h +125 -125
  85. package/cpp/llama.cpp +10284 -10077
  86. package/cpp/llama.h +1354 -1323
  87. package/cpp/log.cpp +393 -401
  88. package/cpp/log.h +132 -121
  89. package/cpp/minja/chat-template.hpp +529 -0
  90. package/cpp/minja/minja.hpp +2915 -0
  91. package/cpp/minja.hpp +2915 -0
  92. package/cpp/rn-llama.cpp +66 -6
  93. package/cpp/rn-llama.h +26 -1
  94. package/cpp/sampling.cpp +570 -505
  95. package/cpp/sampling.h +3 -0
  96. package/cpp/sgemm.cpp +2598 -2597
  97. package/cpp/sgemm.h +14 -14
  98. package/cpp/speculative.cpp +278 -277
  99. package/cpp/speculative.h +28 -28
  100. package/cpp/unicode.cpp +9 -2
  101. package/ios/CMakeLists.txt +6 -0
  102. package/ios/RNLlama.h +0 -8
  103. package/ios/RNLlama.mm +27 -3
  104. package/ios/RNLlamaContext.h +10 -1
  105. package/ios/RNLlamaContext.mm +269 -57
  106. package/jest/mock.js +21 -2
  107. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  108. package/lib/commonjs/grammar.js +3 -0
  109. package/lib/commonjs/grammar.js.map +1 -1
  110. package/lib/commonjs/index.js +87 -13
  111. package/lib/commonjs/index.js.map +1 -1
  112. package/lib/module/NativeRNLlama.js.map +1 -1
  113. package/lib/module/grammar.js +3 -0
  114. package/lib/module/grammar.js.map +1 -1
  115. package/lib/module/index.js +86 -13
  116. package/lib/module/index.js.map +1 -1
  117. package/lib/typescript/NativeRNLlama.d.ts +107 -2
  118. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  119. package/lib/typescript/grammar.d.ts.map +1 -1
  120. package/lib/typescript/index.d.ts +32 -7
  121. package/lib/typescript/index.d.ts.map +1 -1
  122. package/llama-rn.podspec +1 -1
  123. package/package.json +3 -2
  124. package/src/NativeRNLlama.ts +115 -3
  125. package/src/grammar.ts +3 -0
  126. package/src/index.ts +138 -21
  127. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  128. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  129. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  130. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  131. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  132. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  133. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -55
  134. package/cpp/rn-llama.hpp +0 -913
@@ -1,128 +1,128 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
- #include "llama-batch.h"
5
- #include "llama-cparams.h"
6
- #include "llama-model.h"
7
- #include "llama-kv-cache.h"
8
- #include "llama-adapter.h"
9
-
10
- #include "ggml-cpp.h"
11
-
12
- #include <map>
13
- #include <unordered_map>
14
- #include <vector>
15
- #include <set>
16
-
17
- struct llama_context {
18
- llama_context(const llama_model & model)
19
- : model(model)
20
- , t_start_us(model.t_start_us)
21
- , t_load_us(model.t_load_us) {}
22
-
23
- const struct llama_model & model;
24
-
25
- struct llama_cparams cparams;
26
- struct llama_sbatch sbatch; // TODO: revisit if needed
27
- struct llama_kv_cache kv_self;
28
- struct llama_adapter_cvec cvec;
29
-
30
- std::unordered_map<struct llama_adapter_lora *, float> lora;
31
-
32
- std::vector<lm_ggml_backend_ptr> backends;
33
- std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
34
-
35
- lm_ggml_backend_t backend_cpu = nullptr;
36
-
37
- lm_ggml_threadpool_t threadpool = nullptr;
38
- lm_ggml_threadpool_t threadpool_batch = nullptr;
39
-
40
- bool has_evaluated_once = false;
41
-
42
- mutable int64_t t_start_us;
43
- mutable int64_t t_load_us;
44
- mutable int64_t t_p_eval_us = 0;
45
- mutable int64_t t_eval_us = 0;
46
-
47
- mutable int64_t t_compute_start_us = 0;
48
- mutable int64_t n_queued_tokens = 0;
49
-
50
- mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
51
- mutable int32_t n_eval = 0; // number of eval calls
52
-
53
- // host buffer for the model output (logits and embeddings)
54
- lm_ggml_backend_buffer_ptr buf_output;
55
-
56
- // decode output (2-dimensional array: [n_outputs][n_vocab])
57
- size_t logits_size = 0; // capacity (of floats) for logits
58
- float * logits = nullptr;
59
-
60
- std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
61
- size_t output_size = 0; // capacity (of tokens positions) for the output buffers
62
- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
63
-
64
- bool logits_all = false;
65
-
66
- // embeddings output (2-dimensional array: [n_outputs][n_embd])
67
- // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
68
- size_t embd_size = 0; // capacity (of floats) for embeddings
69
- float * embd = nullptr;
70
-
71
- // sequence embeddings output (map of [n_embd] vectors)
72
- // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
73
- std::map<llama_seq_id, std::vector<float>> embd_seq;
74
-
75
- // whether we are computing encoder output or decoder output
76
- bool is_encoding = false;
77
-
78
- // TODO: find a better way to accommodate mutli-dimension position encoding methods
79
- // number of position id each token get, 1 for each token in most cases.
80
- // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
81
- int n_pos_per_token = 1;
82
-
83
- // output of the encoder part of the encoder-decoder models
84
- std::vector<float> embd_enc;
85
- std::vector<std::set<llama_seq_id>> seq_ids_enc;
86
-
87
- // memory buffers used to evaluate the model
88
- std::vector<uint8_t> buf_compute_meta;
89
- lm_ggml_backend_sched_ptr sched;
90
-
91
- lm_ggml_abort_callback abort_callback = nullptr;
92
- void * abort_callback_data = nullptr;
93
-
94
- // input tensors
95
- struct lm_ggml_tensor * inp_tokens; // I32 [n_batch]
96
- struct lm_ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
97
- struct lm_ggml_tensor * inp_pos; // I32 [n_batch]
98
- struct lm_ggml_tensor * inp_out_ids; // I32 [n_outputs]
99
- struct lm_ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
100
- struct lm_ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
101
- struct lm_ggml_tensor * inp_K_shift; // I32 [kv_size]
102
- struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
103
- struct lm_ggml_tensor * inp_cls; // I32 [n_batch]
104
- struct lm_ggml_tensor * inp_s_copy; // I32 [kv_size]
105
- struct lm_ggml_tensor * inp_s_mask; // F32 [1, n_kv]
106
- struct lm_ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
107
- struct lm_ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
108
- struct lm_ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
109
- struct lm_ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
110
- };
111
-
112
- // TODO: make these methods of llama_context
113
- void llama_set_k_shift(struct llama_context & lctx);
114
-
115
- void llama_set_s_copy(struct llama_context & lctx);
116
-
117
- void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
118
-
119
- // Make sure enough space is available for outputs.
120
- // Returns max number of outputs for which space was reserved.
121
- size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
122
-
123
- // make the outputs have the same order they had in the user-provided batch
124
- void llama_output_reorder(struct llama_context & ctx);
125
-
126
- // For internal test use
127
- // TODO: remove
128
- const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+ #include "llama-batch.h"
5
+ #include "llama-cparams.h"
6
+ #include "llama-model.h"
7
+ #include "llama-kv-cache.h"
8
+ #include "llama-adapter.h"
9
+
10
+ #include "ggml-cpp.h"
11
+
12
+ #include <map>
13
+ #include <unordered_map>
14
+ #include <vector>
15
+ #include <set>
16
+
17
+ struct llama_context {
18
+ llama_context(const llama_model & model)
19
+ : model(model)
20
+ , t_start_us(model.t_start_us)
21
+ , t_load_us(model.t_load_us) {}
22
+
23
+ const struct llama_model & model;
24
+
25
+ struct llama_cparams cparams;
26
+ struct llama_sbatch sbatch; // TODO: revisit if needed
27
+ struct llama_kv_cache kv_self;
28
+ struct llama_adapter_cvec cvec;
29
+
30
+ std::unordered_map<struct llama_adapter_lora *, float> lora;
31
+
32
+ std::vector<lm_ggml_backend_ptr> backends;
33
+ std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
34
+
35
+ lm_ggml_backend_t backend_cpu = nullptr;
36
+
37
+ lm_ggml_threadpool_t threadpool = nullptr;
38
+ lm_ggml_threadpool_t threadpool_batch = nullptr;
39
+
40
+ bool has_evaluated_once = false;
41
+
42
+ mutable int64_t t_start_us;
43
+ mutable int64_t t_load_us;
44
+ mutable int64_t t_p_eval_us = 0;
45
+ mutable int64_t t_eval_us = 0;
46
+
47
+ mutable int64_t t_compute_start_us = 0;
48
+ mutable int64_t n_queued_tokens = 0;
49
+
50
+ mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
51
+ mutable int32_t n_eval = 0; // number of eval calls
52
+
53
+ // host buffer for the model output (logits and embeddings)
54
+ lm_ggml_backend_buffer_ptr buf_output;
55
+
56
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
57
+ size_t logits_size = 0; // capacity (of floats) for logits
58
+ float * logits = nullptr;
59
+
60
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
61
+ size_t output_size = 0; // capacity (of tokens positions) for the output buffers
62
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
63
+
64
+ bool logits_all = false;
65
+
66
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
67
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
68
+ size_t embd_size = 0; // capacity (of floats) for embeddings
69
+ float * embd = nullptr;
70
+
71
+ // sequence embeddings output (map of [n_embd] vectors)
72
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
73
+ std::map<llama_seq_id, std::vector<float>> embd_seq;
74
+
75
+ // whether we are computing encoder output or decoder output
76
+ bool is_encoding = false;
77
+
78
+ // TODO: find a better way to accommodate mutli-dimension position encoding methods
79
+ // number of position id each token get, 1 for each token in most cases.
80
+ // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
81
+ int n_pos_per_token = 1;
82
+
83
+ // output of the encoder part of the encoder-decoder models
84
+ std::vector<float> embd_enc;
85
+ std::vector<std::set<llama_seq_id>> seq_ids_enc;
86
+
87
+ // memory buffers used to evaluate the model
88
+ std::vector<uint8_t> buf_compute_meta;
89
+ lm_ggml_backend_sched_ptr sched;
90
+
91
+ lm_ggml_abort_callback abort_callback = nullptr;
92
+ void * abort_callback_data = nullptr;
93
+
94
+ // input tensors
95
+ struct lm_ggml_tensor * inp_tokens; // I32 [n_batch]
96
+ struct lm_ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
97
+ struct lm_ggml_tensor * inp_pos; // I32 [n_batch]
98
+ struct lm_ggml_tensor * inp_out_ids; // I32 [n_outputs]
99
+ struct lm_ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
100
+ struct lm_ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
101
+ struct lm_ggml_tensor * inp_K_shift; // I32 [kv_size]
102
+ struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
103
+ struct lm_ggml_tensor * inp_cls; // I32 [n_batch]
104
+ struct lm_ggml_tensor * inp_s_copy; // I32 [kv_size]
105
+ struct lm_ggml_tensor * inp_s_mask; // F32 [1, n_kv]
106
+ struct lm_ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
107
+ struct lm_ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
108
+ struct lm_ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
109
+ struct lm_ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
110
+ };
111
+
112
+ // TODO: make these methods of llama_context
113
+ void llama_set_k_shift(struct llama_context & lctx);
114
+
115
+ void llama_set_s_copy(struct llama_context & lctx);
116
+
117
+ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
118
+
119
+ // Make sure enough space is available for outputs.
120
+ // Returns max number of outputs for which space was reserved.
121
+ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
122
+
123
+ // make the outputs have the same order they had in the user-provided batch
124
+ void llama_output_reorder(struct llama_context & ctx);
125
+
126
+ // For internal test use
127
+ // TODO: remove
128
+ const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
@@ -1 +1 @@
1
- #include "llama-cparams.h"
1
+ #include "llama-cparams.h"
@@ -1,37 +1,37 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
-
5
- #include <cstdint>
6
-
7
- struct llama_cparams {
8
- uint32_t n_ctx; // context size used during inference
9
- uint32_t n_batch;
10
- uint32_t n_ubatch;
11
- uint32_t n_seq_max;
12
- int n_threads; // number of threads to use for generation
13
- int n_threads_batch; // number of threads to use for batch processing
14
-
15
- float rope_freq_base;
16
- float rope_freq_scale;
17
-
18
- uint32_t n_ctx_orig_yarn;
19
- // These hyperparameters are not exposed in GGUF, because all
20
- // existing YaRN models use the same values for them.
21
- float yarn_ext_factor;
22
- float yarn_attn_factor;
23
- float yarn_beta_fast;
24
- float yarn_beta_slow;
25
- float defrag_thold;
26
-
27
- bool embeddings;
28
- bool causal_attn;
29
- bool offload_kqv;
30
- bool flash_attn;
31
- bool no_perf;
32
-
33
- enum llama_pooling_type pooling_type;
34
-
35
- lm_ggml_backend_sched_eval_callback cb_eval;
36
- void * cb_eval_user_data;
37
- };
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+
5
+ #include <cstdint>
6
+
7
+ struct llama_cparams {
8
+ uint32_t n_ctx; // context size used during inference
9
+ uint32_t n_batch;
10
+ uint32_t n_ubatch;
11
+ uint32_t n_seq_max;
12
+ int n_threads; // number of threads to use for generation
13
+ int n_threads_batch; // number of threads to use for batch processing
14
+
15
+ float rope_freq_base;
16
+ float rope_freq_scale;
17
+
18
+ uint32_t n_ctx_orig_yarn;
19
+ // These hyperparameters are not exposed in GGUF, because all
20
+ // existing YaRN models use the same values for them.
21
+ float yarn_ext_factor;
22
+ float yarn_attn_factor;
23
+ float yarn_beta_fast;
24
+ float yarn_beta_slow;
25
+ float defrag_thold;
26
+
27
+ bool embeddings;
28
+ bool causal_attn;
29
+ bool offload_kqv;
30
+ bool flash_attn;
31
+ bool no_perf;
32
+
33
+ enum llama_pooling_type pooling_type;
34
+
35
+ lm_ggml_backend_sched_eval_callback cb_eval;
36
+ void * cb_eval_user_data;
37
+ };
package/cpp/llama-cpp.h CHANGED
@@ -1,30 +1,30 @@
1
- #pragma once
2
-
3
- #ifndef __cplusplus
4
- #error "This header is for C++ only"
5
- #endif
6
-
7
- #include <memory>
8
-
9
- #include "llama.h"
10
-
11
- struct llama_model_deleter {
12
- void operator()(llama_model * model) { llama_model_free(model); }
13
- };
14
-
15
- struct llama_context_deleter {
16
- void operator()(llama_context * context) { llama_free(context); }
17
- };
18
-
19
- struct llama_sampler_deleter {
20
- void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
21
- };
22
-
23
- struct llama_adapter_lora_deleter {
24
- void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
25
- };
26
-
27
- typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
28
- typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
29
- typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
30
- typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
1
+ #pragma once
2
+
3
+ #ifndef __cplusplus
4
+ #error "This header is for C++ only"
5
+ #endif
6
+
7
+ #include <memory>
8
+
9
+ #include "llama.h"
10
+
11
+ struct llama_model_deleter {
12
+ void operator()(llama_model * model) { llama_model_free(model); }
13
+ };
14
+
15
+ struct llama_context_deleter {
16
+ void operator()(llama_context * context) { llama_free(context); }
17
+ };
18
+
19
+ struct llama_sampler_deleter {
20
+ void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
21
+ };
22
+
23
+ struct llama_adapter_lora_deleter {
24
+ void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
25
+ };
26
+
27
+ typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
28
+ typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
29
+ typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
30
+ typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;