cui-llama.rn 1.0.7 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +1 -1
- package/cpp/common.cpp +67 -34
- package/cpp/common.h +23 -8
- package/cpp/ggml-aarch64.c +16 -14
- package/cpp/ggml-backend.c +15 -10
- package/cpp/ggml-impl.h +4 -6
- package/cpp/ggml-metal.h +2 -0
- package/cpp/ggml-metal.m +55 -22
- package/cpp/ggml-quants.c +2 -2
- package/cpp/ggml-quants.h +4 -0
- package/cpp/ggml.c +40 -15
- package/cpp/ggml.h +10 -6
- package/cpp/grammar-parser.cpp +3 -0
- package/cpp/llama-impl.h +15 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +14 -18
- package/cpp/llama-vocab.h +4 -2
- package/cpp/llama.cpp +466 -280
- package/cpp/llama.h +10 -11
- package/cpp/rn-llama.hpp +23 -10
- package/package.json +1 -1
package/cpp/llama.h
CHANGED
@@ -93,15 +93,14 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
         LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
         LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
+        LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
+        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
     };
 
-    // note: these values should be synchronized with lm_ggml_rope
-    // TODO: maybe move this enum to ggml.h (lm_ggml_rope_type)
     enum llama_rope_type {
         LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM =  0,
-        LLAMA_ROPE_TYPE_NEOX =  2,
-        LLAMA_ROPE_TYPE_GLM  =  4,
+        LLAMA_ROPE_TYPE_NORM = 0,
+        LLAMA_ROPE_TYPE_NEOX = LM_GGML_ROPE_TYPE_NEOX,
     };
 
     enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -345,7 +344,7 @@ extern "C" {
         int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum lm_ggml_type output_tensor_type;    // output tensor type
-        enum lm_ggml_type token_embedding_type;  // itoken embeddings tensor type
+        enum lm_ggml_type token_embedding_type;  // token embeddings tensor type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@@ -504,6 +503,9 @@ extern "C" {
     // Returns true if the model contains an encoder that requires llama_encode() call
     LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
 
+    // Returns true if the model contains a decoder that requires llama_decode() call
+    LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
+
     // For encoder-decoder models, this function returns id of the token that must be provided
     // to the decoder to start generating output sequence. For other models, it returns -1.
     LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
@@ -912,11 +914,8 @@ extern "C" {
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
     LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
     // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
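
Taken together, the llama.h changes add a decoder probe next to the existing encoder probe and turn llama_add_bos_token / llama_add_eos_token into plain bool accessors (previously int32_t with -1 for unknown). Below is a minimal caller sketch, not part of the package: "model.gguf" is a placeholder path, and the loading calls (llama_backend_init, llama_load_model_from_file) are the standard llama.h entry points rather than anything introduced by this diff.

```cpp
// Sketch only: exercises the API surface changed in this release.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) return 1;

    // New in this release: decoder probe alongside the existing encoder probe.
    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
        // Encoder-decoder model: the decoder needs its start token first.
        printf("decoder start token: %d\n", llama_model_decoder_start_token(model));
    }

    // llama_add_bos_token / llama_add_eos_token now return bool instead of an
    // int32_t tri-state, so callers can branch on them directly.
    if (llama_add_bos_token(model)) {
        printf("tokenizer adds BOS by default\n");
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```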
package/cpp/rn-llama.hpp
CHANGED
@@ -6,13 +6,10 @@
 #include "common.h"
 #include "llama.h"
 
-
 #include <android/log.h>
 #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
 #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
-
-
 namespace rnllama {
 
 static void llama_batch_clear(llama_batch *batch) {
@@ -227,7 +224,9 @@ struct llama_rn_context
     bool loadModel(gpt_params &params_)
     {
         params = params_;
-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        llama_init_result result = llama_init_from_gpt_params(params);
+        model = result.model;
+        ctx = result.context;
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model: %s", params_.model.c_str());
@@ -298,7 +297,9 @@ struct llama_rn_context
         }
 
         // do Context Shift , may be buggy! TODO: Verify functionality
-        purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+        if(!params.embedding){
+            purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+        }
 
         // push the prompt into the sampling context (do not apply grammar)
         for (auto & token : prompt_tokens)
@@ -306,7 +307,7 @@ struct llama_rn_context
             llama_sampling_accept(ctx_sampling, ctx, token, false);
         }
         // compare the evaluated prompt with the new prompt
-        n_past = common_part(embd, prompt_tokens);
+        n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
         LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
         LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
         LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
@@ -343,9 +344,9 @@ struct llama_rn_context
         completion_token_output result;
         result.tok = -1;
 
+        // this truncation should never trigger with good context shifting
         if (embd.size() >= (size_t)params.n_ctx)
         {
-            // Shift context
 
             const int n_left    = n_past - params.n_keep - 1;
             const int n_discard = n_left/2;
@@ -547,9 +548,21 @@ struct llama_rn_context
             LOG_WARNING("embedding disabled, embedding: %s", params.embedding);
             return std::vector<float>(n_embd, 0.0f);
         }
-        float *data = llama_get_embeddings(ctx);
-        std::vector<float> embedding(data, data + n_embd);
-        return embedding;
+        float *data;
+
+        if(params.pooling_type == 0){
+            data = llama_get_embeddings(ctx);
+        }
+        else {
+            data = llama_get_embeddings_seq(ctx, 0);
+        }
+
+        if(!data) {
+            return std::vector<float>(n_embd, 0.0f);
+        }
+        std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
+        llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+        return out;
     }
 
     std::string bench(int pp, int tg, int pl, int nr)
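
For embedding users, the getEmbedding() change above is the main behavioural difference: it reads the per-sequence buffer when a pooling type is configured, falls back to the token-level buffer otherwise, and normalizes the result. Below is a condensed sketch of that pattern, assuming an already-decoded llama_context; the standalone helper name extract_embedding is mine and is not part of rn-llama.hpp.

```cpp
#include "common.h"
#include "llama.h"
#include <vector>

// Condensed illustration of the pooling-aware embedding read-out used above.
static std::vector<float> extract_embedding(llama_context * ctx, const gpt_params & params) {
    const int n_embd = llama_n_embd(llama_get_model(ctx));

    // LLAMA_POOLING_TYPE_NONE (0) keeps raw token embeddings; any other
    // pooling type stores one pooled vector per sequence.
    float * data = params.pooling_type == LLAMA_POOLING_TYPE_NONE
        ? llama_get_embeddings(ctx)
        : llama_get_embeddings_seq(ctx, 0);

    if (data == nullptr) {
        // Decode produced no embeddings (e.g. embeddings disabled): return zeros.
        return std::vector<float>(n_embd, 0.0f);
    }

    std::vector<float> out(data, data + n_embd);
    // params.embd_normalize follows the common.h convention (-1 = none, 2 = euclidean).
    llama_embd_normalize(data, out.data(), n_embd, params.embd_normalize);
    return out;
}
```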