llama-cpp-capacitor 0.0.13 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LlamaCpp.podspec +17 -17
- package/Package.swift +27 -27
- package/README.md +717 -574
- package/android/build.gradle +88 -69
- package/android/src/main/AndroidManifest.xml +2 -2
- package/android/src/main/CMakeLists-arm64.txt +131 -0
- package/android/src/main/CMakeLists-x86_64.txt +135 -0
- package/android/src/main/CMakeLists.txt +35 -52
- package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCpp.java +956 -717
- package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCppPlugin.java +710 -590
- package/android/src/main/jni-utils.h +7 -7
- package/android/src/main/jni.cpp +868 -127
- package/cpp/{rn-completion.cpp → cap-completion.cpp} +202 -24
- package/cpp/{rn-completion.h → cap-completion.h} +22 -11
- package/cpp/{rn-llama.cpp → cap-llama.cpp} +81 -27
- package/cpp/{rn-llama.h → cap-llama.h} +32 -20
- package/cpp/{rn-mtmd.hpp → cap-mtmd.hpp} +15 -15
- package/cpp/{rn-tts.cpp → cap-tts.cpp} +12 -12
- package/cpp/{rn-tts.h → cap-tts.h} +14 -14
- package/cpp/ggml-cpu/ggml-cpu-impl.h +30 -0
- package/dist/docs.json +100 -3
- package/dist/esm/definitions.d.ts +45 -2
- package/dist/esm/definitions.js.map +1 -1
- package/dist/esm/index.d.ts +22 -0
- package/dist/esm/index.js +66 -3
- package/dist/esm/index.js.map +1 -1
- package/dist/plugin.cjs.js +71 -3
- package/dist/plugin.cjs.js.map +1 -1
- package/dist/plugin.js +71 -3
- package/dist/plugin.js.map +1 -1
- package/ios/Sources/LlamaCppPlugin/LlamaCpp.swift +596 -596
- package/ios/Sources/LlamaCppPlugin/LlamaCppPlugin.swift +591 -514
- package/ios/Tests/LlamaCppPluginTests/LlamaCppPluginTests.swift +15 -15
- package/package.json +111 -110
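
Most of the native-source churn in this range comes from renaming the shared C++ layer from the upstream `rn-*` names to `cap-*`: the headers, the enclosing namespace, and the `llama_rn_*` context types become `cap-*` / `capllama` / `llama_cap_*`, as the hunks below show. A minimal sketch of what the rename means for a translation unit built against this layer (the old `rnllama` namespace name is an assumption based on the upstream llama.rn project; only the new `capllama` names appear verbatim in the diff, and the snippet compiles only with the package's `cpp/` sources on the include path):

```cpp
// 0.0.13 (assumed upstream-style naming)        0.0.21 (as renamed in this release)
//   #include "rn-llama.h"                 ->      #include "cap-llama.h"
//   namespace rnllama { ... }             ->      namespace capllama { ... }
//   struct llama_rn_context;              ->      struct llama_cap_context;

#include "cap-llama.h"   // renamed header, see package/cpp/{rn-llama.h → cap-llama.h}

// The context type now lives in the capllama namespace under its new name.
void destroyContext(capllama::llama_cap_context* ctx) {
    delete ctx;   // llama_cap_context declares a destructor (see the cap-llama.cpp hunks below)
}
```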

package/cpp/{rn-completion.cpp → cap-completion.cpp}

```diff
@@ -1,14 +1,15 @@
-#include "rn-completion.h"
-#include "rn-llama.h"
-#include "rn-tts.h"
-#include "rn-mtmd.hpp"
+#include "cap-completion.h"
+#include "cap-llama.h"
+#include "cap-tts.h"
+#include "cap-mtmd.hpp"
+#include <algorithm> // For std::sort in speculative decoding
 
 // Include multimodal support
 #include "tools/mtmd/mtmd.h"
 #include "tools/mtmd/mtmd-helper.h"
 #include "tools/mtmd/clip.h"
 
-namespace rnllama {
+namespace capllama {
 
 static bool ends_with(const std::string &str, const std::string &suffix)
 {
@@ -67,19 +68,19 @@ static std::vector<llama_token> format_rerank(const llama_vocab * vocab, const s
 }
 
 // Constructor
-llama_rn_context_completion::llama_rn_context_completion(llama_rn_context* parent)
+llama_cap_context_completion::llama_cap_context_completion(llama_cap_context* parent)
     : parent_ctx(parent) {
 }
 
 // Destructor
-llama_rn_context_completion::~llama_rn_context_completion() {
+llama_cap_context_completion::~llama_cap_context_completion() {
     if (ctx_sampling != nullptr) {
         common_sampler_free(ctx_sampling);
         ctx_sampling = nullptr;
     }
 }
 
-void llama_rn_context_completion::rewind() {
+void llama_cap_context_completion::rewind() {
     is_interrupted = false;
     parent_ctx->params.antiprompt.clear();
     parent_ctx->params.sampling.grammar.clear();
@@ -105,7 +106,7 @@ void llama_rn_context_completion::rewind() {
     }
 }
 
-bool llama_rn_context_completion::initSampling() {
+bool llama_cap_context_completion::initSampling() {
     if (ctx_sampling != nullptr) {
         common_sampler_free(ctx_sampling);
     }
@@ -113,7 +114,7 @@ bool llama_rn_context_completion::initSampling() {
     return ctx_sampling != nullptr;
 }
 
-void llama_rn_context_completion::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
+void llama_cap_context_completion::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
     const int n_left = parent_ctx->n_ctx - parent_ctx->params.n_keep;
     const int n_block_size = n_left / 2;
     const int erased_blocks = (prompt_tokens.size() - parent_ctx->params.n_keep - n_block_size) / n_block_size;
@@ -135,7 +136,7 @@ void llama_rn_context_completion::truncatePrompt(std::vector<llama_token> &promp
     prompt_tokens = new_tokens;
 }
 
-void llama_rn_context_completion::loadPrompt(const std::vector<std::string> &media_paths) {
+void llama_cap_context_completion::loadPrompt(const std::vector<std::string> &media_paths) {
     bool has_media = !media_paths.empty();
 
     if (!has_media) {
@@ -203,11 +204,11 @@ void llama_rn_context_completion::loadPrompt(const std::vector<std::string> &med
         n_past, embd.size(), num_prompt_tokens, has_media ? 1 : 0);
 }
 
-void llama_rn_context_completion::beginCompletion() {
+void llama_cap_context_completion::beginCompletion() {
     beginCompletion(COMMON_CHAT_FORMAT_CONTENT_ONLY, COMMON_REASONING_FORMAT_NONE, false);
 }
 
-void llama_rn_context_completion::beginCompletion(int chat_format, common_reasoning_format reasoning_format, bool thinking_forced_open) {
+void llama_cap_context_completion::beginCompletion(int chat_format, common_reasoning_format reasoning_format, bool thinking_forced_open) {
     // number of tokens to keep when resetting context
     n_remain = parent_ctx->params.n_predict;
     llama_perf_context_reset(parent_ctx->ctx);
@@ -218,11 +219,11 @@ void llama_rn_context_completion::beginCompletion(int chat_format, common_reason
     current_thinking_forced_open = thinking_forced_open;
 }
 
-void llama_rn_context_completion::endCompletion() {
+void llama_cap_context_completion::endCompletion() {
     is_predicting = false;
 }
 
-completion_token_output llama_rn_context_completion::nextToken()
+completion_token_output llama_cap_context_completion::nextToken()
 {
     completion_token_output result;
     result.tok = -1;
@@ -344,7 +345,7 @@ completion_token_output llama_rn_context_completion::nextToken()
     return result;
 }
 
-size_t llama_rn_context_completion::findStoppingStrings(const std::string &text, const size_t last_token_size,
+size_t llama_cap_context_completion::findStoppingStrings(const std::string &text, const size_t last_token_size,
                                                         const stop_type type)
 {
     size_t stop_pos = std::string::npos;
@@ -376,7 +377,7 @@ size_t llama_rn_context_completion::findStoppingStrings(const std::string &text,
     return stop_pos;
 }
 
-completion_token_output llama_rn_context_completion::doCompletion()
+completion_token_output llama_cap_context_completion::doCompletion()
 {
     completion_token_output token_with_probs = nextToken();
 
@@ -444,7 +445,7 @@ completion_token_output llama_rn_context_completion::doCompletion()
     return token_with_probs;
 }
 
-completion_partial_output llama_rn_context_completion::getPartialOutput(const std::string &token_text) {
+completion_partial_output llama_cap_context_completion::getPartialOutput(const std::string &token_text) {
     common_chat_syntax syntax;
     syntax.format = static_cast<common_chat_format>(current_chat_format);
     syntax.reasoning_format = current_reasoning_format;
@@ -463,7 +464,7 @@ completion_partial_output llama_rn_context_completion::getPartialOutput(const st
     return result;
 }
 
-std::vector<float> llama_rn_context_completion::getEmbedding(common_params &embd_params)
+std::vector<float> llama_cap_context_completion::getEmbedding(common_params &embd_params)
 {
     static const int n_embd = llama_model_n_embd(llama_get_model(parent_ctx->ctx));
     if (!embd_params.embedding)
@@ -489,7 +490,7 @@ std::vector<float> llama_rn_context_completion::getEmbedding(common_params &embd
     return out;
 }
 
-std::vector<float> llama_rn_context_completion::rerank(const std::string &query, const std::vector<std::string> &documents)
+std::vector<float> llama_cap_context_completion::rerank(const std::string &query, const std::vector<std::string> &documents)
 {
     std::vector<float> scores;
 
@@ -548,7 +549,7 @@ std::vector<float> llama_rn_context_completion::rerank(const std::string &query,
     return scores;
 }
 
-std::string llama_rn_context_completion::bench(int pp, int tg, int pl, int nr)
+std::string llama_cap_context_completion::bench(int pp, int tg, int pl, int nr)
 {
     if (is_predicting) {
         LOG_ERROR("cannot benchmark while predicting", "");
@@ -563,7 +564,7 @@ std::string llama_rn_context_completion::bench(int pp, int tg, int pl, int nr)
     double pp_std = 0;
     double tg_std = 0;
 
-    // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30)
+    // TODO: move batch into llama_cap_context (related https://github.com/mybigday/llama.rn/issues/30)
     llama_batch batch = llama_batch_init(
         std::min(pp, parent_ctx->params.n_ubatch), // max n_tokens is limited by n_ubatch
         0, // No embeddings
@@ -656,7 +657,7 @@ std::string llama_rn_context_completion::bench(int pp, int tg, int pl, int nr)
         std::string("]");
 }
 
-void llama_rn_context_completion::processMedia(
+void llama_cap_context_completion::processMedia(
     const std::string &prompt,
     const std::vector<std::string> &media_paths
 ) {
@@ -678,4 +679,181 @@ void llama_rn_context_completion::processMedia(
     );
 }
 
-} // namespace rnllama
+// Speculative decoding implementation
+completion_token_output llama_cap_context_completion::nextTokenSpeculative() {
+    // Enable speculative mode
+    use_speculative = parent_ctx->isSpectulativeEnabled();
+
+    if (!use_speculative) {
+        // Fallback to regular token generation
+        return nextToken();
+    }
+
+    completion_token_output result;
+
+    // If we don't have drafted tokens, draft some
+    if (draft_tokens.empty()) {
+        draft_tokens = draftTokens(parent_ctx->speculative_samples);
+        n_drafted = draft_tokens.size();
+    }
+
+    // Try to verify and accept draft tokens
+    if (!draft_tokens.empty()) {
+        int accepted = verifyAndAcceptTokens(draft_tokens);
+        n_accepted += accepted;
+
+        if (accepted > 0) {
+            // Use the first accepted token
+            result.tok = draft_tokens[0];
+            draft_tokens.erase(draft_tokens.begin());
+
+            // Update context
+            embd.push_back(result.tok);
+            --n_remain;
+            num_tokens_predicted++;
+
+            has_next_token = parent_ctx->params.n_predict == -1 || n_remain != 0;
+            return result;
+        }
+    }
+
+    // If no tokens were accepted, fall back to regular sampling
+    draft_tokens.clear();
+    return nextToken();
+}
+
+std::vector<llama_token> llama_cap_context_completion::draftTokens(int n_draft) {
+    std::vector<llama_token> drafted;
+
+    // Check if draft model is available
+    if (!parent_ctx->draft_ctx || !parent_ctx->draft_model || !parent_ctx->isSpectulativeEnabled()) {
+        return drafted; // Return empty vector - will fallback to regular decoding
+    }
+
+    // Copy current context to draft model
+    // Note: KV cache copying may not be available in all llama.cpp versions
+    // For now, we'll skip this optimization and let the draft model generate from scratch
+    // This is still effective for speculative decoding
+
+    // Generate draft tokens using the smaller model
+    for (int i = 0; i < n_draft; i++) {
+        // Create batch with current context
+        llama_batch batch = llama_batch_init(1, 0, 1);
+
+        if (!embd.empty()) {
+            llama_batch_add(&batch, embd.back(), n_past + i, {0}, true);
+        }
+
+        // Decode with draft model
+        if (llama_decode(parent_ctx->draft_ctx, batch) != 0) {
+            llama_batch_free(batch);
+            break;
+        }
+
+        // Sample from draft model (using faster, simpler sampling)
+        const float temp = 0.8f; // Fixed temperature for draft
+        auto logits = llama_get_logits_ith(parent_ctx->draft_ctx, -1);
+        const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(parent_ctx->draft_model));
+
+        // Simple sampling for draft model
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+
+        for (int token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = {
+            candidates.data(),
+            candidates.size(),
+            -1,
+            false,
+        };
+
+        // Simple temperature sampling for draft model
+        for (int token_id = 0; token_id < n_vocab; token_id++) {
+            candidates[token_id].logit /= temp; // Apply temperature
+        }
+
+        // Sort by logit (simple greedy sampling for draft)
+        std::sort(candidates.begin(), candidates.end(),
+            [](const llama_token_data& a, const llama_token_data& b) {
+                return a.logit > b.logit;
+            });
+
+        llama_token token = candidates[0].id; // Take top token
+        drafted.push_back(token);
+
+        // Clean up
+        llama_batch_free(batch);
+
+        // Stop if we hit EOS
+        const llama_vocab * vocab = llama_model_get_vocab(parent_ctx->draft_model);
+        if (llama_vocab_is_eog(vocab, token)) {
+            break;
+        }
+    }
+
+    return drafted;
+}
+
+int llama_cap_context_completion::verifyAndAcceptTokens(const std::vector<llama_token> &draft_tokens) {
+    if (draft_tokens.empty() || !parent_ctx->ctx) {
+        return 0;
+    }
+
+    int accepted = 0;
+
+    // Verify each draft token against the main model
+    for (size_t i = 0; i < draft_tokens.size(); i++) {
+        // Create batch for verification
+        llama_batch batch = llama_batch_init(1, 0, 1);
+
+        if (!embd.empty()) {
+            llama_batch_add(&batch, embd.back(), n_past + accepted, {0}, true);
+        }
+
+        // Decode with main model
+        if (llama_decode(parent_ctx->ctx, batch) != 0) {
+            llama_batch_free(batch);
+            break;
+        }
+
+        // Get logits from main model
+        auto logits = llama_get_logits_ith(parent_ctx->ctx, -1);
+        const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(parent_ctx->model));
+
+        // Sample from main model
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+
+        for (int token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = {
+            candidates.data(),
+            candidates.size(),
+            -1,
+            false,
+        };
+
+        // Apply sampling from main model using common_sampler
+        llama_token main_token = common_sampler_sample(ctx_sampling, parent_ctx->ctx, -1);
+
+        // Accept if tokens match
+        if (main_token == draft_tokens[i]) {
+            accepted++;
+            common_sampler_accept(ctx_sampling, main_token, true);
+        } else {
+            // Reject and stop verification
+            break;
+        }
+
+        llama_batch_free(batch);
+    }
+
+    return accepted;
+}
+
+} // namespace capllama
```
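
The new `nextTokenSpeculative()`, `draftTokens()`, and `verifyAndAcceptTokens()` methods above implement a draft-then-verify loop: a small draft model proposes up to `n_draft` tokens, the main model re-scores them, and only the prefix whose tokens match the main model's own choices is accepted before control falls back to `nextToken()`. The following self-contained toy, in plain standard C++, mirrors that control flow so it can be read and run in isolation; `draft_next` and `target_next` are stand-ins for the draft and main models and are not part of the plugin.

```cpp
#include <cstdio>
#include <functional>
#include <vector>

// Toy draft-then-verify loop mirroring nextTokenSpeculative():
// propose n_draft tokens with a cheap model, accept the longest prefix
// the target model agrees with, then let the target emit one token itself.
std::vector<int> speculative_generate(
        const std::function<int(const std::vector<int>&)>& draft_next,
        const std::function<int(const std::vector<int>&)>& target_next,
        std::vector<int> ctx, int n_new, int n_draft) {
    std::vector<int> out;
    while ((int)out.size() < n_new) {
        // 1. Draft phase: the cheap model proposes a short continuation.
        std::vector<int> draft, draft_ctx = ctx;
        for (int i = 0; i < n_draft; i++) {
            int t = draft_next(draft_ctx);
            draft.push_back(t);
            draft_ctx.push_back(t);
        }
        // 2. Verify phase: the target model re-scores each proposal in order and
        //    only the matching prefix is kept (like verifyAndAcceptTokens()).
        int accepted = 0;
        for (int t : draft) {
            if (target_next(ctx) != t) break;   // first mismatch rejects the rest
            ctx.push_back(t);
            out.push_back(t);
            accepted++;
            if ((int)out.size() == n_new) return out;
        }
        // 3. Fallback: if the draft was not fully accepted, the target model
        //    produces the next token itself (like the nextToken() fallback).
        if (accepted < n_draft) {
            int t = target_next(ctx);
            ctx.push_back(t);
            out.push_back(t);
        }
    }
    return out;
}

int main() {
    // Stand-in "models": the target counts up; the draft agrees except on every 4th token.
    auto target = [](const std::vector<int>& c) { return (int)c.size(); };
    auto draft  = [](const std::vector<int>& c) { int t = (int)c.size(); return t % 4 == 3 ? -1 : t; };
    for (int t : speculative_generate(draft, target, {}, 10, 3)) std::printf("%d ", t);
    std::printf("\n");   // prints 0..9; every mismatched draft token is corrected by the target
}
```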

package/cpp/{rn-completion.h → cap-completion.h}

```diff
@@ -1,5 +1,5 @@
-#ifndef RN_COMPLETION_H
-#define RN_COMPLETION_H
+#ifndef CAP_COMPLETION_H
+#define CAP_COMPLETION_H
 
 #include "common.h"
 #include "llama.h"
@@ -9,7 +9,7 @@
 
 using json = nlohmann::ordered_json;
 
-namespace rnllama {
+namespace capllama {
 
 // Utility functions
 static inline void llama_batch_clear(llama_batch *batch) {
@@ -17,9 +17,9 @@ static inline void llama_batch_clear(llama_batch *batch) {
 }
 
 // Forward declarations
-struct llama_rn_context;
+struct llama_cap_context;
 
-// Types defined in rn-llama.h (needed here for compilation)
+// Types defined in cap-llama.h (needed here for compilation)
 enum stop_type
 {
     STOP_FULL,
@@ -47,9 +47,9 @@ struct completion_partial_output
 };
 
 // Completion context class
-struct llama_rn_context_completion {
+struct llama_cap_context_completion {
     // Reference to parent context
-    llama_rn_context* parent_ctx;
+    llama_cap_context* parent_ctx;
 
     // Completion state fields
     bool is_predicting = false;
@@ -77,12 +77,18 @@ struct llama_rn_context_completion {
 
     // Sampling context
     common_sampler *ctx_sampling = nullptr;
+
+    // Speculative decoding state
+    std::vector<llama_token> draft_tokens;
+    int n_drafted = 0;
+    int n_accepted = 0;
+    bool use_speculative = false;
 
     // Constructor
-    llama_rn_context_completion(llama_rn_context* parent);
+    llama_cap_context_completion(llama_cap_context* parent);
 
     // Destructor
-    ~llama_rn_context_completion();
+    ~llama_cap_context_completion();
 
     // Completion processing methods
     void rewind();
@@ -93,9 +99,14 @@ struct llama_rn_context_completion {
     void beginCompletion(int chat_format, common_reasoning_format reasoning_format, bool thinking_forced_open);
     void endCompletion();
     completion_token_output nextToken();
+    completion_token_output nextTokenSpeculative(); // NEW: Speculative version
     size_t findStoppingStrings(const std::string &text, const size_t last_token_size, const stop_type type);
     completion_token_output doCompletion();
     completion_partial_output getPartialOutput(const std::string &token_text);
+
+    // Speculative decoding methods
+    std::vector<llama_token> draftTokens(int n_draft);
+    int verifyAndAcceptTokens(const std::vector<llama_token> &draft_tokens);
 
     // Embedding methods
     std::vector<float> getEmbedding(common_params &embd_params);
@@ -111,6 +122,6 @@ struct llama_rn_context_completion {
     );
 };
 
-} // namespace rnllama
+} // namespace capllama
 
-#endif /* RN_COMPLETION_H */
+#endif /* CAP_COMPLETION_H */
```

package/cpp/{rn-llama.cpp → cap-llama.cpp}

```diff
@@ -1,14 +1,14 @@
-#include "rn-llama.h"
-#include "rn-tts.h"
-#include "rn-mtmd.hpp"
-#include "rn-completion.h"
+#include "cap-llama.h"
+#include "cap-tts.h"
+#include "cap-mtmd.hpp"
+#include "cap-completion.h"
 
 // Include multimodal support
 #include "tools/mtmd/mtmd.h"
 #include "tools/mtmd/mtmd-helper.h"
 #include "tools/mtmd/clip.h"
 
-namespace rnllama {
+namespace capllama {
 
 static const std::vector<lm_ggml_type> kv_cache_types = {
     LM_GGML_TYPE_F32,
@@ -122,7 +122,7 @@ std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::co
 }
 
 
-llama_rn_context::~llama_rn_context() {
+llama_cap_context::~llama_cap_context() {
     if (completion != nullptr) {
         delete completion;
         completion = nullptr;
@@ -130,9 +130,10 @@ llama_rn_context::~llama_rn_context() {
 
     releaseMultimodal();
     releaseVocoder();
+    releaseDraftModel(); // Clean up speculative decoding resources
 }
 
-bool llama_rn_context::loadModel(common_params &params_)
+bool llama_cap_context::loadModel(common_params &params_)
 {
     params = params_;
     llama_init = common_init_from_params(params);
@@ -150,7 +151,7 @@ bool llama_rn_context::loadModel(common_params &params_)
     if (completion != nullptr) {
         delete completion;
     }
-    completion = new llama_rn_context_completion(this);
+    completion = new llama_cap_context_completion(this);
 
     // Initialize context shift flag
     LOG_INFO("ctx_shift: %s", params.ctx_shift ? "enabled" : "disabled");
@@ -162,7 +163,7 @@ bool llama_rn_context::loadModel(common_params &params_)
 }
 
 
-bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
+bool llama_cap_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
     const char * tmpl = llama_model_chat_template(model, name);
     if (tmpl == nullptr) {
         return false;
@@ -170,7 +171,7 @@ bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *nam
     return common_chat_verify_template(tmpl, use_jinja);
 }
 
-common_chat_params llama_rn_context::getFormattedChatWithJinja(
+common_chat_params llama_cap_context::getFormattedChatWithJinja(
     const std::string& messages,
     const std::string& chat_template,
     const std::string& json_schema,
@@ -222,7 +223,7 @@ common_chat_params llama_rn_context::getFormattedChatWithJinja(
     }
 }
 
-std::string llama_rn_context::getFormattedChat(
+std::string llama_cap_context::getFormattedChat(
     const std::string &messages,
     const std::string &chat_template
 ) const {
@@ -239,14 +240,14 @@ std::string llama_rn_context::getFormattedChat(
     }
 }
 
-llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, const std::vector<std::string> &media_paths) {
+llama_cap_tokenize_result llama_cap_context::tokenize(const std::string &text, const std::vector<std::string> &media_paths) {
     if (media_paths.size() > 0) {
         if (!isMultimodalEnabled()) {
             throw std::runtime_error("Multimodal is not enabled but media paths are provided");
         }
         auto result = tokenizeWithMedia(mtmd_wrapper, text, media_paths);
         mtmd_input_chunks_free(result.chunks);
-        llama_rn_tokenize_result tokenize_result = {
+        llama_cap_tokenize_result tokenize_result = {
             .tokens = result.tokens,
             .has_media = true,
             .bitmap_hashes = result.bitmap_hashes,
@@ -257,7 +258,7 @@ llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, con
     }
     std::vector<llama_token> text_tokens;
     text_tokens = common_tokenize(ctx, text, false);
-    llama_rn_tokenize_result tokenize_result = {
+    llama_cap_tokenize_result tokenize_result = {
         .tokens = text_tokens,
         .has_media = false,
         .bitmap_hashes = {},
@@ -267,7 +268,7 @@ llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, con
     return tokenize_result;
 }
 
-int llama_rn_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
+int llama_cap_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
     for (auto &la : lora) {
         la.ptr = llama_adapter_lora_init(model, la.path.c_str());
         if (la.ptr == nullptr) {
@@ -280,18 +281,18 @@ int llama_rn_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lo
     return 0;
 }
 
-void llama_rn_context::removeLoraAdapters() {
+void llama_cap_context::removeLoraAdapters() {
     this->lora.clear();
     common_set_adapter_lora(ctx, this->lora); // apply empty list
 }
 
-std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
+std::vector<common_adapter_lora_info> llama_cap_context::getLoadedLoraAdapters() {
    return this->lora;
 }
 
-bool llama_rn_context::initMultimodal(const std::string &mmproj_path, bool use_gpu) {
+bool llama_cap_context::initMultimodal(const std::string &mmproj_path, bool use_gpu) {
     try {
-        mtmd_wrapper = new llama_rn_context_mtmd(mmproj_path, use_gpu, model, ctx, params, has_multimodal, params);
+        mtmd_wrapper = new llama_cap_context_mtmd(mmproj_path, use_gpu, model, ctx, params, has_multimodal, params);
         return true;
     } catch (const std::exception& e) {
         LOG_ERROR("[DEBUG] Failed to initialize multimodal: %s", e.what());
@@ -299,19 +300,19 @@ bool llama_rn_context::initMultimodal(const std::string &mmproj_path, bool use_g
     }
 }
 
-bool llama_rn_context::isMultimodalEnabled() const {
+bool llama_cap_context::isMultimodalEnabled() const {
     return mtmd_wrapper != nullptr && mtmd_wrapper->isEnabled(has_multimodal);
 }
 
-bool llama_rn_context::isMultimodalSupportVision() const {
+bool llama_cap_context::isMultimodalSupportVision() const {
     return isMultimodalEnabled() && mtmd_wrapper->supportVision();
 }
 
-bool llama_rn_context::isMultimodalSupportAudio() const {
+bool llama_cap_context::isMultimodalSupportAudio() const {
     return isMultimodalEnabled() && mtmd_wrapper->supportAudio();
 }
 
-void llama_rn_context::releaseMultimodal() {
+void llama_cap_context::releaseMultimodal() {
     if (mtmd_wrapper != nullptr) {
         delete mtmd_wrapper;
         mtmd_wrapper = nullptr;
@@ -319,9 +320,9 @@ void llama_rn_context::releaseMultimodal() {
     }
 }
 
-bool llama_rn_context::initVocoder(const std::string &vocoder_model_path, int batch_size) {
+bool llama_cap_context::initVocoder(const std::string &vocoder_model_path, int batch_size) {
     try {
-        tts_wrapper = new llama_rn_context_tts(vocoder_model_path, batch_size);
+        tts_wrapper = new llama_cap_context_tts(vocoder_model_path, batch_size);
         has_vocoder = true;
         return true;
     } catch (const std::exception& e) {
@@ -330,11 +331,11 @@ bool llama_rn_context::initVocoder(const std::string &vocoder_model_path, int ba
     }
 }
 
-bool llama_rn_context::isVocoderEnabled() const {
+bool llama_cap_context::isVocoderEnabled() const {
     return has_vocoder && tts_wrapper != nullptr;
 }
 
-void llama_rn_context::releaseVocoder() {
+void llama_cap_context::releaseVocoder() {
     if (tts_wrapper != nullptr) {
         delete tts_wrapper;
         tts_wrapper = nullptr;
@@ -342,4 +343,57 @@ void llama_rn_context::releaseVocoder() {
     has_vocoder = false;
 }
 
+// Speculative decoding methods
+bool llama_cap_context::loadDraftModel(const std::string &draft_model_path) {
+    if (draft_model_path.empty()) {
+        return false;
+    }
+
+    // Create draft model parameters (based on main model params)
+    common_params draft_params = params;
+    draft_params.model.path = draft_model_path;
+
+    // Mobile optimization: smaller context for draft model
+    if (mobile_speculative) {
+        draft_params.n_ctx = std::min(params.n_ctx, 1024); // Limit draft context
+        draft_params.n_batch = std::min(params.n_batch, 128); // Smaller batch
+    }
+
+    try {
+        // For now, use simplified draft model initialization
+        // This would be expanded in a full implementation to properly initialize
+        // the draft model and context
+
+        // TODO: Implement proper draft model loading
+        // draft_model = llama_load_model_from_file(draft_model_path.c_str(), draft_params);
+        // draft_ctx = llama_new_context_with_model(draft_model, draft_params);
+
+        // For this implementation, we'll disable speculative decoding
+        // until proper model loading is implemented
+        printf("Draft model loading not yet implemented - falling back to regular decoding\n");
+        speculative_enabled = false;
+        return false;
+
+    } catch (const std::exception& e) {
+        printf("Failed to load draft model: %s\n", e.what());
+        releaseDraftModel();
+    }
+
+    return false;
+}
+
+void llama_cap_context::releaseDraftModel() {
+    if (draft_ctx) {
+        // Note: draft_ctx and draft_model are managed by common_init_result
+        // They will be automatically cleaned up
+        draft_ctx = nullptr;
+        draft_model = nullptr;
+    }
+    speculative_enabled = false;
+}
+
+bool llama_cap_context::isSpectulativeEnabled() const {
+    return speculative_enabled && draft_model && draft_ctx;
+}
+
 }
```