llama-cpp-capacitor 0.0.13 → 0.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/LlamaCpp.podspec +17 -17
  2. package/Package.swift +27 -27
  3. package/README.md +717 -574
  4. package/android/build.gradle +88 -69
  5. package/android/src/main/AndroidManifest.xml +2 -2
  6. package/android/src/main/CMakeLists-arm64.txt +131 -0
  7. package/android/src/main/CMakeLists-x86_64.txt +135 -0
  8. package/android/src/main/CMakeLists.txt +35 -52
  9. package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCpp.java +956 -717
  10. package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCppPlugin.java +710 -590
  11. package/android/src/main/jni-utils.h +7 -7
  12. package/android/src/main/jni.cpp +868 -127
  13. package/cpp/{rn-completion.cpp → cap-completion.cpp} +202 -24
  14. package/cpp/{rn-completion.h → cap-completion.h} +22 -11
  15. package/cpp/{rn-llama.cpp → cap-llama.cpp} +81 -27
  16. package/cpp/{rn-llama.h → cap-llama.h} +32 -20
  17. package/cpp/{rn-mtmd.hpp → cap-mtmd.hpp} +15 -15
  18. package/cpp/{rn-tts.cpp → cap-tts.cpp} +12 -12
  19. package/cpp/{rn-tts.h → cap-tts.h} +14 -14
  20. package/cpp/ggml-cpu/ggml-cpu-impl.h +30 -0
  21. package/dist/docs.json +100 -3
  22. package/dist/esm/definitions.d.ts +45 -2
  23. package/dist/esm/definitions.js.map +1 -1
  24. package/dist/esm/index.d.ts +22 -0
  25. package/dist/esm/index.js +66 -3
  26. package/dist/esm/index.js.map +1 -1
  27. package/dist/plugin.cjs.js +71 -3
  28. package/dist/plugin.cjs.js.map +1 -1
  29. package/dist/plugin.js +71 -3
  30. package/dist/plugin.js.map +1 -1
  31. package/ios/Sources/LlamaCppPlugin/LlamaCpp.swift +596 -596
  32. package/ios/Sources/LlamaCppPlugin/LlamaCppPlugin.swift +591 -514
  33. package/ios/Tests/LlamaCppPluginTests/LlamaCppPluginTests.swift +15 -15
  34. package/package.json +111 -110
package/cpp/{rn-completion.cpp → cap-completion.cpp}
@@ -1,14 +1,15 @@
- #include "rn-completion.h"
- #include "rn-llama.h"
- #include "rn-tts.h"
- #include "rn-mtmd.hpp"
+ #include "cap-completion.h"
+ #include "cap-llama.h"
+ #include "cap-tts.h"
+ #include "cap-mtmd.hpp"
+ #include <algorithm> // For std::sort in speculative decoding

  // Include multimodal support
  #include "tools/mtmd/mtmd.h"
  #include "tools/mtmd/mtmd-helper.h"
  #include "tools/mtmd/clip.h"

- namespace rnllama {
+ namespace capllama {

  static bool ends_with(const std::string &str, const std::string &suffix)
  {
@@ -67,19 +68,19 @@ static std::vector<llama_token> format_rerank(const llama_vocab * vocab, const s
  }

  // Constructor
- llama_rn_context_completion::llama_rn_context_completion(llama_rn_context* parent)
+ llama_cap_context_completion::llama_cap_context_completion(llama_cap_context* parent)
  : parent_ctx(parent) {
  }

  // Destructor
- llama_rn_context_completion::~llama_rn_context_completion() {
+ llama_cap_context_completion::~llama_cap_context_completion() {
  if (ctx_sampling != nullptr) {
  common_sampler_free(ctx_sampling);
  ctx_sampling = nullptr;
  }
  }

- void llama_rn_context_completion::rewind() {
+ void llama_cap_context_completion::rewind() {
  is_interrupted = false;
  parent_ctx->params.antiprompt.clear();
  parent_ctx->params.sampling.grammar.clear();
@@ -105,7 +106,7 @@ void llama_rn_context_completion::rewind() {
  }
  }

- bool llama_rn_context_completion::initSampling() {
+ bool llama_cap_context_completion::initSampling() {
  if (ctx_sampling != nullptr) {
  common_sampler_free(ctx_sampling);
  }
@@ -113,7 +114,7 @@ bool llama_rn_context_completion::initSampling() {
  return ctx_sampling != nullptr;
  }

- void llama_rn_context_completion::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
+ void llama_cap_context_completion::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
  const int n_left = parent_ctx->n_ctx - parent_ctx->params.n_keep;
  const int n_block_size = n_left / 2;
  const int erased_blocks = (prompt_tokens.size() - parent_ctx->params.n_keep - n_block_size) / n_block_size;
@@ -135,7 +136,7 @@ void llama_rn_context_completion::truncatePrompt(std::vector<llama_token> &promp
  prompt_tokens = new_tokens;
  }

- void llama_rn_context_completion::loadPrompt(const std::vector<std::string> &media_paths) {
+ void llama_cap_context_completion::loadPrompt(const std::vector<std::string> &media_paths) {
  bool has_media = !media_paths.empty();

  if (!has_media) {
@@ -203,11 +204,11 @@ void llama_rn_context_completion::loadPrompt(const std::vector<std::string> &med
  n_past, embd.size(), num_prompt_tokens, has_media ? 1 : 0);
  }

- void llama_rn_context_completion::beginCompletion() {
+ void llama_cap_context_completion::beginCompletion() {
  beginCompletion(COMMON_CHAT_FORMAT_CONTENT_ONLY, COMMON_REASONING_FORMAT_NONE, false);
  }

- void llama_rn_context_completion::beginCompletion(int chat_format, common_reasoning_format reasoning_format, bool thinking_forced_open) {
+ void llama_cap_context_completion::beginCompletion(int chat_format, common_reasoning_format reasoning_format, bool thinking_forced_open) {
  // number of tokens to keep when resetting context
  n_remain = parent_ctx->params.n_predict;
  llama_perf_context_reset(parent_ctx->ctx);
@@ -218,11 +219,11 @@ void llama_rn_context_completion::beginCompletion(int chat_format, common_reason
  current_thinking_forced_open = thinking_forced_open;
  }

- void llama_rn_context_completion::endCompletion() {
+ void llama_cap_context_completion::endCompletion() {
  is_predicting = false;
  }

- completion_token_output llama_rn_context_completion::nextToken()
+ completion_token_output llama_cap_context_completion::nextToken()
  {
  completion_token_output result;
  result.tok = -1;
@@ -344,7 +345,7 @@ completion_token_output llama_rn_context_completion::nextToken()
  return result;
  }

- size_t llama_rn_context_completion::findStoppingStrings(const std::string &text, const size_t last_token_size,
+ size_t llama_cap_context_completion::findStoppingStrings(const std::string &text, const size_t last_token_size,
  const stop_type type)
  {
  size_t stop_pos = std::string::npos;
@@ -376,7 +377,7 @@ size_t llama_rn_context_completion::findStoppingStrings(const std::string &text,
  return stop_pos;
  }

- completion_token_output llama_rn_context_completion::doCompletion()
+ completion_token_output llama_cap_context_completion::doCompletion()
  {
  completion_token_output token_with_probs = nextToken();

@@ -444,7 +445,7 @@ completion_token_output llama_rn_context_completion::doCompletion()
  return token_with_probs;
  }

- completion_partial_output llama_rn_context_completion::getPartialOutput(const std::string &token_text) {
+ completion_partial_output llama_cap_context_completion::getPartialOutput(const std::string &token_text) {
  common_chat_syntax syntax;
  syntax.format = static_cast<common_chat_format>(current_chat_format);
  syntax.reasoning_format = current_reasoning_format;
@@ -463,7 +464,7 @@ completion_partial_output llama_rn_context_completion::getPartialOutput(const st
  return result;
  }

- std::vector<float> llama_rn_context_completion::getEmbedding(common_params &embd_params)
+ std::vector<float> llama_cap_context_completion::getEmbedding(common_params &embd_params)
  {
  static const int n_embd = llama_model_n_embd(llama_get_model(parent_ctx->ctx));
  if (!embd_params.embedding)
@@ -489,7 +490,7 @@ std::vector<float> llama_rn_context_completion::getEmbedding(common_params &embd
  return out;
  }

- std::vector<float> llama_rn_context_completion::rerank(const std::string &query, const std::vector<std::string> &documents)
+ std::vector<float> llama_cap_context_completion::rerank(const std::string &query, const std::vector<std::string> &documents)
  {
  std::vector<float> scores;

@@ -548,7 +549,7 @@ std::vector<float> llama_rn_context_completion::rerank(const std::string &query,
  return scores;
  }

- std::string llama_rn_context_completion::bench(int pp, int tg, int pl, int nr)
+ std::string llama_cap_context_completion::bench(int pp, int tg, int pl, int nr)
  {
  if (is_predicting) {
  LOG_ERROR("cannot benchmark while predicting", "");
@@ -563,7 +564,7 @@ std::string llama_rn_context_completion::bench(int pp, int tg, int pl, int nr)
  double pp_std = 0;
  double tg_std = 0;

- // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30)
+ // TODO: move batch into llama_cap_context (related https://github.com/mybigday/llama.rn/issues/30)
  llama_batch batch = llama_batch_init(
  std::min(pp, parent_ctx->params.n_ubatch), // max n_tokens is limited by n_ubatch
  0, // No embeddings
@@ -656,7 +657,7 @@ std::string llama_rn_context_completion::bench(int pp, int tg, int pl, int nr)
  std::string("]");
  }

- void llama_rn_context_completion::processMedia(
+ void llama_cap_context_completion::processMedia(
  const std::string &prompt,
  const std::vector<std::string> &media_paths
  ) {
@@ -678,4 +679,181 @@ void llama_rn_context_completion::processMedia(
  );
  }

- } // namespace rnllama
+ // Speculative decoding implementation
+ completion_token_output llama_cap_context_completion::nextTokenSpeculative() {
+ // Enable speculative mode
+ use_speculative = parent_ctx->isSpectulativeEnabled();
+
+ if (!use_speculative) {
+ // Fallback to regular token generation
+ return nextToken();
+ }
+
+ completion_token_output result;
+
+ // If we don't have drafted tokens, draft some
+ if (draft_tokens.empty()) {
+ draft_tokens = draftTokens(parent_ctx->speculative_samples);
+ n_drafted = draft_tokens.size();
+ }
+
+ // Try to verify and accept draft tokens
+ if (!draft_tokens.empty()) {
+ int accepted = verifyAndAcceptTokens(draft_tokens);
+ n_accepted += accepted;
+
+ if (accepted > 0) {
+ // Use the first accepted token
+ result.tok = draft_tokens[0];
+ draft_tokens.erase(draft_tokens.begin());
+
+ // Update context
+ embd.push_back(result.tok);
+ --n_remain;
+ num_tokens_predicted++;
+
+ has_next_token = parent_ctx->params.n_predict == -1 || n_remain != 0;
+ return result;
+ }
+ }
+
+ // If no tokens were accepted, fall back to regular sampling
+ draft_tokens.clear();
+ return nextToken();
+ }
+
+ std::vector<llama_token> llama_cap_context_completion::draftTokens(int n_draft) {
+ std::vector<llama_token> drafted;
+
+ // Check if draft model is available
+ if (!parent_ctx->draft_ctx || !parent_ctx->draft_model || !parent_ctx->isSpectulativeEnabled()) {
+ return drafted; // Return empty vector - will fallback to regular decoding
+ }
+
+ // Copy current context to draft model
+ // Note: KV cache copying may not be available in all llama.cpp versions
+ // For now, we'll skip this optimization and let the draft model generate from scratch
+ // This is still effective for speculative decoding
+
+ // Generate draft tokens using the smaller model
+ for (int i = 0; i < n_draft; i++) {
+ // Create batch with current context
+ llama_batch batch = llama_batch_init(1, 0, 1);
+
+ if (!embd.empty()) {
+ llama_batch_add(&batch, embd.back(), n_past + i, {0}, true);
+ }
+
+ // Decode with draft model
+ if (llama_decode(parent_ctx->draft_ctx, batch) != 0) {
+ llama_batch_free(batch);
+ break;
+ }
+
+ // Sample from draft model (using faster, simpler sampling)
+ const float temp = 0.8f; // Fixed temperature for draft
+ auto logits = llama_get_logits_ith(parent_ctx->draft_ctx, -1);
+ const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(parent_ctx->draft_model));
+
+ // Simple sampling for draft model
+ std::vector<llama_token_data> candidates;
+ candidates.reserve(n_vocab);
+
+ for (int token_id = 0; token_id < n_vocab; token_id++) {
+ candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+ }
+
+ llama_token_data_array candidates_p = {
+ candidates.data(),
+ candidates.size(),
+ -1,
+ false,
+ };
+
+ // Simple temperature sampling for draft model
+ for (int token_id = 0; token_id < n_vocab; token_id++) {
+ candidates[token_id].logit /= temp; // Apply temperature
+ }
+
+ // Sort by logit (simple greedy sampling for draft)
+ std::sort(candidates.begin(), candidates.end(),
+ [](const llama_token_data& a, const llama_token_data& b) {
+ return a.logit > b.logit;
+ });
+
+ llama_token token = candidates[0].id; // Take top token
+ drafted.push_back(token);
+
+ // Clean up
+ llama_batch_free(batch);
+
+ // Stop if we hit EOS
+ const llama_vocab * vocab = llama_model_get_vocab(parent_ctx->draft_model);
+ if (llama_vocab_is_eog(vocab, token)) {
+ break;
+ }
+ }
+
+ return drafted;
+ }
+
+ int llama_cap_context_completion::verifyAndAcceptTokens(const std::vector<llama_token> &draft_tokens) {
+ if (draft_tokens.empty() || !parent_ctx->ctx) {
+ return 0;
+ }
+
+ int accepted = 0;
+
+ // Verify each draft token against the main model
+ for (size_t i = 0; i < draft_tokens.size(); i++) {
+ // Create batch for verification
+ llama_batch batch = llama_batch_init(1, 0, 1);
+
+ if (!embd.empty()) {
+ llama_batch_add(&batch, embd.back(), n_past + accepted, {0}, true);
+ }
+
+ // Decode with main model
+ if (llama_decode(parent_ctx->ctx, batch) != 0) {
+ llama_batch_free(batch);
+ break;
+ }
+
+ // Get logits from main model
+ auto logits = llama_get_logits_ith(parent_ctx->ctx, -1);
+ const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(parent_ctx->model));
+
+ // Sample from main model
+ std::vector<llama_token_data> candidates;
+ candidates.reserve(n_vocab);
+
+ for (int token_id = 0; token_id < n_vocab; token_id++) {
+ candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+ }
+
+ llama_token_data_array candidates_p = {
+ candidates.data(),
+ candidates.size(),
+ -1,
+ false,
+ };
+
+ // Apply sampling from main model using common_sampler
+ llama_token main_token = common_sampler_sample(ctx_sampling, parent_ctx->ctx, -1);
+
+ // Accept if tokens match
+ if (main_token == draft_tokens[i]) {
+ accepted++;
+ common_sampler_accept(ctx_sampling, main_token, true);
+ } else {
+ // Reject and stop verification
+ break;
+ }
+
+ llama_batch_free(batch);
+ }
+
+ return accepted;
+ }
+
+ } // namespace capllama
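
The new nextTokenSpeculative() / draftTokens() / verifyAndAcceptTokens() methods in cap-completion.cpp follow the usual draft-then-verify pattern: the small draft model proposes up to n_draft tokens, and the main model accepts the longest prefix that matches its own sampling choices. The standalone sketch below is not part of the package; the two callbacks are placeholders standing in for the draft and main model steps (they are not llama.cpp APIs), and it only restates the acceptance rule the diff implements.

// Hedged sketch of the draft-then-verify acceptance rule, assuming nothing
// beyond the C++ standard library. draft_next/verify_next are placeholder
// callbacks standing in for the draft-model and main-model sampling steps.
#include <cstdint>
#include <functional>
#include <vector>

using token_t = int32_t;

// Draft n_draft tokens with the small model, then accept the longest prefix
// that the main model would also have produced (mirrors the break-on-mismatch
// loop in verifyAndAcceptTokens()).
static std::vector<token_t> speculate_once(const std::function<token_t()> &draft_next,
                                           const std::function<token_t()> &verify_next,
                                           int n_draft) {
    std::vector<token_t> drafted;
    drafted.reserve(n_draft);
    for (int i = 0; i < n_draft; ++i) {
        drafted.push_back(draft_next());
    }

    std::vector<token_t> accepted;
    for (token_t t : drafted) {
        if (verify_next() != t) {
            // First mismatch: stop verifying. This package then falls back to
            // regular sampling via nextToken(); classic speculative decoding
            // would instead emit the main model's token at this position.
            break;
        }
        accepted.push_back(t);
    }
    return accepted;
}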
package/cpp/{rn-completion.h → cap-completion.h}
@@ -1,5 +1,5 @@
- #ifndef RN_COMPLETION_H
- #define RN_COMPLETION_H
+ #ifndef CAP_COMPLETION_H
+ #define CAP_COMPLETION_H

  #include "common.h"
  #include "llama.h"
@@ -9,7 +9,7 @@

  using json = nlohmann::ordered_json;

- namespace rnllama {
+ namespace capllama {

  // Utility functions
  static inline void llama_batch_clear(llama_batch *batch) {
@@ -17,9 +17,9 @@ static inline void llama_batch_clear(llama_batch *batch) {
  }

  // Forward declarations
- struct llama_rn_context;
+ struct llama_cap_context;

- // Types defined in rn-llama.h (needed here for compilation)
+ // Types defined in cap-llama.h (needed here for compilation)
  enum stop_type
  {
  STOP_FULL,
@@ -47,9 +47,9 @@ struct completion_partial_output
  };

  // Completion context class
- struct llama_rn_context_completion {
+ struct llama_cap_context_completion {
  // Reference to parent context
- llama_rn_context* parent_ctx;
+ llama_cap_context* parent_ctx;

  // Completion state fields
  bool is_predicting = false;
@@ -77,12 +77,18 @@ struct llama_rn_context_completion {

  // Sampling context
  common_sampler *ctx_sampling = nullptr;
+
+ // Speculative decoding state
+ std::vector<llama_token> draft_tokens;
+ int n_drafted = 0;
+ int n_accepted = 0;
+ bool use_speculative = false;

  // Constructor
- llama_rn_context_completion(llama_rn_context* parent);
+ llama_cap_context_completion(llama_cap_context* parent);

  // Destructor
- ~llama_rn_context_completion();
+ ~llama_cap_context_completion();

  // Completion processing methods
  void rewind();
@@ -93,9 +99,14 @@ struct llama_rn_context_completion {
  void beginCompletion(int chat_format, common_reasoning_format reasoning_format, bool thinking_forced_open);
  void endCompletion();
  completion_token_output nextToken();
+ completion_token_output nextTokenSpeculative(); // NEW: Speculative version
  size_t findStoppingStrings(const std::string &text, const size_t last_token_size, const stop_type type);
  completion_token_output doCompletion();
  completion_partial_output getPartialOutput(const std::string &token_text);
+
+ // Speculative decoding methods
+ std::vector<llama_token> draftTokens(int n_draft);
+ int verifyAndAcceptTokens(const std::vector<llama_token> &draft_tokens);

  // Embedding methods
  std::vector<float> getEmbedding(common_params &embd_params);
@@ -111,6 +122,6 @@ struct llama_rn_context_completion {
  );
  };

- } // namespace rnllama
+ } // namespace capllama

- #endif /* RN_COMPLETION_H */
+ #endif /* CAP_COMPLETION_H */
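
The n_drafted / n_accepted counters added to llama_cap_context_completion are enough to report how closely the draft model tracks the main model. A hypothetical helper (not part of the published header) could expose the acceptance rate like this:

// Hypothetical helper, not part of cap-completion.h: acceptance rate of
// speculative decoding computed from the counters added in this diff.
static inline double speculative_acceptance_rate(int n_drafted, int n_accepted) {
    // Guard against division by zero before any tokens have been drafted.
    return n_drafted > 0 ? static_cast<double>(n_accepted) / n_drafted : 0.0;
}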
package/cpp/{rn-llama.cpp → cap-llama.cpp}
@@ -1,14 +1,14 @@
- #include "rn-llama.h"
- #include "rn-tts.h"
- #include "rn-mtmd.hpp"
- #include "rn-completion.h"
+ #include "cap-llama.h"
+ #include "cap-tts.h"
+ #include "cap-mtmd.hpp"
+ #include "cap-completion.h"

  // Include multimodal support
  #include "tools/mtmd/mtmd.h"
  #include "tools/mtmd/mtmd-helper.h"
  #include "tools/mtmd/clip.h"

- namespace rnllama {
+ namespace capllama {

  static const std::vector<lm_ggml_type> kv_cache_types = {
  LM_GGML_TYPE_F32,
@@ -122,7 +122,7 @@ std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::co
  }


- llama_rn_context::~llama_rn_context() {
+ llama_cap_context::~llama_cap_context() {
  if (completion != nullptr) {
  delete completion;
  completion = nullptr;
@@ -130,9 +130,10 @@ llama_rn_context::~llama_rn_context() {

  releaseMultimodal();
  releaseVocoder();
+ releaseDraftModel(); // Clean up speculative decoding resources
  }

- bool llama_rn_context::loadModel(common_params &params_)
+ bool llama_cap_context::loadModel(common_params &params_)
  {
  params = params_;
  llama_init = common_init_from_params(params);
@@ -150,7 +151,7 @@ bool llama_rn_context::loadModel(common_params &params_)
  if (completion != nullptr) {
  delete completion;
  }
- completion = new llama_rn_context_completion(this);
+ completion = new llama_cap_context_completion(this);

  // Initialize context shift flag
  LOG_INFO("ctx_shift: %s", params.ctx_shift ? "enabled" : "disabled");
@@ -162,7 +163,7 @@ bool llama_rn_context::loadModel(common_params &params_)
  }


- bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
+ bool llama_cap_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
  const char * tmpl = llama_model_chat_template(model, name);
  if (tmpl == nullptr) {
  return false;
@@ -170,7 +171,7 @@ bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *nam
  return common_chat_verify_template(tmpl, use_jinja);
  }

- common_chat_params llama_rn_context::getFormattedChatWithJinja(
+ common_chat_params llama_cap_context::getFormattedChatWithJinja(
  const std::string& messages,
  const std::string& chat_template,
  const std::string& json_schema,
@@ -222,7 +223,7 @@ common_chat_params llama_rn_context::getFormattedChatWithJinja(
  }
  }

- std::string llama_rn_context::getFormattedChat(
+ std::string llama_cap_context::getFormattedChat(
  const std::string &messages,
  const std::string &chat_template
  ) const {
@@ -239,14 +240,14 @@ std::string llama_rn_context::getFormattedChat(
  }
  }

- llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, const std::vector<std::string> &media_paths) {
+ llama_cap_tokenize_result llama_cap_context::tokenize(const std::string &text, const std::vector<std::string> &media_paths) {
  if (media_paths.size() > 0) {
  if (!isMultimodalEnabled()) {
  throw std::runtime_error("Multimodal is not enabled but media paths are provided");
  }
  auto result = tokenizeWithMedia(mtmd_wrapper, text, media_paths);
  mtmd_input_chunks_free(result.chunks);
- llama_rn_tokenize_result tokenize_result = {
+ llama_cap_tokenize_result tokenize_result = {
  .tokens = result.tokens,
  .has_media = true,
  .bitmap_hashes = result.bitmap_hashes,
@@ -257,7 +258,7 @@ llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, con
  }
  std::vector<llama_token> text_tokens;
  text_tokens = common_tokenize(ctx, text, false);
- llama_rn_tokenize_result tokenize_result = {
+ llama_cap_tokenize_result tokenize_result = {
  .tokens = text_tokens,
  .has_media = false,
  .bitmap_hashes = {},
@@ -267,7 +268,7 @@ llama_rn_tokenize_result llama_rn_context::tokenize(const std::string &text, con
  return tokenize_result;
  }

- int llama_rn_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
+ int llama_cap_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
  for (auto &la : lora) {
  la.ptr = llama_adapter_lora_init(model, la.path.c_str());
  if (la.ptr == nullptr) {
@@ -280,18 +281,18 @@ int llama_rn_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lo
  return 0;
  }

- void llama_rn_context::removeLoraAdapters() {
+ void llama_cap_context::removeLoraAdapters() {
  this->lora.clear();
  common_set_adapter_lora(ctx, this->lora); // apply empty list
  }

- std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
+ std::vector<common_adapter_lora_info> llama_cap_context::getLoadedLoraAdapters() {
  return this->lora;
  }

- bool llama_rn_context::initMultimodal(const std::string &mmproj_path, bool use_gpu) {
+ bool llama_cap_context::initMultimodal(const std::string &mmproj_path, bool use_gpu) {
  try {
- mtmd_wrapper = new llama_rn_context_mtmd(mmproj_path, use_gpu, model, ctx, params, has_multimodal, params);
+ mtmd_wrapper = new llama_cap_context_mtmd(mmproj_path, use_gpu, model, ctx, params, has_multimodal, params);
  return true;
  } catch (const std::exception& e) {
  LOG_ERROR("[DEBUG] Failed to initialize multimodal: %s", e.what());
@@ -299,19 +300,19 @@ bool llama_rn_context::initMultimodal(const std::string &mmproj_path, bool use_g
  }
  }

- bool llama_rn_context::isMultimodalEnabled() const {
+ bool llama_cap_context::isMultimodalEnabled() const {
  return mtmd_wrapper != nullptr && mtmd_wrapper->isEnabled(has_multimodal);
  }

- bool llama_rn_context::isMultimodalSupportVision() const {
+ bool llama_cap_context::isMultimodalSupportVision() const {
  return isMultimodalEnabled() && mtmd_wrapper->supportVision();
  }

- bool llama_rn_context::isMultimodalSupportAudio() const {
+ bool llama_cap_context::isMultimodalSupportAudio() const {
  return isMultimodalEnabled() && mtmd_wrapper->supportAudio();
  }

- void llama_rn_context::releaseMultimodal() {
+ void llama_cap_context::releaseMultimodal() {
  if (mtmd_wrapper != nullptr) {
  delete mtmd_wrapper;
  mtmd_wrapper = nullptr;
@@ -319,9 +320,9 @@ void llama_rn_context::releaseMultimodal() {
  }
  }

- bool llama_rn_context::initVocoder(const std::string &vocoder_model_path, int batch_size) {
+ bool llama_cap_context::initVocoder(const std::string &vocoder_model_path, int batch_size) {
  try {
- tts_wrapper = new llama_rn_context_tts(vocoder_model_path, batch_size);
+ tts_wrapper = new llama_cap_context_tts(vocoder_model_path, batch_size);
  has_vocoder = true;
  return true;
  } catch (const std::exception& e) {
@@ -330,11 +331,11 @@ bool llama_rn_context::initVocoder(const std::string &vocoder_model_path, int ba
  }
  }

- bool llama_rn_context::isVocoderEnabled() const {
+ bool llama_cap_context::isVocoderEnabled() const {
  return has_vocoder && tts_wrapper != nullptr;
  }

- void llama_rn_context::releaseVocoder() {
+ void llama_cap_context::releaseVocoder() {
  if (tts_wrapper != nullptr) {
  delete tts_wrapper;
  tts_wrapper = nullptr;
@@ -342,4 +343,57 @@ void llama_rn_context::releaseVocoder() {
  has_vocoder = false;
  }

+ // Speculative decoding methods
+ bool llama_cap_context::loadDraftModel(const std::string &draft_model_path) {
+ if (draft_model_path.empty()) {
+ return false;
+ }
+
+ // Create draft model parameters (based on main model params)
+ common_params draft_params = params;
+ draft_params.model.path = draft_model_path;
+
+ // Mobile optimization: smaller context for draft model
+ if (mobile_speculative) {
+ draft_params.n_ctx = std::min(params.n_ctx, 1024); // Limit draft context
+ draft_params.n_batch = std::min(params.n_batch, 128); // Smaller batch
+ }
+
+ try {
+ // For now, use simplified draft model initialization
+ // This would be expanded in a full implementation to properly initialize
+ // the draft model and context
+
+ // TODO: Implement proper draft model loading
+ // draft_model = llama_load_model_from_file(draft_model_path.c_str(), draft_params);
+ // draft_ctx = llama_new_context_with_model(draft_model, draft_params);
+
+ // For this implementation, we'll disable speculative decoding
+ // until proper model loading is implemented
+ printf("Draft model loading not yet implemented - falling back to regular decoding\n");
+ speculative_enabled = false;
+ return false;
+
+ } catch (const std::exception& e) {
+ printf("Failed to load draft model: %s\n", e.what());
+ releaseDraftModel();
+ }
+
+ return false;
+ }
+
+ void llama_cap_context::releaseDraftModel() {
+ if (draft_ctx) {
+ // Note: draft_ctx and draft_model are managed by common_init_result
+ // They will be automatically cleaned up
+ draft_ctx = nullptr;
+ draft_model = nullptr;
+ }
+ speculative_enabled = false;
+ }
+
+ bool llama_cap_context::isSpectulativeEnabled() const {
+ return speculative_enabled && draft_model && draft_ctx;
+ }
+
  }
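
loadDraftModel() in cap-llama.cpp is currently a stub: it prints a notice, sets speculative_enabled = false, and returns false, so isSpectulativeEnabled() never reports true in this release. Below is a minimal sketch of how a draft model/context pair could eventually be managed; it is not the package's implementation and assumes the bundled llama.cpp revision exposes llama_model_load_from_file() and llama_init_from_model() (older revisions use llama_load_model_from_file() and llama_new_context_with_model(), as in the commented-out lines above).

// Hedged sketch only: load and release a draft model/context pair with the
// llama.cpp C API, mirroring the mobile limits (n_ctx <= 1024, n_batch <= 128)
// used in loadDraftModel(). draft_handles is an illustrative helper type.
#include "llama.h"
#include <algorithm>
#include <string>

struct draft_handles {
    llama_model   *model = nullptr;
    llama_context *ctx   = nullptr;
};

static bool load_draft(draft_handles &out, const std::string &path,
                       int main_n_ctx, int main_n_batch) {
    llama_model_params mparams = llama_model_default_params();
    out.model = llama_model_load_from_file(path.c_str(), mparams);
    if (out.model == nullptr) {
        return false;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx   = std::min(main_n_ctx, 1024);  // limit draft context
    cparams.n_batch = std::min(main_n_batch, 128); // smaller batch

    out.ctx = llama_init_from_model(out.model, cparams);
    if (out.ctx == nullptr) {
        llama_model_free(out.model);
        out.model = nullptr;
        return false;
    }
    return true;
}

static void free_draft(draft_handles &h) {
    if (h.ctx)   { llama_free(h.ctx);         h.ctx   = nullptr; }
    if (h.model) { llama_model_free(h.model); h.model = nullptr; }
}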