llama-cpp-capacitor 0.0.13 → 0.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/LlamaCpp.podspec +17 -17
  2. package/Package.swift +27 -27
  3. package/README.md +717 -574
  4. package/android/build.gradle +88 -69
  5. package/android/src/main/AndroidManifest.xml +2 -2
  6. package/android/src/main/CMakeLists-arm64.txt +131 -0
  7. package/android/src/main/CMakeLists-x86_64.txt +135 -0
  8. package/android/src/main/CMakeLists.txt +35 -52
  9. package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCpp.java +956 -717
  10. package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCppPlugin.java +710 -590
  11. package/android/src/main/jni-utils.h +7 -7
  12. package/android/src/main/jni.cpp +868 -127
  13. package/cpp/{rn-completion.cpp → cap-completion.cpp} +202 -24
  14. package/cpp/{rn-completion.h → cap-completion.h} +22 -11
  15. package/cpp/{rn-llama.cpp → cap-llama.cpp} +81 -27
  16. package/cpp/{rn-llama.h → cap-llama.h} +32 -20
  17. package/cpp/{rn-mtmd.hpp → cap-mtmd.hpp} +15 -15
  18. package/cpp/{rn-tts.cpp → cap-tts.cpp} +12 -12
  19. package/cpp/{rn-tts.h → cap-tts.h} +14 -14
  20. package/cpp/ggml-cpu/ggml-cpu-impl.h +30 -0
  21. package/dist/docs.json +100 -3
  22. package/dist/esm/definitions.d.ts +45 -2
  23. package/dist/esm/definitions.js.map +1 -1
  24. package/dist/esm/index.d.ts +22 -0
  25. package/dist/esm/index.js +66 -3
  26. package/dist/esm/index.js.map +1 -1
  27. package/dist/plugin.cjs.js +71 -3
  28. package/dist/plugin.cjs.js.map +1 -1
  29. package/dist/plugin.js +71 -3
  30. package/dist/plugin.js.map +1 -1
  31. package/ios/Sources/LlamaCppPlugin/LlamaCpp.swift +596 -596
  32. package/ios/Sources/LlamaCppPlugin/LlamaCppPlugin.swift +591 -514
  33. package/ios/Tests/LlamaCppPluginTests/LlamaCppPluginTests.swift +15 -15
  34. package/package.json +111 -110
package/cpp/{rn-llama.h → cap-llama.h} CHANGED
@@ -1,5 +1,5 @@
- #ifndef RNLLAMA_H
- #define RNLLAMA_H
+ #ifndef CAPLLAMA_H
+ #define CAPLLAMA_H

  #include <sstream>
  #include <iostream>
@@ -14,14 +14,14 @@
  #include "llama-impl.h"
  #include "sampling.h"
  #include "nlohmann/json.hpp"
- #include "rn-tts.h"
+ #include "cap-tts.h"
  #if defined(__ANDROID__)
  #include <android/log.h>
  #endif

  using json = nlohmann::ordered_json;

- namespace rnllama {
+ namespace capllama {

  std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);

@@ -29,17 +29,17 @@ std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::co

  lm_ggml_type kv_cache_type_from_str(const std::string & s);

- // Forward declarations - actual definitions are in rn-completion.h
+ // Forward declarations - actual definitions are in cap-completion.h
  // Note: enum forward declarations not allowed in C++, using include in implementation file
  struct completion_token_output;
  struct completion_partial_output;
- struct llama_rn_context_mtmd;
+ struct llama_cap_context_mtmd;

- struct llama_rn_context_tts;
+ struct llama_cap_context_tts;

- struct llama_rn_context_completion;
+ struct llama_cap_context_completion;

- struct llama_rn_tokenize_result {
+ struct llama_cap_tokenize_result {
  std::vector<llama_token> tokens;
  bool has_media = false;
  std::vector<std::string> bitmap_hashes;
@@ -48,7 +48,7 @@ struct llama_rn_tokenize_result {
  };

  // Main context class
- struct llama_rn_context {
+ struct llama_cap_context {
  // Model state fields
  llama_model *model = nullptr;
  float loading_progress = 0;
@@ -59,13 +59,25 @@ struct llama_rn_context {
  common_chat_templates_ptr templates;
  int n_ctx;

+ // Speculative decoding fields
+ llama_model *draft_model = nullptr;
+ llama_context *draft_ctx = nullptr;
+ bool speculative_enabled = false;
+ int speculative_samples = 3; // Mobile-optimized default
+ bool mobile_speculative = true;
+
  // Completion context
- llama_rn_context_completion *completion = nullptr;
+ llama_cap_context_completion *completion = nullptr;

- ~llama_rn_context();
+ ~llama_cap_context();

  bool loadModel(common_params &params_);

+ // Speculative decoding methods
+ bool loadDraftModel(const std::string &draft_model_path);
+ void releaseDraftModel();
+ bool isSpectulativeEnabled() const;
+
  // Model methods
  bool validateModelChatTemplate(bool use_jinja, const char *name) const;
  common_chat_params getFormattedChatWithJinja(
@@ -84,7 +96,7 @@ struct llama_rn_context {
  const std::string &messages,
  const std::string &chat_template
  ) const;
- llama_rn_tokenize_result tokenize(const std::string &text, const std::vector<std::string> &media_paths);
+ llama_cap_tokenize_result tokenize(const std::string &text, const std::vector<std::string> &media_paths);

  // Lora methods
  std::vector<common_adapter_lora_info> lora;
@@ -93,7 +105,7 @@ struct llama_rn_context {
  std::vector<common_adapter_lora_info> getLoadedLoraAdapters();

  // Multimodal fields and methods
- llama_rn_context_mtmd *mtmd_wrapper = nullptr;
+ llama_cap_context_mtmd *mtmd_wrapper = nullptr;
  bool has_multimodal = false;
  bool initMultimodal(const std::string &mmproj_path, bool use_gpu);
  bool isMultimodalEnabled() const;
@@ -102,7 +114,7 @@
  void releaseMultimodal();

  // TTS fields and methods (delegated to TTS context)
- llama_rn_context_tts *tts_wrapper = nullptr;
+ llama_cap_context_tts *tts_wrapper = nullptr;
  bool has_vocoder = false;
  bool initVocoder(const std::string &vocoder_model_path, int batch_size = -1);
  bool isVocoderEnabled() const;
@@ -125,15 +137,15 @@ inline void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
  void log(const char *level, const char *function, int line, const char *format, ...);

  // Logging macros
- extern bool rnllama_verbose;
+ extern bool capllama_verbose;

- #if RNLLAMA_VERBOSE != 1
+ #if CAPLLAMA_VERBOSE != 1
  #define LOG_VERBOSE(MSG, ...)
  #else
  #define LOG_VERBOSE(MSG, ...) \
  do \
  { \
- if (rnllama_verbose) \
+ if (capllama_verbose) \
  { \
  log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \
  } \
@@ -144,6 +156,6 @@ extern bool rnllama_verbose;
  #define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
  #define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)

- } // namespace rnllama
+ } // namespace capllama

- #endif /* RNLLAMA_H */
+ #endif /* CAPLLAMA_H */
package/cpp/{rn-mtmd.hpp → cap-mtmd.hpp} CHANGED
@@ -1,6 +1,6 @@
  #pragma once

- #include "rn-llama.h"
+ #include "cap-llama.h"
  #include "tools/mtmd/mtmd.h"
  #include "tools/mtmd/mtmd-helper.h"
  #include "tools/mtmd/clip.h"
@@ -8,17 +8,17 @@
  #include <vector>
  #include <cstdint>

- namespace rnllama {
+ namespace capllama {

  // MTMD context structure
- struct llama_rn_context_mtmd {
+ struct llama_cap_context_mtmd {
  mtmd_context *mtmd_ctx = nullptr;

  // State fields
  std::vector<std::string> bitmap_past_hashes;

  // Constructor - Initialize multimodal
- llama_rn_context_mtmd(
+ llama_cap_context_mtmd(
  const std::string &mmproj_path,
  bool use_gpu,
  llama_model *model,
@@ -29,7 +29,7 @@ struct llama_rn_context_mtmd {
  );

  // Destructor - Release multimodal resources
- ~llama_rn_context_mtmd();
+ ~llama_cap_context_mtmd();

  // Process media
  void processMedia(
@@ -149,11 +149,11 @@ struct mtmd_tokenize_result {
  mtmd_input_chunks* chunks = nullptr;
  };

- // Forward declaration for llama_rn_context
- struct llama_rn_context;
+ // Forward declaration for llama_cap_context
+ struct llama_cap_context;

  // Tokenize text with media function
- inline mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrapper, const std::string &prompt, const std::vector<std::string> &media_paths) {
+ inline mtmd_tokenize_result tokenizeWithMedia(llama_cap_context_mtmd *mtmd_wrapper, const std::string &prompt, const std::vector<std::string> &media_paths) {
  mtmd_tokenize_result result;
  mtmd::bitmaps bitmaps;

@@ -369,7 +369,7 @@ inline mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrappe
  return result;
  }

- inline void llama_rn_context_mtmd::processMedia(
+ inline void llama_cap_context_mtmd::processMedia(
  llama_context *ctx,
  const std::string &prompt,
  const std::vector<std::string> &media_paths,
@@ -524,7 +524,7 @@ inline void llama_rn_context_mtmd::processMedia(
  mtmd_input_chunks_free(chunks);
  }

- inline llama_rn_context_mtmd::llama_rn_context_mtmd(
+ inline llama_cap_context_mtmd::llama_cap_context_mtmd(
  const std::string &mmproj_path,
  bool use_gpu,
  llama_model *model,
@@ -580,23 +580,23 @@ inline llama_rn_context_mtmd::llama_rn_context_mtmd(
  LOG_INFO("Context shifting disabled for multimodal support");
  }

- inline llama_rn_context_mtmd::~llama_rn_context_mtmd() {
+ inline llama_cap_context_mtmd::~llama_cap_context_mtmd() {
  if (mtmd_ctx != nullptr) {
  mtmd_free(mtmd_ctx);
  mtmd_ctx = nullptr;
  }
  }

- inline bool llama_rn_context_mtmd::isEnabled(bool has_multimodal) const {
+ inline bool llama_cap_context_mtmd::isEnabled(bool has_multimodal) const {
  return has_multimodal && this != nullptr;
  }

- inline bool llama_rn_context_mtmd::supportVision() const {
+ inline bool llama_cap_context_mtmd::supportVision() const {
  return mtmd_ctx != nullptr && mtmd_support_vision(mtmd_ctx);
  }

- inline bool llama_rn_context_mtmd::supportAudio() const {
+ inline bool llama_cap_context_mtmd::supportAudio() const {
  return mtmd_ctx != nullptr && mtmd_support_audio(mtmd_ctx);
  }

- } // namespace rnllama
+ } // namespace capllama
package/cpp/{rn-tts.cpp → cap-tts.cpp} CHANGED
@@ -1,5 +1,5 @@
- #include "rn-tts.h"
- #include "rn-llama.h"
+ #include "cap-tts.h"
+ #include "cap-llama.h"
  #include "anyascii.h"
  #include "common.h"
  #include <regex>
@@ -11,7 +11,7 @@
  #include <thread>
  #include <cmath>

- namespace rnllama {
+ namespace capllama {

  // Constants definitions
  const std::string default_audio_text = "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>";
@@ -269,7 +269,7 @@ std::string audio_data_from_speaker(json speaker, const tts_type type) {
  }

  // Constructor and destructor implementations
- llama_rn_context_tts::llama_rn_context_tts(const std::string &vocoder_model_path, int batch_size) {
+ llama_cap_context_tts::llama_cap_context_tts(const std::string &vocoder_model_path, int batch_size) {
  common_params vocoder_params;
  vocoder_params.model.path = vocoder_model_path;
  vocoder_params.embedding = true;
@@ -291,14 +291,14 @@ llama_rn_context_tts::llama_rn_context_tts(const std::string &vocoder_model_path
  type = UNKNOWN; // Will be determined when used
  }

- llama_rn_context_tts::~llama_rn_context_tts() {
+ llama_cap_context_tts::~llama_cap_context_tts() {
  // init_result will handle cleanup automatically when it goes out of scope
  model = nullptr;
  ctx = nullptr;
  type = UNKNOWN;
  }

- void llama_rn_context_tts::setGuideTokens(const std::vector<llama_token> &tokens) {
+ void llama_cap_context_tts::setGuideTokens(const std::vector<llama_token> &tokens) {
  guide_tokens = tokens;
  }

@@ -456,8 +456,8 @@ std::vector<float> embd_to_audio(
  return audio;
  }

- // Forward declarations from rn-llama.h
- extern bool rnllama_verbose;
+ // Forward declarations from cap-llama.h
+ extern bool capllama_verbose;
  void log(const char *level, const char *function, int line, const char *format, ...);

  #define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
@@ -465,7 +465,7 @@ void log(const char *level, const char *function, int line, const char *format,
  #define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)

  // TTS member functions
- tts_type llama_rn_context_tts::getTTSType(llama_rn_context* main_ctx, json speaker) {
+ tts_type llama_cap_context_tts::getTTSType(llama_cap_context* main_ctx, json speaker) {
  if (speaker.is_object() && speaker.contains("version")) {
  std::string version = speaker["version"].get<std::string>();
  if (version == "0.2") {
@@ -489,7 +489,7 @@ tts_type llama_rn_context_tts::getTTSType(llama_rn_context* main_ctx, json speak
  return OUTETTS_V0_2;
  }

- llama_rn_audio_completion_result llama_rn_context_tts::getFormattedAudioCompletion(llama_rn_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak) {
+ llama_cap_audio_completion_result llama_cap_context_tts::getFormattedAudioCompletion(llama_cap_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak) {
  std::string audio_text = default_audio_text;
  std::string audio_data = default_audio_data;

@@ -522,7 +522,7 @@ llama_rn_audio_completion_result llama_rn_context_tts::getFormattedAudioCompleti
  }
  }

- std::vector<llama_token> llama_rn_context_tts::getAudioCompletionGuideTokens(llama_rn_context* main_ctx, const std::string &text_to_speak) {
+ std::vector<llama_token> llama_cap_context_tts::getAudioCompletionGuideTokens(llama_cap_context* main_ctx, const std::string &text_to_speak) {
  const llama_vocab * vocab = llama_model_get_vocab(main_ctx->model);
  const tts_type tts_type = getTTSType(main_ctx);
  std::string clean_text = process_text(text_to_speak, tts_type);
@@ -557,7 +557,7 @@ std::vector<llama_token> llama_rn_context_tts::getAudioCompletionGuideTokens(lla
  return result;
  }

- std::vector<float> llama_rn_context_tts::decodeAudioTokens(llama_rn_context* main_ctx, const std::vector<llama_token> &tokens) {
+ std::vector<float> llama_cap_context_tts::decodeAudioTokens(llama_cap_context* main_ctx, const std::vector<llama_token> &tokens) {
  std::vector<llama_token> tokens_audio = tokens;
  tts_type tts_type = getTTSType(main_ctx);
  if (tts_type == OUTETTS_V0_3 || tts_type == OUTETTS_V0_2) {
package/cpp/{rn-tts.h → cap-tts.h} CHANGED
@@ -1,5 +1,5 @@
- #ifndef RNTTS_H
- #define RNTTS_H
+ #ifndef CAPTTS_H
+ #define CAPTTS_H

  #include <vector>
  #include <string>
@@ -9,10 +9,10 @@

  using json = nlohmann::ordered_json;

- namespace rnllama {
+ namespace capllama {

  // Forward declarations
- struct llama_rn_context;
+ struct llama_cap_context;

  // TTS type enumeration
  enum tts_type {
@@ -23,19 +23,19 @@ enum tts_type {
  };

  // Audio completion result structure
- struct llama_rn_audio_completion_result {
+ struct llama_cap_audio_completion_result {
  std::string prompt;
  const char *grammar;
  };

  // TTS context for TTS-specific functionality
- struct llama_rn_context_tts {
+ struct llama_cap_context_tts {
  // TTS state fields
  std::vector<llama_token> audio_tokens;
  std::vector<llama_token> guide_tokens;
  bool next_token_uses_guide_token = true;

- // Vocoder fields (from llama_rn_context_vocoder)
+ // Vocoder fields (from llama_cap_context_vocoder)
  common_init_result init_result;
  common_params params;
  llama_model *model = nullptr;
@@ -43,17 +43,17 @@ struct llama_rn_context_tts {
  tts_type type = UNKNOWN;

  // Constructor and destructor
- llama_rn_context_tts(const std::string &vocoder_model_path, int batch_size = -1);
- ~llama_rn_context_tts();
+ llama_cap_context_tts(const std::string &vocoder_model_path, int batch_size = -1);
+ ~llama_cap_context_tts();

  // TTS utility methods
- tts_type getTTSType(llama_rn_context* main_ctx, json speaker = nullptr);
- llama_rn_audio_completion_result getFormattedAudioCompletion(llama_rn_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak);
- std::vector<llama_token> getAudioCompletionGuideTokens(llama_rn_context* main_ctx, const std::string &text_to_speak);
- std::vector<float> decodeAudioTokens(llama_rn_context* main_ctx, const std::vector<llama_token> &tokens);
+ tts_type getTTSType(llama_cap_context* main_ctx, json speaker = nullptr);
+ llama_cap_audio_completion_result getFormattedAudioCompletion(llama_cap_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak);
+ std::vector<llama_token> getAudioCompletionGuideTokens(llama_cap_context* main_ctx, const std::string &text_to_speak);
+ std::vector<float> decodeAudioTokens(llama_cap_context* main_ctx, const std::vector<llama_token> &tokens);
  void setGuideTokens(const std::vector<llama_token> &tokens);
  };

  }

- #endif /* RNTTS_H */
+ #endif /* CAPTTS_H */
package/cpp/ggml-cpu/ggml-cpu-impl.h CHANGED
@@ -78,6 +78,36 @@ struct lm_ggml_compute_params {
  #include <sys/prctl.h>
  #endif

+ // NEON compatibility layer
+ #if defined(__ARM_NEON)
+ #include <arm_neon.h>
+
+ // Only define vcvtnq_s32_f32 for older ARM architectures that don't have it
+ // NDK 29+ includes this function for ARMv8 and newer
+ #if !defined(__aarch64__) && !defined(__ARM_ARCH_8A__) && defined(__ARM_ARCH) && __ARM_ARCH < 8
+ // Emulate vcvtn (round to nearest) for ARMv7
+ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+ // Round to nearest integer
+ const float32x4_t vhalf = vdupq_n_f32(0.5f);
+ const float32x4_t vnhalf = vdupq_n_f32(-0.5f);
+ const float32x4_t vzero = vdupq_n_f32(0.0f);
+ uint32x4_t mask = vcgeq_f32(v, vzero);
+ float32x4_t rounded = vbslq_f32(mask, vaddq_f32(v, vhalf), vsubq_f32(v, vhalf));
+ return vcvtq_s32_f32(rounded);
+ }
+ #endif
+ #endif
+
+ #if defined(__s390x__) && defined(LM_GGML_NNPA)
+ #ifndef __NNPA__
+ #define __NNPA__
+ #endif // __NNPA__
+ #endif // __s390x__ && LM_GGML_NNPA
+
+ #if defined(__ARM_FEATURE_SVE)
+ #include <sys/prctl.h>
+ #endif
+
  #if defined(__ARM_NEON)

  // ref: https://github.com/ggml-org/llama.cpp/pull/5404
package/dist/docs.json CHANGED
@@ -493,6 +493,82 @@
  "complexTypes": [],
  "slug": "releasevocoder"
  },
+ {
+ "name": "downloadModel",
+ "signature": "(options: { url: string; filename: string; }) => Promise<string>",
+ "parameters": [
+ {
+ "name": "options",
+ "docs": "",
+ "type": "{ url: string; filename: string; }"
+ }
+ ],
+ "returns": "Promise<string>",
+ "tags": [],
+ "docs": "",
+ "complexTypes": [],
+ "slug": "downloadmodel"
+ },
+ {
+ "name": "getDownloadProgress",
+ "signature": "(options: { url: string; }) => Promise<{ progress: number; completed: boolean; failed: boolean; errorMessage?: string; localPath?: string; downloadedBytes: number; totalBytes: number; }>",
+ "parameters": [
+ {
+ "name": "options",
+ "docs": "",
+ "type": "{ url: string; }"
+ }
+ ],
+ "returns": "Promise<{ progress: number; completed: boolean; failed: boolean; errorMessage?: string | undefined; localPath?: string | undefined; downloadedBytes: number; totalBytes: number; }>",
+ "tags": [],
+ "docs": "",
+ "complexTypes": [],
+ "slug": "getdownloadprogress"
+ },
+ {
+ "name": "cancelDownload",
+ "signature": "(options: { url: string; }) => Promise<boolean>",
+ "parameters": [
+ {
+ "name": "options",
+ "docs": "",
+ "type": "{ url: string; }"
+ }
+ ],
+ "returns": "Promise<boolean>",
+ "tags": [],
+ "docs": "",
+ "complexTypes": [],
+ "slug": "canceldownload"
+ },
+ {
+ "name": "getAvailableModels",
+ "signature": "() => Promise<Array<{ name: string; path: string; size: number; }>>",
+ "parameters": [],
+ "returns": "Promise<{ name: string; path: string; size: number; }[]>",
+ "tags": [],
+ "docs": "",
+ "complexTypes": [
+ "Array"
+ ],
+ "slug": "getavailablemodels"
+ },
+ {
+ "name": "convertJsonSchemaToGrammar",
+ "signature": "(options: { schema: string; }) => Promise<string>",
+ "parameters": [
+ {
+ "name": "options",
+ "docs": "",
+ "type": "{ schema: string; }"
+ }
+ ],
+ "returns": "Promise<string>",
+ "tags": [],
+ "docs": "",
+ "complexTypes": [],
+ "slug": "convertjsonschematogrammar"
+ },
  {
  "name": "addListener",
  "signature": "(eventName: string, listenerFunc: (data: any) => void) => Promise<void>",
@@ -4031,6 +4107,27 @@
  "complexTypes": [],
  "type": "number | undefined"
  },
+ {
+ "name": "draft_model",
+ "tags": [],
+ "docs": "Path to draft model for speculative decoding (mobile optimization)",
+ "complexTypes": [],
+ "type": "string | undefined"
+ },
+ {
+ "name": "speculative_samples",
+ "tags": [],
+ "docs": "Number of tokens to predict speculatively (default: 3 for mobile)",
+ "complexTypes": [],
+ "type": "number | undefined"
+ },
+ {
+ "name": "mobile_speculative",
+ "tags": [],
+ "docs": "Enable mobile-optimized speculative decoding",
+ "complexTypes": [],
+ "type": "boolean | undefined"
+ },
  {
  "name": "n_gpu_layers",
  "tags": [],
@@ -4263,11 +4360,11 @@
  {
  "name": "tool_calls",
  "tags": [],
- "docs": "Tool calls",
+ "docs": "Tool calls (parsed from response)",
  "complexTypes": [
  "Array"
  ],
- "type": "Array<{\r\n type: 'function';\r\n function: {\r\n name: string;\r\n arguments: string;\r\n };\r\n id?: string;\r\n }>"
+ "type": "Array<{\r\n type: 'function';\r\n function: {\r\n name: string;\r\n arguments: string; // JSON string of arguments\r\n };\r\n id?: string;\r\n }>"
  },
  {
  "name": "content",
@@ -4535,7 +4632,7 @@
  {
  "name": "grammar",
  "tags": [],
- "docs": "Set grammar for grammar-based sampling. Default: no grammar",
+ "docs": "Set grammar for grammar-based sampling (GBNF format). Default: no grammar\r\nThis will override json_schema if both are provided.",
  "complexTypes": [],
  "type": "string | undefined"
  },
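The grammar entry above now states the precedence rule explicitly: a GBNF `grammar` overrides `json_schema` when both are supplied, and the new `convertJsonSchemaToGrammar` method bridges the two. A rough usage sketch, assuming the plugin object is exported as `LlamaCpp` and completions go through a `completion` call keyed by `contextId` (neither is shown in this excerpt):

```ts
import { LlamaCpp } from 'llama-cpp-capacitor'; // assumed export name

// Assumed flow: convert a JSON Schema to GBNF, then constrain a completion
// with the resulting grammar. Per the updated docs, `grammar` takes
// precedence over `json_schema` if both are provided.
async function structuredCompletion(contextId: number) {
  const schema = JSON.stringify({
    type: 'object',
    properties: { city: { type: 'string' }, temp_c: { type: 'number' } },
    required: ['city', 'temp_c'],
  });

  // New method in this release range: JSON Schema -> GBNF grammar string.
  const grammar = await LlamaCpp.convertJsonSchemaToGrammar({ schema });

  return LlamaCpp.completion({
    contextId,
    prompt: 'Report the weather in Berlin as JSON.',
    grammar, // GBNF string; overrides json_schema when both are set
  });
}
```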
package/dist/esm/definitions.d.ts CHANGED
@@ -13,6 +13,18 @@ export interface NativeContextParams {
  n_batch?: number;
  n_ubatch?: number;
  n_threads?: number;
+ /**
+ * Path to draft model for speculative decoding (mobile optimization)
+ */
+ draft_model?: string;
+ /**
+ * Number of tokens to predict speculatively (default: 3 for mobile)
+ */
+ speculative_samples?: number;
+ /**
+ * Enable mobile-optimized speculative decoding
+ */
+ mobile_speculative?: boolean;
  /**
  * Number of layers to store in VRAM (Currently only for iOS)
  */
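These three fields mirror the speculative-decoding members added to `llama_cap_context` earlier in this diff. A minimal sketch of passing them at context creation, assuming an `initLlama`-style initializer and a `model` path field (only the three new params are taken from the typings above):

```ts
import { initLlama } from 'llama-cpp-capacitor'; // assumed export name

// Assumed helper: create a context with speculative decoding enabled.
async function initWithDraftModel(modelPath: string, draftPath: string) {
  return initLlama({
    model: modelPath,         // main model path (assumed field name)
    draft_model: draftPath,   // new: small draft model that proposes tokens
    speculative_samples: 3,   // new: tokens drafted per step (mobile default)
    mobile_speculative: true, // new: mobile-optimized speculative path
    n_ctx: 2048,
  });
}
```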
@@ -87,7 +99,8 @@ export interface NativeCompletionParams {
  */
  json_schema?: string;
  /**
- * Set grammar for grammar-based sampling. Default: no grammar
+ * Set grammar for grammar-based sampling (GBNF format). Default: no grammar
+ * This will override json_schema if both are provided.
  */
  grammar?: string;
  /**
@@ -268,7 +281,7 @@ export interface NativeCompletionResult {
  */
  reasoning_content: string;
  /**
- * Tool calls
+ * Tool calls (parsed from response)
  */
  tool_calls: Array<{
  type: 'function';
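Because `arguments` is documented as a JSON string rather than a parsed object, callers have to decode it themselves. A small consumption sketch of the `tool_calls` shape typed above:

```ts
// The tool_calls element shape from NativeCompletionResult.
type ToolCall = {
  type: 'function';
  function: { name: string; arguments: string }; // arguments is a JSON string
  id?: string;
};

function readToolCalls(toolCalls: ToolCall[] | undefined): void {
  for (const call of toolCalls ?? []) {
    // Parse the JSON-encoded argument payload before using it.
    const args = JSON.parse(call.function.arguments) as Record<string, unknown>;
    console.log(`tool call: ${call.function.name}`, args);
  }
}
```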
@@ -483,6 +496,10 @@ export interface CompletionParams extends Omit<NativeCompletionParams, 'emit_par
  chatTemplate?: string;
  chat_template?: string;
  jinja?: boolean;
+ /**
+ * GBNF grammar for structured output. Takes precedence over json_schema.
+ */
+ grammar?: string;
  tools?: object;
  parallel_tool_calls?: object;
  tool_choice?: string;
@@ -648,6 +665,32 @@ export interface LlamaCppPlugin {
  releaseVocoder(options: {
  contextId: number;
  }): Promise<void>;
+ downloadModel(options: {
+ url: string;
+ filename: string;
+ }): Promise<string>;
+ getDownloadProgress(options: {
+ url: string;
+ }): Promise<{
+ progress: number;
+ completed: boolean;
+ failed: boolean;
+ errorMessage?: string;
+ localPath?: string;
+ downloadedBytes: number;
+ totalBytes: number;
+ }>;
+ cancelDownload(options: {
+ url: string;
+ }): Promise<boolean>;
+ getAvailableModels(): Promise<Array<{
+ name: string;
+ path: string;
+ size: number;
+ }>>;
+ convertJsonSchemaToGrammar(options: {
+ schema: string;
+ }): Promise<string>;
  addListener(eventName: string, listenerFunc: (data: any) => void): Promise<void>;
  removeAllListeners(eventName: string): Promise<void>;
  }
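The download surface added to `LlamaCppPlugin` is keyed by URL: `downloadModel` starts a transfer, `getDownloadProgress` reports its state, and `cancelDownload` aborts it. A usage sketch with the method shapes from the interface above; the `LlamaCpp` export name and the polling interval are assumptions:

```ts
import { LlamaCpp } from 'llama-cpp-capacitor'; // assumed export name

// Assumed helper: start a model download and log progress until it settles.
async function fetchModel(url: string, filename: string): Promise<string> {
  // downloadModel resolves to a string (per the typings); treated here as
  // the value to return once the transfer is done.
  const done = LlamaCpp.downloadModel({ url, filename });

  // Poll progress by URL while the download runs.
  const timer = setInterval(async () => {
    const p = await LlamaCpp.getDownloadProgress({ url });
    console.log(`download ${p.downloadedBytes}/${p.totalBytes} bytes`, p.progress);
    if (p.failed) {
      console.warn(p.errorMessage ?? 'download failed');
      // To abort an in-flight transfer: await LlamaCpp.cancelDownload({ url });
    }
  }, 500);

  try {
    return await done;
  } finally {
    clearInterval(timer);
  }
}

// Models already on disk can be listed with the new getAvailableModels():
// const models = await LlamaCpp.getAvailableModels(); // [{ name, path, size }]
```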