llama-cpp-capacitor 0.0.13 → 0.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LlamaCpp.podspec +17 -17
- package/Package.swift +27 -27
- package/README.md +717 -574
- package/android/build.gradle +88 -69
- package/android/src/main/AndroidManifest.xml +2 -2
- package/android/src/main/CMakeLists-arm64.txt +131 -0
- package/android/src/main/CMakeLists-x86_64.txt +135 -0
- package/android/src/main/CMakeLists.txt +35 -52
- package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCpp.java +956 -717
- package/android/src/main/java/ai/annadata/plugin/capacitor/LlamaCppPlugin.java +710 -590
- package/android/src/main/jni-utils.h +7 -7
- package/android/src/main/jni.cpp +868 -127
- package/cpp/{rn-completion.cpp → cap-completion.cpp} +202 -24
- package/cpp/{rn-completion.h → cap-completion.h} +22 -11
- package/cpp/{rn-llama.cpp → cap-llama.cpp} +81 -27
- package/cpp/{rn-llama.h → cap-llama.h} +32 -20
- package/cpp/{rn-mtmd.hpp → cap-mtmd.hpp} +15 -15
- package/cpp/{rn-tts.cpp → cap-tts.cpp} +12 -12
- package/cpp/{rn-tts.h → cap-tts.h} +14 -14
- package/cpp/ggml-cpu/ggml-cpu-impl.h +30 -0
- package/dist/docs.json +100 -3
- package/dist/esm/definitions.d.ts +45 -2
- package/dist/esm/definitions.js.map +1 -1
- package/dist/esm/index.d.ts +22 -0
- package/dist/esm/index.js +66 -3
- package/dist/esm/index.js.map +1 -1
- package/dist/plugin.cjs.js +71 -3
- package/dist/plugin.cjs.js.map +1 -1
- package/dist/plugin.js +71 -3
- package/dist/plugin.js.map +1 -1
- package/ios/Sources/LlamaCppPlugin/LlamaCpp.swift +596 -596
- package/ios/Sources/LlamaCppPlugin/LlamaCppPlugin.swift +591 -514
- package/ios/Tests/LlamaCppPluginTests/LlamaCppPluginTests.swift +15 -15
- package/package.json +111 -110
package/cpp/{rn-llama.h → cap-llama.h}
RENAMED

@@ -1,5 +1,5 @@
-#ifndef RNLLAMA_H
-#define RNLLAMA_H
+#ifndef CAPLLAMA_H
+#define CAPLLAMA_H
 
 #include <sstream>
 #include <iostream>
@@ -14,14 +14,14 @@
 #include "llama-impl.h"
 #include "sampling.h"
 #include "nlohmann/json.hpp"
-#include "rn-tts.h"
+#include "cap-tts.h"
 #if defined(__ANDROID__)
 #include <android/log.h>
 #endif
 
 using json = nlohmann::ordered_json;
 
-namespace rnllama {
+namespace capllama {
 
 std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);
 
@@ -29,17 +29,17 @@ std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::co
 
 lm_ggml_type kv_cache_type_from_str(const std::string & s);
 
-// Forward declarations - actual definitions are in rn-completion.h
+// Forward declarations - actual definitions are in cap-completion.h
 // Note: enum forward declarations not allowed in C++, using include in implementation file
 struct completion_token_output;
 struct completion_partial_output;
-struct llama_rn_context_mtmd;
+struct llama_cap_context_mtmd;
 
-struct llama_rn_context_tts;
+struct llama_cap_context_tts;
 
-struct llama_rn_context_completion;
+struct llama_cap_context_completion;
 
-struct llama_rn_tokenize_result {
+struct llama_cap_tokenize_result {
     std::vector<llama_token> tokens;
     bool has_media = false;
     std::vector<std::string> bitmap_hashes;
@@ -48,7 +48,7 @@ struct llama_rn_tokenize_result {
 };
 
 // Main context class
-struct llama_rn_context {
+struct llama_cap_context {
     // Model state fields
     llama_model *model = nullptr;
     float loading_progress = 0;
@@ -59,13 +59,25 @@ struct llama_rn_context {
     common_chat_templates_ptr templates;
     int n_ctx;
 
+    // Speculative decoding fields
+    llama_model *draft_model = nullptr;
+    llama_context *draft_ctx = nullptr;
+    bool speculative_enabled = false;
+    int speculative_samples = 3; // Mobile-optimized default
+    bool mobile_speculative = true;
+
     // Completion context
-    llama_rn_context_completion *completion = nullptr;
+    llama_cap_context_completion *completion = nullptr;
 
-    ~llama_rn_context();
+    ~llama_cap_context();
 
     bool loadModel(common_params &params_);
 
+    // Speculative decoding methods
+    bool loadDraftModel(const std::string &draft_model_path);
+    void releaseDraftModel();
+    bool isSpectulativeEnabled() const;
+
     // Model methods
     bool validateModelChatTemplate(bool use_jinja, const char *name) const;
     common_chat_params getFormattedChatWithJinja(
@@ -84,7 +96,7 @@ struct llama_rn_context {
         const std::string &messages,
         const std::string &chat_template
     ) const;
-    llama_rn_tokenize_result tokenize(const std::string &text, const std::vector<std::string> &media_paths);
+    llama_cap_tokenize_result tokenize(const std::string &text, const std::vector<std::string> &media_paths);
 
     // Lora methods
     std::vector<common_adapter_lora_info> lora;
@@ -93,7 +105,7 @@ struct llama_rn_context {
     std::vector<common_adapter_lora_info> getLoadedLoraAdapters();
 
     // Multimodal fields and methods
-    llama_rn_context_mtmd *mtmd_wrapper = nullptr;
+    llama_cap_context_mtmd *mtmd_wrapper = nullptr;
     bool has_multimodal = false;
     bool initMultimodal(const std::string &mmproj_path, bool use_gpu);
     bool isMultimodalEnabled() const;
@@ -102,7 +114,7 @@ struct llama_rn_context {
     void releaseMultimodal();
 
     // TTS fields and methods (delegated to TTS context)
-    llama_rn_context_tts *tts_wrapper = nullptr;
+    llama_cap_context_tts *tts_wrapper = nullptr;
    bool has_vocoder = false;
    bool initVocoder(const std::string &vocoder_model_path, int batch_size = -1);
    bool isVocoderEnabled() const;
@@ -125,15 +137,15 @@ inline void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, s
 void log(const char *level, const char *function, int line, const char *format, ...);
 
 // Logging macros
-extern bool rnllama_verbose;
+extern bool capllama_verbose;
 
-#if RNLLAMA_VERBOSE != 1
+#if CAPLLAMA_VERBOSE != 1
 #define LOG_VERBOSE(MSG, ...)
 #else
 #define LOG_VERBOSE(MSG, ...) \
     do \
     { \
-        if (rnllama_verbose) \
+        if (capllama_verbose) \
         { \
             log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \
         } \
@@ -144,6 +156,6 @@ extern bool rnllama_verbose;
 #define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
 #define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
 
-} // namespace rnllama
+} // namespace capllama
 
-#endif /* RNLLAMA_H */
+#endif /* CAPLLAMA_H */
package/cpp/{rn-mtmd.hpp → cap-mtmd.hpp}
RENAMED

@@ -1,6 +1,6 @@
 #pragma once
 
-#include "rn-llama.h"
+#include "cap-llama.h"
 #include "tools/mtmd/mtmd.h"
 #include "tools/mtmd/mtmd-helper.h"
 #include "tools/mtmd/clip.h"
@@ -8,17 +8,17 @@
 #include <vector>
 #include <cstdint>
 
-namespace rnllama {
+namespace capllama {
 
 // MTMD context structure
-struct llama_rn_context_mtmd {
+struct llama_cap_context_mtmd {
     mtmd_context *mtmd_ctx = nullptr;
 
     // State fields
     std::vector<std::string> bitmap_past_hashes;
 
     // Constructor - Initialize multimodal
-    llama_rn_context_mtmd(
+    llama_cap_context_mtmd(
         const std::string &mmproj_path,
         bool use_gpu,
         llama_model *model,
@@ -29,7 +29,7 @@ struct llama_rn_context_mtmd {
     );
 
     // Destructor - Release multimodal resources
-    ~llama_rn_context_mtmd();
+    ~llama_cap_context_mtmd();
 
     // Process media
     void processMedia(
@@ -149,11 +149,11 @@ struct mtmd_tokenize_result {
     mtmd_input_chunks* chunks = nullptr;
 };
 
-// Forward declaration for llama_rn_context
-struct llama_rn_context;
+// Forward declaration for llama_cap_context
+struct llama_cap_context;
 
 // Tokenize text with media function
-inline mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrapper, const std::string &prompt, const std::vector<std::string> &media_paths) {
+inline mtmd_tokenize_result tokenizeWithMedia(llama_cap_context_mtmd *mtmd_wrapper, const std::string &prompt, const std::vector<std::string> &media_paths) {
     mtmd_tokenize_result result;
     mtmd::bitmaps bitmaps;
 
@@ -369,7 +369,7 @@ inline mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrappe
     return result;
 }
 
-inline void llama_rn_context_mtmd::processMedia(
+inline void llama_cap_context_mtmd::processMedia(
     llama_context *ctx,
     const std::string &prompt,
     const std::vector<std::string> &media_paths,
@@ -524,7 +524,7 @@ inline void llama_rn_context_mtmd::processMedia(
     mtmd_input_chunks_free(chunks);
 }
 
-inline llama_rn_context_mtmd::llama_rn_context_mtmd(
+inline llama_cap_context_mtmd::llama_cap_context_mtmd(
     const std::string &mmproj_path,
     bool use_gpu,
     llama_model *model,
@@ -580,23 +580,23 @@ inline llama_rn_context_mtmd::llama_rn_context_mtmd(
     LOG_INFO("Context shifting disabled for multimodal support");
 }
 
-inline llama_rn_context_mtmd::~llama_rn_context_mtmd() {
+inline llama_cap_context_mtmd::~llama_cap_context_mtmd() {
     if (mtmd_ctx != nullptr) {
         mtmd_free(mtmd_ctx);
         mtmd_ctx = nullptr;
     }
 }
 
-inline bool llama_rn_context_mtmd::isEnabled(bool has_multimodal) const {
+inline bool llama_cap_context_mtmd::isEnabled(bool has_multimodal) const {
     return has_multimodal && this != nullptr;
 }
 
-inline bool llama_rn_context_mtmd::supportVision() const {
+inline bool llama_cap_context_mtmd::supportVision() const {
     return mtmd_ctx != nullptr && mtmd_support_vision(mtmd_ctx);
 }
 
-inline bool llama_rn_context_mtmd::supportAudio() const {
+inline bool llama_cap_context_mtmd::supportAudio() const {
     return mtmd_ctx != nullptr && mtmd_support_audio(mtmd_ctx);
 }
 
-} // namespace rnllama
+} // namespace capllama
package/cpp/{rn-tts.cpp → cap-tts.cpp}
RENAMED

@@ -1,5 +1,5 @@
-#include "rn-tts.h"
-#include "rn-llama.h"
+#include "cap-tts.h"
+#include "cap-llama.h"
 #include "anyascii.h"
 #include "common.h"
 #include <regex>
@@ -11,7 +11,7 @@
 #include <thread>
 #include <cmath>
 
-namespace rnllama {
+namespace capllama {
 
 // Constants definitions
 const std::string default_audio_text = "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>";
@@ -269,7 +269,7 @@ std::string audio_data_from_speaker(json speaker, const tts_type type) {
 }
 
 // Constructor and destructor implementations
-llama_rn_context_tts::llama_rn_context_tts(const std::string &vocoder_model_path, int batch_size) {
+llama_cap_context_tts::llama_cap_context_tts(const std::string &vocoder_model_path, int batch_size) {
     common_params vocoder_params;
     vocoder_params.model.path = vocoder_model_path;
     vocoder_params.embedding = true;
@@ -291,14 +291,14 @@ llama_rn_context_tts::llama_rn_context_tts(const std::string &vocoder_model_path
     type = UNKNOWN; // Will be determined when used
 }
 
-llama_rn_context_tts::~llama_rn_context_tts() {
+llama_cap_context_tts::~llama_cap_context_tts() {
     // init_result will handle cleanup automatically when it goes out of scope
     model = nullptr;
     ctx = nullptr;
     type = UNKNOWN;
 }
 
-void llama_rn_context_tts::setGuideTokens(const std::vector<llama_token> &tokens) {
+void llama_cap_context_tts::setGuideTokens(const std::vector<llama_token> &tokens) {
     guide_tokens = tokens;
 }
 
@@ -456,8 +456,8 @@ std::vector<float> embd_to_audio(
     return audio;
 }
 
-// Forward declarations from rn-llama.h
-extern bool rnllama_verbose;
+// Forward declarations from cap-llama.h
+extern bool capllama_verbose;
 void log(const char *level, const char *function, int line, const char *format, ...);
 
 #define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
@@ -465,7 +465,7 @@ void log(const char *level, const char *function, int line, const char *format,
 #define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
 
 // TTS member functions
-tts_type llama_rn_context_tts::getTTSType(llama_rn_context* main_ctx, json speaker) {
+tts_type llama_cap_context_tts::getTTSType(llama_cap_context* main_ctx, json speaker) {
     if (speaker.is_object() && speaker.contains("version")) {
         std::string version = speaker["version"].get<std::string>();
         if (version == "0.2") {
@@ -489,7 +489,7 @@ tts_type llama_rn_context_tts::getTTSType(llama_rn_context* main_ctx, json speak
     return OUTETTS_V0_2;
 }
 
-llama_rn_audio_completion_result llama_rn_context_tts::getFormattedAudioCompletion(llama_rn_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak) {
+llama_cap_audio_completion_result llama_cap_context_tts::getFormattedAudioCompletion(llama_cap_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak) {
     std::string audio_text = default_audio_text;
     std::string audio_data = default_audio_data;
 
@@ -522,7 +522,7 @@ llama_rn_audio_completion_result llama_rn_context_tts::getFormattedAudioCompleti
     }
 }
 
-std::vector<llama_token> llama_rn_context_tts::getAudioCompletionGuideTokens(llama_rn_context* main_ctx, const std::string &text_to_speak) {
+std::vector<llama_token> llama_cap_context_tts::getAudioCompletionGuideTokens(llama_cap_context* main_ctx, const std::string &text_to_speak) {
     const llama_vocab * vocab = llama_model_get_vocab(main_ctx->model);
     const tts_type tts_type = getTTSType(main_ctx);
     std::string clean_text = process_text(text_to_speak, tts_type);
@@ -557,7 +557,7 @@ std::vector<llama_token> llama_rn_context_tts::getAudioCompletionGuideTokens(lla
     return result;
 }
 
-std::vector<float> llama_rn_context_tts::decodeAudioTokens(llama_rn_context* main_ctx, const std::vector<llama_token> &tokens) {
+std::vector<float> llama_cap_context_tts::decodeAudioTokens(llama_cap_context* main_ctx, const std::vector<llama_token> &tokens) {
     std::vector<llama_token> tokens_audio = tokens;
     tts_type tts_type = getTTSType(main_ctx);
     if (tts_type == OUTETTS_V0_3 || tts_type == OUTETTS_V0_2) {
package/cpp/{rn-tts.h → cap-tts.h}
RENAMED

@@ -1,5 +1,5 @@
-#ifndef RNTTS_H
-#define RNTTS_H
+#ifndef CAPTTS_H
+#define CAPTTS_H
 
 #include <vector>
 #include <string>
@@ -9,10 +9,10 @@
 
 using json = nlohmann::ordered_json;
 
-namespace rnllama {
+namespace capllama {
 
 // Forward declarations
-struct llama_rn_context;
+struct llama_cap_context;
 
 // TTS type enumeration
 enum tts_type {
@@ -23,19 +23,19 @@ enum tts_type {
 };
 
 // Audio completion result structure
-struct llama_rn_audio_completion_result {
+struct llama_cap_audio_completion_result {
     std::string prompt;
     const char *grammar;
 };
 
 // TTS context for TTS-specific functionality
-struct llama_rn_context_tts {
+struct llama_cap_context_tts {
     // TTS state fields
     std::vector<llama_token> audio_tokens;
     std::vector<llama_token> guide_tokens;
     bool next_token_uses_guide_token = true;
 
-    // Vocoder fields (from llama_rn_context_vocoder)
+    // Vocoder fields (from llama_cap_context_vocoder)
     common_init_result init_result;
     common_params params;
     llama_model *model = nullptr;
@@ -43,17 +43,17 @@ struct llama_rn_context_tts {
     tts_type type = UNKNOWN;
 
     // Constructor and destructor
-    llama_rn_context_tts(const std::string &vocoder_model_path, int batch_size = -1);
-    ~llama_rn_context_tts();
+    llama_cap_context_tts(const std::string &vocoder_model_path, int batch_size = -1);
+    ~llama_cap_context_tts();
 
     // TTS utility methods
-    tts_type getTTSType(llama_rn_context* main_ctx, json speaker = nullptr);
-    llama_rn_audio_completion_result getFormattedAudioCompletion(llama_rn_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak);
-    std::vector<llama_token> getAudioCompletionGuideTokens(llama_rn_context* main_ctx, const std::string &text_to_speak);
-    std::vector<float> decodeAudioTokens(llama_rn_context* main_ctx, const std::vector<llama_token> &tokens);
+    tts_type getTTSType(llama_cap_context* main_ctx, json speaker = nullptr);
+    llama_cap_audio_completion_result getFormattedAudioCompletion(llama_cap_context* main_ctx, const std::string &speaker_json_str, const std::string &text_to_speak);
+    std::vector<llama_token> getAudioCompletionGuideTokens(llama_cap_context* main_ctx, const std::string &text_to_speak);
+    std::vector<float> decodeAudioTokens(llama_cap_context* main_ctx, const std::vector<llama_token> &tokens);
     void setGuideTokens(const std::vector<llama_token> &tokens);
 };
 
 }
 
-#endif /* RNTTS_H */
+#endif /* CAPTTS_H */
package/cpp/ggml-cpu/ggml-cpu-impl.h
CHANGED

@@ -78,6 +78,36 @@ struct lm_ggml_compute_params {
 #include <sys/prctl.h>
 #endif
 
+// NEON compatibility layer
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+
+// Only define vcvtnq_s32_f32 for older ARM architectures that don't have it
+// NDK 29+ includes this function for ARMv8 and newer
+#if !defined(__aarch64__) && !defined(__ARM_ARCH_8A__) && defined(__ARM_ARCH) && __ARM_ARCH < 8
+// Emulate vcvtn (round to nearest) for ARMv7
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    // Round to nearest integer
+    const float32x4_t vhalf = vdupq_n_f32(0.5f);
+    const float32x4_t vnhalf = vdupq_n_f32(-0.5f);
+    const float32x4_t vzero = vdupq_n_f32(0.0f);
+    uint32x4_t mask = vcgeq_f32(v, vzero);
+    float32x4_t rounded = vbslq_f32(mask, vaddq_f32(v, vhalf), vsubq_f32(v, vhalf));
+    return vcvtq_s32_f32(rounded);
+}
+#endif
+#endif
+
+#if defined(__s390x__) && defined(LM_GGML_NNPA)
+#ifndef __NNPA__
+#define __NNPA__
+#endif // __NNPA__
+#endif // __s390x__ && LM_GGML_NNPA
+
+#if defined(__ARM_FEATURE_SVE)
+#include <sys/prctl.h>
+#endif
+
 #if defined(__ARM_NEON)
 
 // ref: https://github.com/ggml-org/llama.cpp/pull/5404
package/dist/docs.json
CHANGED

@@ -493,6 +493,82 @@
         "complexTypes": [],
         "slug": "releasevocoder"
       },
+      {
+        "name": "downloadModel",
+        "signature": "(options: { url: string; filename: string; }) => Promise<string>",
+        "parameters": [
+          {
+            "name": "options",
+            "docs": "",
+            "type": "{ url: string; filename: string; }"
+          }
+        ],
+        "returns": "Promise<string>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [],
+        "slug": "downloadmodel"
+      },
+      {
+        "name": "getDownloadProgress",
+        "signature": "(options: { url: string; }) => Promise<{ progress: number; completed: boolean; failed: boolean; errorMessage?: string; localPath?: string; downloadedBytes: number; totalBytes: number; }>",
+        "parameters": [
+          {
+            "name": "options",
+            "docs": "",
+            "type": "{ url: string; }"
+          }
+        ],
+        "returns": "Promise<{ progress: number; completed: boolean; failed: boolean; errorMessage?: string | undefined; localPath?: string | undefined; downloadedBytes: number; totalBytes: number; }>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [],
+        "slug": "getdownloadprogress"
+      },
+      {
+        "name": "cancelDownload",
+        "signature": "(options: { url: string; }) => Promise<boolean>",
+        "parameters": [
+          {
+            "name": "options",
+            "docs": "",
+            "type": "{ url: string; }"
+          }
+        ],
+        "returns": "Promise<boolean>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [],
+        "slug": "canceldownload"
+      },
+      {
+        "name": "getAvailableModels",
+        "signature": "() => Promise<Array<{ name: string; path: string; size: number; }>>",
+        "parameters": [],
+        "returns": "Promise<{ name: string; path: string; size: number; }[]>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [
+          "Array"
+        ],
+        "slug": "getavailablemodels"
+      },
+      {
+        "name": "convertJsonSchemaToGrammar",
+        "signature": "(options: { schema: string; }) => Promise<string>",
+        "parameters": [
+          {
+            "name": "options",
+            "docs": "",
+            "type": "{ schema: string; }"
+          }
+        ],
+        "returns": "Promise<string>",
+        "tags": [],
+        "docs": "",
+        "complexTypes": [],
+        "slug": "convertjsonschematogrammar"
+      },
       {
         "name": "addListener",
         "signature": "(eventName: string, listenerFunc: (data: any) => void) => Promise<void>",
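The download methods added above pair naturally: `downloadModel` starts a transfer and `getDownloadProgress` is polled by URL until `completed` or `failed` is set, with `cancelDownload` keyed by the same URL. A minimal sketch of that flow using the signatures shown in this hunk; the `LlamaCpp` export name is an assumption, since the plugin's exported object is not shown in this diff.

```typescript
// Sketch only: the `LlamaCpp` export name is assumed, not confirmed by this diff.
import { LlamaCpp } from 'llama-cpp-capacitor';

async function fetchModel(url: string, filename: string): Promise<string> {
  // Start the download; the promise resolves with a string
  // (its exact meaning is not documented in this diff).
  const handle = await LlamaCpp.downloadModel({ url, filename });
  console.log('download started:', handle);

  // Poll progress until the native side reports completion or failure.
  while (true) {
    const p = await LlamaCpp.getDownloadProgress({ url });
    console.log('progress:', p.progress, p.downloadedBytes, '/', p.totalBytes);
    if (p.failed) throw new Error(p.errorMessage ?? 'download failed');
    if (p.completed) return p.localPath ?? filename;
    await new Promise((resolve) => setTimeout(resolve, 500));
  }
}

// Cancelling is keyed by the same URL that started the download.
async function abortDownload(url: string): Promise<void> {
  const cancelled = await LlamaCpp.cancelDownload({ url });
  console.log('cancelled:', cancelled);
}
```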
@@ -4031,6 +4107,27 @@
         "complexTypes": [],
         "type": "number | undefined"
       },
+      {
+        "name": "draft_model",
+        "tags": [],
+        "docs": "Path to draft model for speculative decoding (mobile optimization)",
+        "complexTypes": [],
+        "type": "string | undefined"
+      },
+      {
+        "name": "speculative_samples",
+        "tags": [],
+        "docs": "Number of tokens to predict speculatively (default: 3 for mobile)",
+        "complexTypes": [],
+        "type": "number | undefined"
+      },
+      {
+        "name": "mobile_speculative",
+        "tags": [],
+        "docs": "Enable mobile-optimized speculative decoding",
+        "complexTypes": [],
+        "type": "boolean | undefined"
+      },
       {
         "name": "n_gpu_layers",
         "tags": [],
@@ -4263,11 +4360,11 @@
       {
         "name": "tool_calls",
         "tags": [],
-        "docs": "Tool calls",
+        "docs": "Tool calls (parsed from response)",
         "complexTypes": [
           "Array"
         ],
-        "type": "Array<{\r\n type: 'function';\r\n function: {\r\n name: string;\r\n arguments: string;\r\n };\r\n id?: string;\r\n }>"
+        "type": "Array<{\r\n type: 'function';\r\n function: {\r\n name: string;\r\n arguments: string; // JSON string of arguments\r\n };\r\n id?: string;\r\n }>"
       },
       {
         "name": "content",
@@ -4535,7 +4632,7 @@
       {
         "name": "grammar",
         "tags": [],
-        "docs": "Set grammar for grammar-based sampling.",
+        "docs": "Set grammar for grammar-based sampling (GBNF format). Default: no grammar\r\nThis will override json_schema if both are provided.",
         "complexTypes": [],
         "type": "string | undefined"
       },
package/dist/esm/definitions.d.ts
CHANGED

@@ -13,6 +13,18 @@ export interface NativeContextParams {
     n_batch?: number;
     n_ubatch?: number;
     n_threads?: number;
+    /**
+     * Path to draft model for speculative decoding (mobile optimization)
+     */
+    draft_model?: string;
+    /**
+     * Number of tokens to predict speculatively (default: 3 for mobile)
+     */
+    speculative_samples?: number;
+    /**
+     * Enable mobile-optimized speculative decoding
+     */
+    mobile_speculative?: boolean;
     /**
     * Number of layers to store in VRAM (Currently only for iOS)
     */
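These three optional fields mirror the speculative-decoding members added to the C++ `llama_cap_context` earlier in this diff. A hedged sketch of a params object that enables a draft model; the `model` field name and the context-creation call that would consume this object are assumptions, since neither appears in this hunk.

```typescript
import type { NativeContextParams } from 'llama-cpp-capacitor';

// Illustrative only: `model` is an assumed field name and the paths are
// placeholders; only the three speculative fields come from this diff.
const params: NativeContextParams = {
  model: 'models/main-model.gguf',          // assumed field name
  draft_model: 'models/draft-model.gguf',   // small draft model used for speculation
  speculative_samples: 3,                   // tokens drafted per step (mobile default)
  mobile_speculative: true,                 // enable the mobile-optimized path
};
```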
@@ -87,7 +99,8 @@ export interface NativeCompletionParams {
     */
    json_schema?: string;
    /**
-     * Set grammar for grammar-based sampling.
+     * Set grammar for grammar-based sampling (GBNF format). Default: no grammar
+     * This will override json_schema if both are provided.
     */
    grammar?: string;
    /**
@@ -268,7 +281,7 @@ export interface NativeCompletionResult {
     */
    reasoning_content: string;
    /**
-     * Tool calls
+     * Tool calls (parsed from response)
     */
    tool_calls: Array<{
        type: 'function';
@@ -483,6 +496,10 @@ export interface CompletionParams extends Omit<NativeCompletionParams, 'emit_par
     chatTemplate?: string;
    chat_template?: string;
    jinja?: boolean;
+    /**
+     * GBNF grammar for structured output. Takes precedence over json_schema.
+     */
+    grammar?: string;
    tools?: object;
    parallel_tool_calls?: object;
    tool_choice?: string;
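The new top-level `grammar` field lets a caller pass a GBNF grammar directly in `CompletionParams`, and per the updated docs it takes precedence over `json_schema` when both are set. A small sketch, assuming the completion entry point accepts a `CompletionParams` object and that a `prompt` field exists (the actual call site is not part of this hunk).

```typescript
import type { CompletionParams } from 'llama-cpp-capacitor';

// A tiny GBNF grammar that restricts output to "yes" or "no".
const yesNoGrammar = `root ::= ("yes" | "no")`;

// Illustrative only: `prompt` is an assumed field name; `grammar` and
// `json_schema` come from this diff. When both are set, `grammar` wins.
const params: CompletionParams = {
  prompt: 'Is the sky blue? Answer yes or no.', // assumed field name
  grammar: yesNoGrammar,
  // json_schema: '...', // would be ignored in favor of `grammar`
};
```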
@@ -648,6 +665,32 @@ export interface LlamaCppPlugin {
     releaseVocoder(options: {
        contextId: number;
    }): Promise<void>;
+    downloadModel(options: {
+        url: string;
+        filename: string;
+    }): Promise<string>;
+    getDownloadProgress(options: {
+        url: string;
+    }): Promise<{
+        progress: number;
+        completed: boolean;
+        failed: boolean;
+        errorMessage?: string;
+        localPath?: string;
+        downloadedBytes: number;
+        totalBytes: number;
+    }>;
+    cancelDownload(options: {
+        url: string;
+    }): Promise<boolean>;
+    getAvailableModels(): Promise<Array<{
+        name: string;
+        path: string;
+        size: number;
+    }>>;
+    convertJsonSchemaToGrammar(options: {
+        schema: string;
+    }): Promise<string>;
    addListener(eventName: string, listenerFunc: (data: any) => void): Promise<void>;
    removeAllListeners(eventName: string): Promise<void>;
}
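Taken together with the `grammar` completion field above, `convertJsonSchemaToGrammar` converts a JSON Schema into GBNF on the native side before a completion is started, and `getAvailableModels` enumerates models already on disk. A hedged usage sketch of these two plugin methods, again assuming the plugin is exported as `LlamaCpp` (the export name is not shown in this diff).

```typescript
// Sketch only: the `LlamaCpp` export name is assumed, not confirmed by this diff.
import { LlamaCpp } from 'llama-cpp-capacitor';

async function pickModelAndGrammar() {
  // List models already downloaded to the app's storage.
  const models = await LlamaCpp.getAvailableModels();
  for (const m of models) {
    console.log(`${m.name} at ${m.path} (${m.size} bytes)`);
  }

  // Convert a JSON Schema (passed as a string) into a GBNF grammar string,
  // which can then be supplied via the `grammar` completion parameter.
  const schema = JSON.stringify({
    type: 'object',
    properties: { answer: { type: 'string' } },
    required: ['answer'],
  });
  const grammar = await LlamaCpp.convertJsonSchemaToGrammar({ schema });

  return { models, grammar };
}
```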