cui-llama.rn 1.2.6 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +26 -6
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +228 -40
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/amx/amx.cpp +196 -0
  9. package/cpp/amx/amx.h +20 -0
  10. package/cpp/amx/common.h +101 -0
  11. package/cpp/amx/mmq.cpp +2524 -0
  12. package/cpp/amx/mmq.h +16 -0
  13. package/cpp/common.cpp +118 -251
  14. package/cpp/common.h +53 -30
  15. package/cpp/ggml-aarch64.c +46 -3395
  16. package/cpp/ggml-aarch64.h +0 -20
  17. package/cpp/ggml-alloc.c +6 -8
  18. package/cpp/ggml-backend-impl.h +33 -11
  19. package/cpp/ggml-backend-reg.cpp +423 -0
  20. package/cpp/ggml-backend.cpp +14 -676
  21. package/cpp/ggml-backend.h +46 -9
  22. package/cpp/ggml-common.h +6 -0
  23. package/cpp/ggml-cpu-aarch64.c +3823 -0
  24. package/cpp/ggml-cpu-aarch64.h +32 -0
  25. package/cpp/ggml-cpu-impl.h +14 -242
  26. package/cpp/ggml-cpu-quants.c +10835 -0
  27. package/cpp/ggml-cpu-quants.h +63 -0
  28. package/cpp/ggml-cpu.c +13971 -13720
  29. package/cpp/ggml-cpu.cpp +715 -0
  30. package/cpp/ggml-cpu.h +65 -63
  31. package/cpp/ggml-impl.h +285 -25
  32. package/cpp/ggml-metal.h +8 -8
  33. package/cpp/ggml-metal.m +1221 -728
  34. package/cpp/ggml-quants.c +189 -10681
  35. package/cpp/ggml-quants.h +78 -125
  36. package/cpp/ggml-threading.cpp +12 -0
  37. package/cpp/ggml-threading.h +12 -0
  38. package/cpp/ggml.c +688 -1460
  39. package/cpp/ggml.h +58 -244
  40. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  41. package/cpp/json.hpp +24766 -24766
  42. package/cpp/llama-sampling.cpp +5 -2
  43. package/cpp/llama.cpp +409 -123
  44. package/cpp/llama.h +8 -4
  45. package/cpp/rn-llama.hpp +89 -25
  46. package/cpp/sampling.cpp +42 -3
  47. package/cpp/sampling.h +22 -1
  48. package/cpp/sgemm.cpp +608 -0
  49. package/cpp/speculative.cpp +270 -0
  50. package/cpp/speculative.h +28 -0
  51. package/cpp/unicode.cpp +11 -0
  52. package/ios/RNLlama.mm +43 -20
  53. package/ios/RNLlamaContext.h +9 -3
  54. package/ios/RNLlamaContext.mm +146 -33
  55. package/jest/mock.js +0 -1
  56. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  57. package/lib/commonjs/grammar.js +4 -2
  58. package/lib/commonjs/grammar.js.map +1 -1
  59. package/lib/commonjs/index.js +52 -15
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/module/NativeRNLlama.js.map +1 -1
  62. package/lib/module/grammar.js +2 -1
  63. package/lib/module/grammar.js.map +1 -1
  64. package/lib/module/index.js +51 -15
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/typescript/NativeRNLlama.d.ts +122 -8
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  68. package/lib/typescript/grammar.d.ts +5 -6
  69. package/lib/typescript/grammar.d.ts.map +1 -1
  70. package/lib/typescript/index.d.ts +15 -6
  71. package/lib/typescript/index.d.ts.map +1 -1
  72. package/package.json +2 -1
  73. package/src/NativeRNLlama.ts +135 -13
  74. package/src/grammar.ts +10 -8
  75. package/src/index.ts +104 -28
package/cpp/speculative.cpp ADDED
@@ -0,0 +1,270 @@
+#include "speculative.h"
+
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+
+#include <cstring>
+
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
+#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
+
+struct common_speculative {
+    struct llama_context * ctx;
+    struct common_sampler * smpl;
+
+    llama_batch batch;
+    llama_tokens prompt;
+};
+
+struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_dft) {
+    auto * result = new common_speculative {
+        /* .ctx    = */ ctx_dft,
+        /* .smpl   = */ nullptr,
+        /* .batch  = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
+        /* .prompt = */ {},
+    };
+
+    // TODO: optimize or pass from outside?
+#if 0
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 40;
+        params.top_p = 0.9;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+            COMMON_SAMPLER_TYPE_TOP_P,
+            COMMON_SAMPLER_TYPE_INFILL,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#else
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 10;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#endif
+
+    return result;
+}
+
+void common_speculative_free(struct common_speculative * spec) {
+    common_sampler_free(spec->smpl);
+
+    llama_batch_free(spec->batch);
+
+    delete spec;
+}
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft) {
+    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
+    const struct llama_model * model_dft = llama_get_model(ctx_dft);
+
+    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
+
+    const bool vocab_type_dft = llama_vocab_type(model_dft);
+    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
+
+    if (vocab_type_tgt != vocab_type_dft) {
+        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
+                "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
+        return false;
+    }
+
+    if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
+        llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
+        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
+        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
+        return false;
+    }
+
+    {
+        const int n_vocab_tgt = llama_n_vocab(model_tgt);
+        const int n_vocab_dft = llama_n_vocab(model_dft);
+
+        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
+
+        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
+            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
+                    "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+            return false;
+        }
+
+        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
+            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
+            const char * token_text_dft = llama_token_get_text(model_dft, i);
+            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
+                LOG_ERR("%s: draft model vocab must match target model to use speculation but "
+                        "token %d content differs - target '%s', draft '%s'\n", __func__, i,
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+llama_tokens common_speculative_gen_draft(
+        struct common_speculative * spec,
+        struct common_speculative_params params,
+        const llama_tokens & prompt_tgt,
+        llama_token id_last) {
+    auto & batch  = spec->batch;
+    auto & ctx    = spec->ctx;
+    auto & smpl   = spec->smpl;
+    auto & prompt = spec->prompt;
+
+    int reuse_i = 0;
+    int reuse_n = 0;
+
+    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
+
+    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
+
+    // reuse as much as possible from the old draft context
+    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
+    for (int i = 0; i < (int) prompt.size(); ++i) {
+        int cur = 0;
+        while (i_start + cur < (int) prompt_tgt.size() &&
+               i       + cur < (int) prompt.size() &&
+               prompt_tgt[i_start + cur] == prompt[i + cur]) {
+            cur++;
+        }
+
+        if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
+            reuse_i = i;
+            reuse_n = cur;
+        }
+    }
+
+    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
+
+    llama_tokens result;
+    result.reserve(params.n_draft);
+
+    if (reuse_n == 0) {
+        llama_kv_cache_clear(ctx);
+
+        prompt.clear();
+    } else {
+        // this happens when a previous draft has been discarded (for example, due to being too small), but the
+        // target model agreed with it. in this case, we simply pass back the previous results to save compute
+        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
+            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
+                result.push_back(prompt[i]);
+
+                if (params.n_draft <= (int) result.size()) {
+                    break;
+                }
+            }
+
+            return result;
+        }
+
+        if (reuse_i > 0) {
+            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
+            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+
+            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
+        }
+
+        if (reuse_n < (int) prompt.size()) {
+            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+
+            prompt.erase(prompt.begin() + reuse_n, prompt.end());
+        }
+    }
+
+    // prepare a batch to evaluate any new tokens in the prompt
+    common_batch_clear(batch);
+
+    for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
+        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
+        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
+
+        prompt.push_back(prompt_tgt[i]);
+    }
+
+    // we should rarely end-up here during normal decoding
+    if (batch.n_tokens > 0) {
+        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
+
+        llama_decode(ctx, batch);
+    }
+
+    const llama_pos n_past = prompt.size();
+
+    LOG_DBG("%s: n_past = %d\n", __func__, n_past);
+
+    common_batch_clear(batch);
+    common_batch_add (batch, id_last, n_past, { 0 }, true);
+
+    prompt.push_back(id_last);
+
+    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
+
+    llama_decode(ctx, batch);
+
+    common_sampler_reset(smpl);
+
+    // sample n_draft tokens from the draft model
+    for (int i = 0; i < params.n_draft; ++i) {
+        common_batch_clear(batch);
+
+        common_sampler_sample(smpl, ctx, 0, true);
+
+        const auto * cur_p = common_sampler_get_candidates(smpl);
+
+        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
+        }
+
+        // add drafted token for each sequence
+        const llama_token id = cur_p->data[0].id;
+
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
+        common_sampler_accept(smpl, id, true);
+
+        result.push_back(id);
+
+        if (params.n_draft <= (int) result.size()) {
+            break;
+        }
+
+        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
+
+        // evaluate the drafted tokens on the draft model
+        llama_decode(ctx, batch);
+
+        prompt.push_back(id);
+    }
+
+    return result;
+}
package/cpp/speculative.h ADDED
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "llama.h"
+#include "common.h"
+
+struct common_speculative;
+
+struct common_speculative_params {
+    int n_draft = 16;  // max drafted tokens
+    int n_reuse = 256;
+
+    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+};
+
+struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+
+void common_speculative_free(struct common_speculative * spec);
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft);
+
+// sample up to n_draft tokens and add them to the batch using the draft model
+llama_tokens common_speculative_gen_draft(
+        struct common_speculative * spec,
+        struct common_speculative_params params,
+        const llama_tokens & prompt,
+        llama_token id_last);
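
The two new files above add a small speculative-decoding helper: a draft context proposes up to n_draft high-confidence tokens, which the caller then verifies against the target model. Below is a minimal caller-side sketch, not taken from the package: it assumes ctx_tgt and ctx_dft are already-created contexts for compatible target and draft models, prompt_tgt holds the target-side token history, and the wrapper name draft_next_tokens plus the parameter values are purely illustrative.

#include "common.h"
#include "llama.h"
#include "speculative.h"

// Hypothetical wrapper: propose up to 8 draft tokens for the target model to verify.
static llama_tokens draft_next_tokens(llama_context * ctx_tgt,
                                      llama_context * ctx_dft,
                                      const llama_tokens & prompt_tgt,
                                      llama_token id_last) {
    // refuse to speculate if the vocabularies or special tokens diverge too much
    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
        return {};
    }

    common_speculative * spec = common_speculative_init(ctx_dft);

    common_speculative_params params;
    params.n_draft = 8;    // cap the number of drafted tokens (illustrative value)
    params.p_min   = 0.9f; // keep only high-confidence candidates

    // tokens proposed by the draft model; the caller accepts the prefix that the
    // target model agrees with and discards the rest
    llama_tokens draft = common_speculative_gen_draft(spec, params, prompt_tgt, id_last);

    common_speculative_free(spec);
    return draft;
}

In real use the common_speculative object would be kept alive across calls, since common_speculative_gen_draft reuses the draft KV cache from the previous prompt (controlled by params.n_reuse) instead of re-decoding it each time.
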
package/cpp/unicode.cpp CHANGED
@@ -201,7 +201,18 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
 }
 
 static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+#if defined(__clang__)
+    // disable C++17 deprecation warning for std::codecvt_utf8
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
     return conv.from_bytes(s);
 }
 
package/ios/RNLlama.mm CHANGED
@@ -21,10 +21,25 @@ RCT_EXPORT_METHOD(setContextLimit:(double)limit
     resolve(nil);
 }
 
-RCT_EXPORT_METHOD(initContext:(NSDictionary *)contextParams
+RCT_EXPORT_METHOD(modelInfo:(NSString *)path
+                 withSkip:(NSArray *)skip
                  withResolver:(RCTPromiseResolveBlock)resolve
                  withRejecter:(RCTPromiseRejectBlock)reject)
 {
+    resolve([RNLlamaContext modelInfo:path skip:skip]);
+}
+
+RCT_EXPORT_METHOD(initContext:(double)contextId
+                 withContextParams:(NSDictionary *)contextParams
+                 withResolver:(RCTPromiseResolveBlock)resolve
+                 withRejecter:(RCTPromiseRejectBlock)reject)
+{
+    NSNumber *contextIdNumber = [NSNumber numberWithDouble:contextId];
+    if (llamaContexts[contextIdNumber] != nil) {
+        reject(@"llama_error", @"Context already exists", nil);
+        return;
+    }
+
     if (llamaDQueue == nil) {
         llamaDQueue = dispatch_queue_create("com.rnllama", DISPATCH_QUEUE_SERIAL);
     }
@@ -38,23 +53,27 @@ RCT_EXPORT_METHOD(initContext:(NSDictionary *)contextParams
         return;
     }
 
-    RNLlamaContext *context = [RNLlamaContext initWithParams:contextParams];
-    if (![context isModelLoaded]) {
-        reject(@"llama_cpp_error", @"Failed to load the model", nil);
-        return;
+    @try {
+        RNLlamaContext *context = [RNLlamaContext initWithParams:contextParams onProgress:^(unsigned int progress) {
+            dispatch_async(dispatch_get_main_queue(), ^{
+                [self sendEventWithName:@"@RNLlama_onInitContextProgress" body:@{ @"contextId": @(contextId), @"progress": @(progress) }];
+            });
+        }];
+        if (![context isModelLoaded]) {
+            reject(@"llama_cpp_error", @"Failed to load the model", nil);
+            return;
+        }
+
+        [llamaContexts setObject:context forKey:contextIdNumber];
+
+        resolve(@{
+            @"gpu": @([context isMetalEnabled]),
+            @"reasonNoGPU": [context reasonNoMetal],
+            @"model": [context modelInfo],
+        });
+    } @catch (NSException *exception) {
+        reject(@"llama_cpp_error", exception.reason, nil);
     }
-
-    double contextId = (double) arc4random_uniform(1000000);
-
-    NSNumber *contextIdNumber = [NSNumber numberWithDouble:contextId];
-    [llamaContexts setObject:context forKey:contextIdNumber];
-
-    resolve(@{
-        @"contextId": contextIdNumber,
-        @"gpu": @([context isMetalEnabled]),
-        @"reasonNoGPU": [context reasonNoMetal],
-        @"model": [context modelInfo],
-    });
 }
 
 RCT_EXPORT_METHOD(getFormattedChat:(double)contextId
@@ -125,6 +144,7 @@ RCT_EXPORT_METHOD(saveSession:(double)contextId
 
 - (NSArray *)supportedEvents {
   return@[
+    @"@RNLlama_onInitContextProgress",
     @"@RNLlama_onToken",
   ];
 }
@@ -213,6 +233,7 @@ RCT_EXPORT_METHOD(detokenize:(double)contextId
 
 RCT_EXPORT_METHOD(embedding:(double)contextId
                  text:(NSString *)text
+                 params:(NSDictionary *)params
                  withResolver:(RCTPromiseResolveBlock)resolve
                  withRejecter:(RCTPromiseRejectBlock)reject)
 {
@@ -222,9 +243,8 @@ RCT_EXPORT_METHOD(embedding:(double)contextId
         return;
     }
     @try {
-        NSMutableArray *embedding = [context embedding:text];
-        resolve(@{ @"embedding": embedding });
-        [embedding release];
+        NSDictionary *embedding = [context embedding:text params:params];
+        resolve(embedding);
     } @catch (NSException *exception) {
         reject(@"llama_cpp_error", exception.reason, nil);
     }
@@ -260,6 +280,9 @@ RCT_EXPORT_METHOD(releaseContext:(double)contextId
         reject(@"llama_error", @"Context not found", nil);
         return;
     }
+    if (![context isModelLoaded]) {
+        [context interruptLoad];
+    }
     [context stopCompletion];
     dispatch_barrier_sync(llamaDQueue, ^{});
     [context invalidate];
package/ios/RNLlamaContext.h CHANGED
@@ -1,18 +1,24 @@
 #ifdef __cplusplus
 #import "llama.h"
+#import "llama-impl.h"
+#import "ggml.h"
 #import "rn-llama.hpp"
 #endif
 
 
 @interface RNLlamaContext : NSObject {
     bool is_metal_enabled;
-    NSString * reason_no_metal;
     bool is_model_loaded;
+    NSString * reason_no_metal;
+
+    void (^onProgress)(unsigned int progress);
 
     rnllama::llama_rn_context * llama;
 }
 
-+ (instancetype)initWithParams:(NSDictionary *)params;
++ (NSDictionary *)modelInfo:(NSString *)path skip:(NSArray *)skip;
++ (instancetype)initWithParams:(NSDictionary *)params onProgress:(void (^)(unsigned int progress))onProgress;
+- (void)interruptLoad;
 - (bool)isMetalEnabled;
 - (NSString *)reasonNoMetal;
 - (NSDictionary *)modelInfo;
@@ -22,7 +28,7 @@
 - (void)stopCompletion;
 - (NSArray *)tokenize:(NSString *)text;
 - (NSString *)detokenize:(NSArray *)tokens;
-- (NSArray *)embedding:(NSString *)text;
+- (NSDictionary *)embedding:(NSString *)text params:(NSDictionary *)params;
 - (NSString *)getFormattedChat:(NSArray *)messages withTemplate:(NSString *)chatTemplate;
 - (NSDictionary *)loadSession:(NSString *)path;
 - (int)saveSession:(NSString *)path size:(int)size;