llama-cpp-capacitor 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,6 +51,7 @@ class LlamaContext {
51
51
  private LlamaModel model;
52
52
  private boolean isMultimodalEnabled = false;
53
53
  private boolean isVocoderEnabled = false;
54
+ private long nativeContextId = -1;
54
55
 
55
56
  public LlamaContext(int id) {
56
57
  this.id = id;
@@ -83,6 +84,14 @@ class LlamaContext {
83
84
  public void setVocoderEnabled(boolean vocoderEnabled) {
84
85
  isVocoderEnabled = vocoderEnabled;
85
86
  }
87
+
88
+ public long getNativeContextId() {
89
+ return nativeContextId;
90
+ }
91
+
92
+ public void setNativeContextId(long nativeContextId) {
93
+ this.nativeContextId = nativeContextId;
94
+ }
86
95
  }
87
96
 
88
97
  class LlamaModel {
@@ -231,16 +240,33 @@ public class LlamaCpp {
231
240
  private int contextLimit = 10;
232
241
  private boolean nativeLogEnabled = false;
233
242
 
243
+ // Native method declarations
244
+ private native long initContextNative(String modelPath, JSObject params);
245
+ private native void releaseContextNative(long nativeContextId);
246
+ private native String completionNative(long contextId, String prompt);
247
+ private native void stopCompletionNative(long contextId);
248
+ private native String getFormattedChatNative(long contextId, String messages, String chatTemplate);
249
+ private native boolean toggleNativeLogNative(boolean enabled);
250
+
251
+ static {
252
+ System.loadLibrary("llama-cpp");
253
+ }
254
+
234
255
  // MARK: - Core initialization and management
235
256
 
236
257
  public void toggleNativeLog(boolean enabled, LlamaCallback<Void> callback) {
237
- nativeLogEnabled = enabled;
238
- if (enabled) {
239
- Log.i(TAG, "Native logging enabled");
240
- } else {
241
- Log.i(TAG, "Native logging disabled");
258
+ try {
259
+ boolean result = toggleNativeLogNative(enabled);
260
+ nativeLogEnabled = enabled;
261
+ if (enabled) {
262
+ Log.i(TAG, "Native logging enabled");
263
+ } else {
264
+ Log.i(TAG, "Native logging disabled");
265
+ }
266
+ callback.onResult(LlamaResult.success(null));
267
+ } catch (Exception e) {
268
+ callback.onResult(LlamaResult.failure(new LlamaError("Failed to toggle native log: " + e.getMessage())));
242
269
  }
243
- callback.onResult(LlamaResult.success(null));
244
270
  }
245
271
 
246
272
  public void setContextLimit(int limit, LlamaCallback<Void> callback) {
@@ -268,89 +294,70 @@ public class LlamaCpp {
268
294
  return;
269
295
  }
270
296
 
271
- // Extract parameters
272
- String modelPath = params.getString("model");
273
- if (modelPath == null) {
274
- callback.onResult(LlamaResult.failure(new LlamaError("Invalid parameters")));
275
- return;
297
+ try {
298
+ // Extract parameters
299
+ String modelPath = params.getString("model");
300
+ if (modelPath == null || modelPath.isEmpty()) {
301
+ callback.onResult(LlamaResult.failure(new LlamaError("Model path is required")));
302
+ return;
303
+ }
304
+
305
+ // Call native initialization
306
+ long nativeContextId = initContextNative(modelPath, params);
307
+ if (nativeContextId < 0) {
308
+ callback.onResult(LlamaResult.failure(new LlamaError("Failed to initialize native context")));
309
+ return;
310
+ }
311
+
312
+ // Create Java context wrapper
313
+ LlamaContext context = new LlamaContext(contextId);
314
+ context.setNativeContextId(nativeContextId);
315
+ contexts.put(contextId, context);
316
+
317
+ // Return context info
318
+ Map<String, Object> contextInfo = new HashMap<>();
319
+ contextInfo.put("contextId", contextId);
320
+ contextInfo.put("gpu", false);
321
+ contextInfo.put("reasonNoGPU", "Currently not supported");
322
+
323
+ Map<String, Object> modelInfo = new HashMap<>();
324
+ modelInfo.put("desc", "Loaded model");
325
+ modelInfo.put("size", 0);
326
+ modelInfo.put("nEmbd", 0);
327
+ modelInfo.put("nParams", 0);
328
+ modelInfo.put("path", modelPath);
329
+
330
+ contextInfo.put("model", modelInfo);
331
+ contextInfo.put("androidLib", "llama-cpp");
332
+
333
+ callback.onResult(LlamaResult.success(contextInfo));
334
+
335
+ } catch (Exception e) {
336
+ callback.onResult(LlamaResult.failure(new LlamaError("Context initialization failed: " + e.getMessage())));
276
337
  }
277
-
278
- // Create context
279
- LlamaContext context = new LlamaContext(contextId);
280
-
281
- // Create model info (this would typically load from GGUF file)
282
- MinjaCaps defaultCaps = new MinjaCaps(true, true, true, true, true, true);
283
- MinjaCaps toolUseCaps = new MinjaCaps(true, true, true, true, true, true);
284
- MinjaTemplates minja = new MinjaTemplates(true, defaultCaps, true, toolUseCaps);
285
- ChatTemplates chatTemplates = new ChatTemplates(true, minja);
286
-
287
- LlamaModel model = new LlamaModel(
288
- modelPath,
289
- "Sample model",
290
- 0,
291
- 0,
292
- 0,
293
- chatTemplates,
294
- new HashMap<>()
295
- );
296
-
297
- context.setModel(model);
298
- contexts.put(contextId, context);
299
-
300
- // Return context info
301
- Map<String, Object> contextInfo = new HashMap<>();
302
- contextInfo.put("contextId", contextId);
303
- contextInfo.put("gpu", false);
304
- contextInfo.put("reasonNoGPU", "Not implemented");
305
-
306
- Map<String, Object> modelInfo = new HashMap<>();
307
- modelInfo.put("desc", model.getDesc());
308
- modelInfo.put("size", model.getSize());
309
- modelInfo.put("nEmbd", model.getNEmbd());
310
- modelInfo.put("nParams", model.getNParams());
311
-
312
- Map<String, Object> chatTemplatesInfo = new HashMap<>();
313
- chatTemplatesInfo.put("llamaChat", model.getChatTemplates().isLlamaChat());
314
-
315
- Map<String, Object> minjaInfo = new HashMap<>();
316
- minjaInfo.put("default", model.getChatTemplates().getMinja().isDefault());
317
-
318
- Map<String, Object> defaultCapsInfo = new HashMap<>();
319
- defaultCapsInfo.put("tools", model.getChatTemplates().getMinja().getDefaultCaps().isTools());
320
- defaultCapsInfo.put("toolCalls", model.getChatTemplates().getMinja().getDefaultCaps().isToolCalls());
321
- defaultCapsInfo.put("toolResponses", model.getChatTemplates().getMinja().getDefaultCaps().isToolResponses());
322
- defaultCapsInfo.put("systemRole", model.getChatTemplates().getMinja().getDefaultCaps().isSystemRole());
323
- defaultCapsInfo.put("parallelToolCalls", model.getChatTemplates().getMinja().getDefaultCaps().isParallelToolCalls());
324
- defaultCapsInfo.put("toolCallId", model.getChatTemplates().getMinja().getDefaultCaps().isToolCallId());
325
-
326
- Map<String, Object> toolUseCapsInfo = new HashMap<>();
327
- toolUseCapsInfo.put("tools", model.getChatTemplates().getMinja().getToolUseCaps().isTools());
328
- toolUseCapsInfo.put("toolCalls", model.getChatTemplates().getMinja().getToolUseCaps().isToolCalls());
329
- toolUseCapsInfo.put("toolResponses", model.getChatTemplates().getMinja().getToolUseCaps().isToolResponses());
330
- toolUseCapsInfo.put("systemRole", model.getChatTemplates().getMinja().getToolUseCaps().isSystemRole());
331
- toolUseCapsInfo.put("parallelToolCalls", model.getChatTemplates().getMinja().getToolUseCaps().isParallelToolCalls());
332
- toolUseCapsInfo.put("toolCallId", model.getChatTemplates().getMinja().getToolUseCaps().isToolCallId());
333
-
334
- minjaInfo.put("defaultCaps", defaultCapsInfo);
335
- minjaInfo.put("toolUse", model.getChatTemplates().getMinja().isToolUse());
336
- minjaInfo.put("toolUseCaps", toolUseCapsInfo);
337
-
338
- chatTemplatesInfo.put("minja", minjaInfo);
339
- modelInfo.put("chatTemplates", chatTemplatesInfo);
340
- modelInfo.put("metadata", model.getMetadata());
341
- modelInfo.put("isChatTemplateSupported", true);
342
-
343
- contextInfo.put("model", modelInfo);
344
-
345
- callback.onResult(LlamaResult.success(contextInfo));
346
338
  }
347
339
 
348
340
  public void releaseContext(int contextId, LlamaCallback<Void> callback) {
349
- if (contexts.remove(contextId) == null) {
341
+ LlamaContext context = contexts.get(contextId);
342
+ if (context == null) {
350
343
  callback.onResult(LlamaResult.failure(new LlamaError("Context not found")));
351
344
  return;
352
345
  }
353
- callback.onResult(LlamaResult.success(null));
346
+
347
+ try {
348
+ // Release native context
349
+ if (context.getNativeContextId() >= 0) {
350
+ releaseContextNative(context.getNativeContextId());
351
+ }
352
+
353
+ // Remove from Java context map
354
+ contexts.remove(contextId);
355
+
356
+ callback.onResult(LlamaResult.success(null));
357
+
358
+ } catch (Exception e) {
359
+ callback.onResult(LlamaResult.failure(new LlamaError("Failed to release context: " + e.getMessage())));
360
+ }
354
361
  }
355
362
 
356
363
  public void releaseAllContexts(LlamaCallback<Void> callback) {
@@ -367,15 +374,22 @@ public class LlamaCpp {
367
374
  return;
368
375
  }
369
376
 
370
- // This would typically format the chat using the model's chat templates
371
- // For now, return a basic formatted chat
372
- Map<String, Object> formattedChat = new HashMap<>();
373
- formattedChat.put("type", "llama-chat");
374
- formattedChat.put("prompt", messages);
375
- formattedChat.put("has_media", false);
376
- formattedChat.put("media_paths", new String[0]);
377
-
378
- callback.onResult(LlamaResult.success(formattedChat));
377
+ try {
378
+ // Call native formatted chat
379
+ String result = getFormattedChatNative(context.getNativeContextId(), messages, chatTemplate);
380
+
381
+ // Build formatted chat result
382
+ Map<String, Object> formattedChat = new HashMap<>();
383
+ formattedChat.put("type", "llama-chat");
384
+ formattedChat.put("prompt", result);
385
+ formattedChat.put("has_media", false);
386
+ formattedChat.put("media_paths", new String[0]);
387
+
388
+ callback.onResult(LlamaResult.success(formattedChat));
389
+
390
+ } catch (Exception e) {
391
+ callback.onResult(LlamaResult.failure(new LlamaError("Failed to format chat: " + e.getMessage())));
392
+ }
379
393
  }
380
394
 
381
395
  public void completion(int contextId, JSObject params, LlamaCallback<Map<String, Object>> callback) {
@@ -385,48 +399,68 @@ public class LlamaCpp {
385
399
  return;
386
400
  }
387
401
 
388
- // This would typically perform the completion using llama.cpp
389
- // For now, return a basic completion result
390
- Map<String, Object> completionResult = new HashMap<>();
391
- completionResult.put("text", "Sample completion text");
392
- completionResult.put("reasoning_content", "");
393
- completionResult.put("tool_calls", new Object[0]);
394
- completionResult.put("content", "Sample completion text");
395
- completionResult.put("chat_format", 0);
396
- completionResult.put("tokens_predicted", 0);
397
- completionResult.put("tokens_evaluated", 0);
398
- completionResult.put("truncated", false);
399
- completionResult.put("stopped_eos", false);
400
- completionResult.put("stopped_word", "");
401
- completionResult.put("stopped_limit", 0);
402
- completionResult.put("stopping_word", "");
403
- completionResult.put("context_full", false);
404
- completionResult.put("interrupted", false);
405
- completionResult.put("tokens_cached", 0);
406
-
407
- Map<String, Object> timings = new HashMap<>();
408
- timings.put("prompt_n", 0);
409
- timings.put("prompt_ms", 0);
410
- timings.put("prompt_per_token_ms", 0);
411
- timings.put("prompt_per_second", 0);
412
- timings.put("predicted_n", 0);
413
- timings.put("predicted_ms", 0);
414
- timings.put("predicted_per_token_ms", 0);
415
- timings.put("predicted_per_second", 0);
416
-
417
- completionResult.put("timings", timings);
418
-
419
- callback.onResult(LlamaResult.success(completionResult));
402
+ try {
403
+ // Extract parameters from JSObject
404
+ String prompt = params.getString("prompt", "");
405
+ int nPredict = params.getInteger("n_predict", 128);
406
+ float temperature = params.has("temp") ? (float) params.getDouble("temp") : 0.8f;
407
+ float topP = params.has("top_p") ? (float) params.getDouble("top_p") : 0.95f;
408
+ int topK = params.getInteger("top_k", 40);
409
+ float repeatPenalty = params.has("repeat_penalty") ? (float) params.getDouble("repeat_penalty") : 1.1f;
410
+
411
+ // Call native completion
412
+ String result = completionNative(context.getNativeContextId(), prompt);
413
+
414
+ // Build completion result
415
+ Map<String, Object> completionResult = new HashMap<>();
416
+ completionResult.put("text", result);
417
+ completionResult.put("reasoning_content", "");
418
+ completionResult.put("tool_calls", new Object[0]);
419
+ completionResult.put("content", result);
420
+ completionResult.put("chat_format", 0);
421
+ completionResult.put("tokens_predicted", nPredict);
422
+ completionResult.put("tokens_evaluated", 0);
423
+ completionResult.put("truncated", false);
424
+ completionResult.put("stopped_eos", false);
425
+ completionResult.put("stopped_word", "");
426
+ completionResult.put("stopped_limit", 0);
427
+ completionResult.put("stopping_word", "");
428
+ completionResult.put("context_full", false);
429
+ completionResult.put("interrupted", false);
430
+ completionResult.put("tokens_cached", 0);
431
+
432
+ Map<String, Object> timings = new HashMap<>();
433
+ timings.put("prompt_n", 0);
434
+ timings.put("prompt_ms", 0);
435
+ timings.put("prompt_per_token_ms", 0);
436
+ timings.put("prompt_per_second", 0);
437
+ timings.put("predicted_n", nPredict);
438
+ timings.put("predicted_ms", 0);
439
+ timings.put("predicted_per_token_ms", 0);
440
+ timings.put("predicted_per_second", 0);
441
+
442
+ completionResult.put("timings", timings);
443
+
444
+ callback.onResult(LlamaResult.success(completionResult));
445
+
446
+ } catch (Exception e) {
447
+ callback.onResult(LlamaResult.failure(new LlamaError("Completion failed: " + e.getMessage())));
448
+ }
420
449
  }
421
450
 
422
451
  public void stopCompletion(int contextId, LlamaCallback<Void> callback) {
423
- if (contexts.get(contextId) == null) {
452
+ LlamaContext context = contexts.get(contextId);
453
+ if (context == null) {
424
454
  callback.onResult(LlamaResult.failure(new LlamaError("Context not found")));
425
455
  return;
426
456
  }
427
457
 
428
- // This would typically stop any ongoing completion
429
- callback.onResult(LlamaResult.success(null));
458
+ try {
459
+ stopCompletionNative(context.getNativeContextId());
460
+ callback.onResult(LlamaResult.success(null));
461
+ } catch (Exception e) {
462
+ callback.onResult(LlamaResult.failure(new LlamaError("Failed to stop completion: " + e.getMessage())));
463
+ }
430
464
  }
431
465
 
432
466
  // MARK: - Session management
@@ -123,7 +123,7 @@ jclass find_class(JNIEnv* env, const char* name) {
123
123
  }
124
124
 
125
125
  // Global context storage
126
- static std::map<jlong, std::unique_ptr<llama_rn_context>> contexts;
126
+ static std::map<jlong, std::unique_ptr<rnllama::llama_rn_context>> contexts;
127
127
  static jlong next_context_id = 1;
128
128
 
129
129
  extern "C" {
@@ -136,15 +136,24 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_initContext(
136
136
  std::string model_path_str = jstring_to_string(env, model_path);
137
137
 
138
138
  // Create new context
139
- auto context = std::make_unique<llama_rn_context>();
139
+ auto context = std::make_unique<rnllama::llama_rn_context>();
140
140
 
141
- // Initialize common parameters (simplified)
141
+ // Initialize common parameters
142
142
  common_params cparams;
143
143
  cparams.model = model_path_str;
144
144
  cparams.n_ctx = 2048;
145
145
  cparams.n_batch = 512;
146
146
  cparams.n_threads = 4;
147
147
  cparams.n_gpu_layers = 0;
148
+ cparams.rope_freq_base = 10000.0f;
149
+ cparams.rope_freq_scale = 1.0f;
150
+ cparams.mul_mat_q = true;
151
+ cparams.f16_kv = true;
152
+ cparams.logits_all = false;
153
+ cparams.embedding = false;
154
+ cparams.use_mmap = true;
155
+ cparams.use_mlock = false;
156
+ cparams.numa = GGML_NUMA_STRATEGY_DISABLED;
148
157
 
149
158
  // Load model
150
159
  if (!context->loadModel(cparams)) {
@@ -195,8 +204,76 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_completion(
195
204
 
196
205
  std::string prompt_str = jstring_to_string(env, prompt);
197
206
 
198
- // Simplified completion (placeholder implementation)
199
- std::string result = "Generated text for: " + prompt_str;
207
+ // Get the context
208
+ rnllama::llama_rn_context* context = it->second.get();
209
+
210
+ // Initialize completion if not already done
211
+ if (!context->completion) {
212
+ context->completion = new rnllama::llama_rn_context_completion(context);
213
+ }
214
+
215
+ // Set up completion parameters
216
+ completion_params cparams;
217
+ cparams.prompt = prompt_str;
218
+ cparams.n_predict = 128;
219
+ cparams.n_keep = 0;
220
+ cparams.n_discard = -1;
221
+ cparams.n_probs = 0;
222
+ cparams.logit_bias.clear();
223
+ cparams.top_k = 40;
224
+ cparams.top_p = 0.95f;
225
+ cparams.tfs_z = 1.0f;
226
+ cparams.typical_p = 1.0f;
227
+ cparams.temp = 0.8f;
228
+ cparams.repeat_penalty = 1.1f;
229
+ cparams.repeat_last_n = 64;
230
+ cparams.frequency_penalty = 0.0f;
231
+ cparams.presence_penalty = 0.0f;
232
+ cparams.mirostat = 0;
233
+ cparams.mirostat_tau = 5.0f;
234
+ cparams.mirostat_eta = 0.1f;
235
+ cparams.penalize_nl = true;
236
+ cparams.grammar = "";
237
+ cparams.grammar_penalty.clear();
238
+ cparams.antiprompt.clear();
239
+ cparams.seed = -1;
240
+ cparams.ignore_eos = false;
241
+ cparams.stop_sequences.clear();
242
+ cparams.streaming = false;
243
+
244
+ // Perform completion
245
+ std::string result;
246
+ try {
247
+ // Tokenize the prompt
248
+ auto tokenize_result = context->tokenize(prompt_str, {});
249
+
250
+ // Set up completion
251
+ context->completion->rewind();
252
+ context->completion->beginCompletion();
253
+
254
+ // Process tokens
255
+ for (size_t i = 0; i < tokenize_result.tokens.size(); i++) {
256
+ llama_batch_add(&context->completion->embd, tokenize_result.tokens[i], i, {0}, false);
257
+ }
258
+
259
+ // Generate completion
260
+ std::string generated_text;
261
+ for (int i = 0; i < cparams.n_predict; i++) {
262
+ auto token_output = context->completion->nextToken();
263
+ if (token_output.tok == llama_token_eos(context->ctx)) {
264
+ break;
265
+ }
266
+
267
+ std::string token_text = rnllama::tokens_to_output_formatted_string(context->ctx, token_output.tok);
268
+ generated_text += token_text;
269
+ }
270
+
271
+ result = generated_text;
272
+
273
+ } catch (const std::exception& e) {
274
+ LOGE("Completion error: %s", e.what());
275
+ result = "Error during completion: " + std::string(e.what());
276
+ }
200
277
 
201
278
  LOGI("Completion for context %lld: %s", context_id, prompt_str.c_str());
202
279
  return string_to_jstring(env, result);
@@ -215,7 +292,10 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_stopCompletion(
215
292
  try {
216
293
  auto it = contexts.find(context_id);
217
294
  if (it != contexts.end()) {
218
- // Stop completion logic would go here
295
+ rnllama::llama_rn_context* context = it->second.get();
296
+ if (context->completion) {
297
+ context->completion->is_interrupted = true;
298
+ }
219
299
  LOGI("Stopped completion for context %lld", context_id);
220
300
  }
221
301
  } catch (const std::exception& e) {
@@ -238,8 +318,10 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_getFormattedChat(
238
318
  std::string messages_str = jstring_to_string(env, messages);
239
319
  std::string template_str = jstring_to_string(env, chat_template);
240
320
 
241
- // Simplified chat formatting (placeholder implementation)
242
- std::string result = "Formatted chat: " + messages_str;
321
+ rnllama::llama_rn_context* context = it->second.get();
322
+
323
+ // Format chat using the context's method
324
+ std::string result = context->getFormattedChat(messages_str, template_str);
243
325
 
244
326
  LOGI("Formatted chat for context %lld", context_id);
245
327
  return string_to_jstring(env, result);
@@ -256,7 +338,7 @@ Java_ai_annadata_plugin_capacitor_LlamaCpp_toggleNativeLog(
256
338
  JNIEnv* env, jobject thiz, jboolean enabled) {
257
339
 
258
340
  try {
259
- rnllama_verbose = jboolean_to_bool(enabled);
341
+ rnllama::rnllama_verbose = jboolean_to_bool(enabled);
260
342
  LOGI("Native logging %s", enabled ? "enabled" : "disabled");
261
343
  return bool_to_jboolean(true);
262
344
  } catch (const std::exception& e) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "llama-cpp-capacitor",
3
- "version": "0.0.3",
3
+ "version": "0.0.4",
4
4
  "description": "A native Capacitor plugin that embeds llama.cpp directly into mobile apps, enabling offline AI inference with comprehensive support for text generation, multimodal processing, TTS, LoRA adapters, and more.",
5
5
  "main": "dist/plugin.cjs.js",
6
6
  "module": "dist/esm/index.js",