@novastera-oss/llamarn 0.6.3 → 0.6.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -235,14 +235,36 @@ add_custom_command(TARGET RNLlamaCpp POST_BUILD
235
235
  COMMENT "Copying dependency libraries to build output directory"
236
236
  )
237
237
 
238
- # libggml-cpu.so is REQUIRED when GGML_BACKEND_DL=ON (CPU backend is dynamically loaded)
239
- # Copy it so it gets packaged into the APK
240
- add_custom_command(TARGET RNLlamaCpp POST_BUILD
241
- COMMAND ${CMAKE_COMMAND} -E copy_if_different
242
- ${JNI_LIBS_DIR}/${ANDROID_ABI}/libggml-cpu.so
243
- $<TARGET_FILE_DIR:RNLlamaCpp>/libggml-cpu.so
244
- COMMENT "Copying libggml-cpu.so (REQUIRED for CPU backend when GGML_BACKEND_DL=ON)"
245
- )
238
+ # CPU backend libraries: With GGML_CPU_ALL_VARIANTS, multiple variant libraries are built
239
+ # (e.g., libggml-cpu-android_armv8.0_1.so, libggml-cpu-android_armv8.2_1.so, etc.)
240
+ # The runtime loader will select the best variant based on CPU capabilities
241
+ # If variants don't exist, fall back to single libggml-cpu.so (backward compatibility)
242
+ file(GLOB CPU_VARIANT_LIBS "${JNI_LIBS_DIR}/${ANDROID_ABI}/libggml-cpu-*.so")
243
+ if(CPU_VARIANT_LIBS)
244
+ # Copy all CPU variant libraries (GGML_CPU_ALL_VARIANTS enabled)
245
+ foreach(CPU_VARIANT_LIB ${CPU_VARIANT_LIBS})
246
+ get_filename_component(CPU_VARIANT_LIB_NAME ${CPU_VARIANT_LIB} NAME)
247
+ add_custom_command(TARGET RNLlamaCpp POST_BUILD
248
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different
249
+ ${CPU_VARIANT_LIB}
250
+ $<TARGET_FILE_DIR:RNLlamaCpp>/${CPU_VARIANT_LIB_NAME}
251
+ COMMENT "Copying CPU variant library ${CPU_VARIANT_LIB_NAME} to build output directory"
252
+ )
253
+ endforeach()
254
+ list(LENGTH CPU_VARIANT_LIBS CPU_VARIANT_COUNT)
255
+ message(STATUS "Found ${CPU_VARIANT_COUNT} CPU variant libraries (GGML_CPU_ALL_VARIANTS enabled)")
256
+ elseif(EXISTS ${JNI_LIBS_DIR}/${ANDROID_ABI}/libggml-cpu.so)
257
+ # Fallback: Copy single libggml-cpu.so (backward compatibility)
258
+ add_custom_command(TARGET RNLlamaCpp POST_BUILD
259
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different
260
+ ${JNI_LIBS_DIR}/${ANDROID_ABI}/libggml-cpu.so
261
+ $<TARGET_FILE_DIR:RNLlamaCpp>/libggml-cpu.so
262
+ COMMENT "Copying libggml-cpu.so (single CPU backend - backward compatibility)"
263
+ )
264
+ message(STATUS "Found single libggml-cpu.so (backward compatibility mode)")
265
+ else()
266
+ message(WARNING "No CPU backend libraries found in ${JNI_LIBS_DIR}/${ANDROID_ABI}/ - CPU backend will not work!")
267
+ endif()
246
268
 
247
269
  # Also copy any optional GPU libraries if they exist
248
270
  if(EXISTS ${JNI_LIBS_DIR}/${ANDROID_ABI}/libggml-vulkan.so)
@@ -45,6 +45,129 @@
45
45
 
46
46
  namespace facebook::react {
47
47
 
48
+ // Helper function to load CPU variant libraries on Android
49
+ // On Android, ggml_backend_load_best() uses filesystem iteration which doesn't work
50
+ // with APK-packaged libraries. This function manually loads CPU variant libraries
51
+ // using dlopen() with just the library name - Android's linker finds them in the APK.
52
+ // We score each variant and only register the best compatible one (score > 0).
53
+ static void load_android_cpu_backends() {
54
+ #ifdef __ANDROID__
55
+ // Skip if CPU backend is already registered
56
+ if (ggml_backend_reg_by_name("CPU")) {
57
+ return;
58
+ }
59
+
60
+ // Try loading all CPU variant libraries (from most advanced to baseline)
61
+ // Score each one and register only the best compatible variant
62
+ static const char* cpu_variants[] = {
63
+ "libggml-cpu-android_armv8.6_1.so", // DOTPROD + FP16 + MATMUL_INT8
64
+ "libggml-cpu-android_armv8.2_2.so", // DOTPROD + FP16
65
+ "libggml-cpu-android_armv8.2_1.so", // DOTPROD
66
+ "libggml-cpu-android_armv8.0_1.so", // Baseline (emulator compatible)
67
+ nullptr
68
+ };
69
+
70
+ typedef ggml_backend_reg_t (*backend_init_fn_t)();
71
+ typedef int (*backend_score_t)();
72
+
73
+ int best_score = 0;
74
+ void* best_handle = nullptr;
75
+ backend_init_fn_t best_init = nullptr;
76
+
77
+ // Score all variants and find the best one
78
+ for (int i = 0; cpu_variants[i] != nullptr; i++) {
79
+ void* cpu_handle = dlopen(cpu_variants[i], RTLD_LAZY | RTLD_LOCAL);
80
+ if (cpu_handle) {
81
+ backend_score_t score_fn = (backend_score_t)dlsym(cpu_handle, "ggml_backend_score");
82
+ if (score_fn) {
83
+ int score = score_fn();
84
+ if (score > best_score) {
85
+ // Close previous best handle if we had one
86
+ if (best_handle) {
87
+ dlclose(best_handle);
88
+ }
89
+ best_score = score;
90
+ best_handle = cpu_handle;
91
+ best_init = (backend_init_fn_t)dlsym(cpu_handle, "ggml_backend_init");
92
+ } else {
93
+ // This variant is not better, close it
94
+ dlclose(cpu_handle);
95
+ }
96
+ } else {
97
+ // No score function, close it
98
+ dlclose(cpu_handle);
99
+ }
100
+ }
101
+ }
102
+
103
+ // Register the best variant if we found one
104
+ if (best_handle && best_init && best_score > 0) {
105
+ ggml_backend_reg_t cpu_backend = best_init();
106
+ if (cpu_backend) {
107
+ ggml_backend_register(cpu_backend);
108
+ // Keep the handle open on purpose: the registered backend's code lives in this library, so it must stay loaded
109
+ } else {
110
+ dlclose(best_handle);
111
+ }
112
+ }
113
+ #endif
114
+ }
115
+
116
+ // Helper function to load all Android backends manually
117
+ // On Android, ggml_backend_load_best() uses filesystem iteration which doesn't work
118
+ // with APK-packaged libraries. This function manually loads all backend libraries
119
+ // using dlopen() with just the library name - Android's linker finds them in the APK.
120
+ static void load_android_backends() {
121
+ #ifdef __ANDROID__
122
+ typedef ggml_backend_reg_t (*backend_init_fn_t)();
123
+
124
+ // Load Hexagon backend first (Snapdragon DSP) - more performant than Vulkan on Snapdragon devices
125
+ if (!ggml_backend_reg_by_name("HTP")) {
126
+ void* hexagon_handle = dlopen("libggml-hexagon.so", RTLD_LAZY | RTLD_LOCAL);
127
+ if (hexagon_handle) {
128
+ backend_init_fn_t backend_init = (backend_init_fn_t)dlsym(hexagon_handle, "ggml_backend_init");
129
+ if (backend_init) {
130
+ ggml_backend_reg_t hexagon_backend = backend_init();
131
+ if (hexagon_backend) {
132
+ ggml_backend_register(hexagon_backend);
133
+ }
134
+ }
135
+ }
136
+ }
137
+
138
+ // Load OpenCL backend
139
+ if (!ggml_backend_reg_by_name("OpenCL")) {
140
+ void* opencl_handle = dlopen("libggml-opencl.so", RTLD_LAZY | RTLD_LOCAL);
141
+ if (opencl_handle) {
142
+ backend_init_fn_t backend_init = (backend_init_fn_t)dlsym(opencl_handle, "ggml_backend_init");
143
+ if (backend_init) {
144
+ ggml_backend_reg_t opencl_backend = backend_init();
145
+ if (opencl_backend) {
146
+ ggml_backend_register(opencl_backend);
147
+ }
148
+ }
149
+ }
150
+ }
151
+
152
+ // Load Vulkan backend (disabled by default on Android due to emulator crashes, but try anyway)
153
+ if (!ggml_backend_reg_by_name("Vulkan")) {
154
+ void* vulkan_handle = dlopen("libggml-vulkan.so", RTLD_LAZY | RTLD_LOCAL);
155
+ if (vulkan_handle) {
156
+ backend_init_fn_t backend_init = (backend_init_fn_t)dlsym(vulkan_handle, "ggml_backend_init");
157
+ if (backend_init) {
158
+ ggml_backend_reg_t vulkan_backend = backend_init();
159
+ if (vulkan_backend) {
160
+ ggml_backend_register(vulkan_backend);
161
+ }
162
+ }
163
+ }
164
+ }
165
+
166
+ // Load CPU variant libraries (scoring system selects best compatible one)
167
+ load_android_cpu_backends();
168
+ #endif
169
+ }
170
+
48
171
  // Factory method implementation
49
172
  std::shared_ptr<TurboModule> PureCppImpl::create(std::shared_ptr<CallInvoker> jsInvoker) {
50
173
  return std::make_shared<PureCppImpl>(std::move(jsInvoker));
@@ -97,48 +220,20 @@ jsi::Value PureCppImpl::loadLlamaModelInfo(jsi::Runtime &runtime, jsi::String mo
97
220
 
98
221
  // Load all available backends (CPU is dynamically loaded when GGML_BACKEND_DL is enabled)
99
222
  // With GGML_BACKEND_DL=ON, ALL backends (CPU + GPU) are dynamically loaded
100
- // CPU backend is in libggml-cpu.so, GPU backends are in libggml-opencl.so, libggml-vulkan.so, libggml-hexagon.so
101
- // On Android, dlopen() can load libraries by name even from inside APKs
223
+ // When GGML_CPU_ALL_VARIANTS is enabled, CPU backend variants are:
224
+ // libggml-cpu-android_armv8.0_1.so (baseline - emulator compatible)
225
+ // libggml-cpu-android_armv8.2_1.so (DOTPROD)
226
+ // libggml-cpu-android_armv8.2_2.so (DOTPROD + FP16_VECTOR_ARITHMETIC)
227
+ // libggml-cpu-android_armv8.6_1.so (DOTPROD + FP16_VECTOR_ARITHMETIC + MATMUL_INT8)
228
+ // GPU backends are in libggml-opencl.so, libggml-vulkan.so, libggml-hexagon.so
229
+ // On Android, manually load all backends since filesystem iteration doesn't work
230
+ // with APK-packaged libraries. ggml_backend_load_all() will skip already loaded backends.
102
231
  #ifdef __ANDROID__
103
- // Load CPU backend directly - Android's linker will find it in the same directory
104
- // Check if already registered to avoid duplicate registration
105
- if (!ggml_backend_reg_by_name("CPU")) {
106
- void* cpu_handle = dlopen("libggml-cpu.so", RTLD_LAZY | RTLD_LOCAL);
107
- if (cpu_handle) {
108
- typedef ggml_backend_reg_t (*backend_init_fn_t)();
109
- backend_init_fn_t backend_init = (backend_init_fn_t)dlsym(cpu_handle, "ggml_backend_init");
110
- if (backend_init) {
111
- ggml_backend_reg_t cpu_backend = backend_init();
112
- if (cpu_backend) {
113
- ggml_backend_register(cpu_backend);
114
- }
115
- }
116
- }
117
- }
118
-
119
- // Load Hexagon backend (Snapdragon DSP) - more performant than Vulkan on Snapdragon devices
120
- // Load before other GPU backends to give it priority
121
- // Check if already registered to avoid duplicate registration
122
- if (!ggml_backend_reg_by_name("HTP")) {
123
- void* hexagon_handle = dlopen("libggml-hexagon.so", RTLD_LAZY | RTLD_LOCAL);
124
- if (hexagon_handle) {
125
- typedef ggml_backend_reg_t (*backend_init_fn_t)();
126
- backend_init_fn_t backend_init = (backend_init_fn_t)dlsym(hexagon_handle, "ggml_backend_init");
127
- if (backend_init) {
128
- ggml_backend_reg_t hexagon_backend = backend_init();
129
- if (hexagon_backend) {
130
- ggml_backend_register(hexagon_backend);
131
- }
132
- }
133
- }
134
- }
232
+ load_android_backends();
233
+ #endif
135
234
 
136
- // Load other GPU backends (OpenCL, Vulkan) if present - they will be found by name
137
- // ggml_backend_load_all() will skip backends that are already loaded
138
- ggml_backend_load_all();
139
- #else
235
+ // Load any remaining backends (ggml_backend_load_all will skip already loaded ones)
140
236
  ggml_backend_load_all();
141
- #endif
142
237
 
143
238
  // Verify at least CPU backend was loaded
144
239
  if (ggml_backend_reg_count() == 0) {
@@ -389,26 +484,20 @@ jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
389
484
 
390
485
  // Load all available backends (CPU is dynamically loaded when GGML_BACKEND_DL is enabled)
391
486
  // With GGML_BACKEND_DL=ON, ALL backends (CPU + GPU) are dynamically loaded
392
- // CPU backend is in libggml-cpu.so, GPU backends are in libggml-opencl.so, libggml-vulkan.so
487
+ // When GGML_CPU_ALL_VARIANTS is enabled, CPU backend variants are:
488
+ // libggml-cpu-android_armv8.0_1.so (baseline - emulator compatible)
489
+ // libggml-cpu-android_armv8.2_1.so (DOTPROD)
490
+ // libggml-cpu-android_armv8.2_2.so (DOTPROD + FP16_VECTOR_ARITHMETIC)
491
+ // libggml-cpu-android_armv8.6_1.so (DOTPROD + FP16_VECTOR_ARITHMETIC + MATMUL_INT8)
492
+ // GPU backends are in libggml-opencl.so, libggml-vulkan.so, libggml-hexagon.so
493
+ // On Android, manually load all backends since filesystem iteration doesn't work
494
+ // with APK-packaged libraries. ggml_backend_load_all() will skip already loaded backends.
393
495
  #ifdef __ANDROID__
394
- // Load CPU backend directly - Android's linker will find it in the same directory
395
- void* cpu_handle = dlopen("libggml-cpu.so", RTLD_LAZY | RTLD_LOCAL);
396
- if (cpu_handle) {
397
- typedef ggml_backend_reg_t (*backend_init_fn_t)();
398
- backend_init_fn_t backend_init = (backend_init_fn_t)dlsym(cpu_handle, "ggml_backend_init");
399
- if (backend_init) {
400
- ggml_backend_reg_t cpu_backend = backend_init();
401
- if (cpu_backend) {
402
- ggml_backend_register(cpu_backend);
403
- }
404
- }
405
- }
496
+ load_android_backends();
497
+ #endif
406
498
 
407
- // Load GPU backends (OpenCL, Vulkan) if present - they will be found by name
408
- ggml_backend_load_all();
409
- #else
499
+ // Load any remaining backends (ggml_backend_load_all will skip already loaded ones, including those loaded manually on Android)
410
500
  ggml_backend_load_all();
411
- #endif
412
501
 
413
502
  // Verify at least CPU backend was loaded
414
503
  if (ggml_backend_reg_count() == 0) {
@@ -549,12 +638,30 @@ jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
549
638
  // Now assign to the context
550
639
  selfPtr->rn_ctx_->params = rn_params;
551
640
 
552
- selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, params.chat_template);
641
+ // Initialize chat templates (matches server.cpp approach)
642
+ // common_chat_templates_init already has try-catch internally for template parsing errors,
643
+ // but exceptions can escape from chat_template constructor during capability detection.
644
+ // We catch all exceptions (not just std::exception) to handle any edge cases.
553
645
  try {
646
+ selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, params.chat_template);
647
+
648
+ // Validate template by trying to format an example (catches runtime errors like null lstrip)
649
+ // This is optional - if it fails, we still use the template anyway (it might work in practice)
650
+ try {
554
651
  common_chat_format_example(selfPtr->rn_ctx_->chat_templates.get(), params.use_jinja, params.default_template_kwargs);
555
- } catch (const std::exception & e) {
556
- // Fallback to chatml if the original template parsing fails
652
+ } catch (...) {
653
+ // Template validation failed, but continue anyway - the template might work in practice
654
+ // This preserves backward compatibility for models that were working before
655
+ }
656
+ } catch (...) {
657
+ // Template initialization failed - fallback to chatml (matches server.cpp behavior)
658
+ // Catch all exceptions (not just std::exception) to handle any edge cases
659
+ try {
557
660
  selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, "chatml");
661
+ } catch (...) {
662
+ // Even chatml failed - this should never happen, but handle it gracefully
663
+ // The model will still load, but chat templates won't work
664
+ }
558
665
  }
559
666
 
560
667
  // Schedule success callback on JS thread to create JSI objects
@@ -379,19 +379,46 @@ CompletionResult run_chat_completion(
379
379
  chat_msgs = common_chat_msgs_parse_oaicompat(data["messages"]);
380
380
  }
381
381
 
382
- // Apply template
382
+ // Apply template (matches server.cpp oaicompat_chat_params_parse approach)
383
383
  common_chat_templates_inputs template_inputs;
384
384
  template_inputs.messages = chat_msgs;
385
385
  template_inputs.add_generation_prompt = true;
386
386
  template_inputs.use_jinja = rn_ctx->params.use_jinja;
387
- // Note: extract_reasoning field doesn't exist in current llama.cpp version
388
- // template_inputs.extract_reasoning = true; // Default to true to extract reasoning content if available
387
+ template_inputs.reasoning_format = rn_ctx->params.reasoning_format;
388
+
389
+ // Set chat_template_kwargs from params (matches server.cpp line 712)
390
+ template_inputs.chat_template_kwargs = rn_ctx->params.default_template_kwargs;
391
+
392
+ // TODO: merge chat_template_kwargs from the request body once requests can carry them.
393
+ // For now, only the defaults from params are used.
394
+
395
+ // Parse enable_thinking from chat_template_kwargs (matches server.cpp lines 718-725)
396
+ auto enable_thinking_kwarg = template_inputs.chat_template_kwargs.find("enable_thinking");
397
+ if (enable_thinking_kwarg != template_inputs.chat_template_kwargs.end()) {
398
+ const std::string& value = enable_thinking_kwarg->second;
399
+ if (value == "true") {
400
+ template_inputs.enable_thinking = true;
401
+ } else if (value == "false") {
402
+ template_inputs.enable_thinking = false;
403
+ }
404
+ // else: use default (true)
405
+ }
389
406
 
390
407
  // Add grammar if present in options
391
408
  if (!options.grammar.empty()) {
392
409
  template_inputs.grammar = options.grammar;
393
410
  }
394
411
 
412
+ // Parse json_schema if present (matches server.cpp line 696)
413
+ if (data.contains("json_schema") && !data["json_schema"].is_null()) {
414
+ template_inputs.json_schema = data["json_schema"].dump();
415
+ }
416
+
417
+ // Check for conflicting grammar and json_schema (matches server.cpp lines 570-572)
418
+ if (!template_inputs.json_schema.empty() && !template_inputs.grammar.empty()) {
419
+ throw std::runtime_error("Cannot use both json_schema and grammar");
420
+ }
421
+
395
422
  // Parse tools if present
396
423
  if (data.contains("tools") && !data["tools"].empty()) {
397
424
  template_inputs.tools = common_chat_tools_parse_oaicompat(data["tools"]);
@@ -407,8 +434,20 @@ CompletionResult run_chat_completion(
407
434
  ? data["tool_choice"].get<std::string>()
408
435
  : data["tool_choice"].dump());
409
436
  }
437
+
438
+ // Parse parallel_tool_calls if present (matches server.cpp line 699)
439
+ if (data.contains("parallel_tool_calls")) {
440
+ template_inputs.parallel_tool_calls = data["parallel_tool_calls"].get<bool>();
441
+ }
442
+
443
+ // Check for conflicting tools and grammar (matches server.cpp lines 703-706)
444
+ if (!template_inputs.tools.empty() && template_inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
445
+ if (!template_inputs.grammar.empty()) {
446
+ throw std::runtime_error("Cannot use custom grammar constraints with tools.");
447
+ }
448
+ }
410
449
 
411
- // Apply template
450
+ // Apply template (matches server.cpp approach - no try-catch, exceptions propagate to outer handler)
412
451
  const auto& chat_params = common_chat_templates_apply(rn_ctx->chat_templates.get(), template_inputs);
413
452
 
414
453
  CompletionOptions cmpl_options = options;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@novastera-oss/llamarn",
3
- "version": "0.6.3",
3
+ "version": "0.6.7",
4
4
  "description": "An attempt at a pure cpp turbo module library",
5
5
  "source": "./src/index.tsx",
6
6
  "main": "./lib/module/index.js",