@novastera-oss/llamarn 0.6.3 → 0.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/CMakeLists.txt +30 -8
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-hexagon.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-htp-v73.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-htp-v75.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-htp-v79.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-htp-v81.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/PureCppImpl.cpp +166 -59
- package/cpp/rn-completion.cpp +43 -4
- package/package.json +1 -1
package/android/CMakeLists.txt
CHANGED
|
@@ -235,14 +235,36 @@ add_custom_command(TARGET RNLlamaCpp POST_BUILD
|
|
|
235
235
|
COMMENT "Copying dependency libraries to build output directory"
|
|
236
236
|
)
|
|
237
237
|
|
|
238
|
-
#
|
|
239
|
-
#
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
)
|
|
238
|
+
# CPU backend libraries.
#
# A GGML_CPU_ALL_VARIANTS build produces several CPU libraries
# (e.g., libggml-cpu-android_armv8.0_1.so, libggml-cpu-android_armv8.2_1.so, ...);
# the runtime loader picks the best variant for the device's CPU capabilities.
# A build without variants ships a single libggml-cpu.so instead, which is
# still copied for backward compatibility.
file(GLOB CPU_VARIANT_LIBS "${JNI_LIBS_DIR}/${ANDROID_ABI}/libggml-cpu-*.so")

if(CPU_VARIANT_LIBS)
  # Variant build: report how many were found, then schedule one copy step per library.
  list(LENGTH CPU_VARIANT_LIBS CPU_VARIANT_COUNT)
  message(STATUS "Found ${CPU_VARIANT_COUNT} CPU variant libraries (GGML_CPU_ALL_VARIANTS enabled)")

  foreach(CPU_VARIANT_LIB IN LISTS CPU_VARIANT_LIBS)
    get_filename_component(CPU_VARIANT_LIB_NAME "${CPU_VARIANT_LIB}" NAME)
    add_custom_command(TARGET RNLlamaCpp POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E copy_if_different
        "${CPU_VARIANT_LIB}"
        "$<TARGET_FILE_DIR:RNLlamaCpp>/${CPU_VARIANT_LIB_NAME}"
      COMMENT "Copying CPU variant library ${CPU_VARIANT_LIB_NAME} to build output directory"
    )
  endforeach()
elseif(EXISTS "${JNI_LIBS_DIR}/${ANDROID_ABI}/libggml-cpu.so")
  # Single-library build: copy the one CPU backend.
  message(STATUS "Found single libggml-cpu.so (backward compatibility mode)")
  add_custom_command(TARGET RNLlamaCpp POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_if_different
      "${JNI_LIBS_DIR}/${ANDROID_ABI}/libggml-cpu.so"
      "$<TARGET_FILE_DIR:RNLlamaCpp>/libggml-cpu.so"
    COMMENT "Copying libggml-cpu.so (single CPU backend - backward compatibility)"
  )
else()
  # Neither layout is present for this ABI; warn at configure time.
  message(WARNING "No CPU backend libraries found in ${JNI_LIBS_DIR}/${ANDROID_ABI}/ - CPU backend will not work!")
endif()
|
|
246
268
|
|
|
247
269
|
# Also copy any optional GPU libraries if they exist
|
|
248
270
|
if(EXISTS ${JNI_LIBS_DIR}/${ANDROID_ABI}/libggml-vulkan.so)
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/PureCppImpl.cpp
CHANGED
|
@@ -45,6 +45,129 @@
|
|
|
45
45
|
|
|
46
46
|
namespace facebook::react {
|
|
47
47
|
|
|
48
|
+
// Registers a ggml CPU backend on Android (no-op on every other platform).
//
// On Android, ggml_backend_load_best() uses filesystem iteration which doesn't work
// with APK-packaged libraries, so the candidate libraries are opened here with
// dlopen() using just the file name - Android's linker finds them in the APK.
//
// Selection:
//   1. Every known CPU variant library is opened and asked for its score via
//      "ggml_backend_score". Only the highest-scoring variant (score > 0) that also
//      exports "ggml_backend_init" is kept; all other handles are closed.
//   2. If no variant could be registered, the single "libggml-cpu.so" is tried, which
//      is the layout used by builds without CPU variants.
//
// Does nothing when a backend named "CPU" is already registered.
static void load_android_cpu_backends() {
#ifdef __ANDROID__
    // Skip if CPU backend is already registered
    if (ggml_backend_reg_by_name("CPU")) {
        return;
    }

    using backend_init_fn_t = ggml_backend_reg_t (*)();
    using backend_score_fn_t = int (*)();

    // Variant libraries, listed from most advanced to baseline.
    static const char* const cpu_variants[] = {
        "libggml-cpu-android_armv8.6_1.so",  // DOTPROD + FP16 + MATMUL_INT8
        "libggml-cpu-android_armv8.2_2.so",  // DOTPROD + FP16
        "libggml-cpu-android_armv8.2_1.so",  // DOTPROD
        "libggml-cpu-android_armv8.0_1.so",  // Baseline (emulator compatible)
    };

    int best_score = 0;
    void* best_handle = nullptr;
    backend_init_fn_t best_init = nullptr;

    // Score all variants and keep only the best usable one open.
    for (const char* variant_name : cpu_variants) {
        void* handle = dlopen(variant_name, RTLD_LAZY | RTLD_LOCAL);
        if (!handle) {
            continue;  // variant not packaged for this build/ABI
        }

        const auto score_fn = reinterpret_cast<backend_score_fn_t>(dlsym(handle, "ggml_backend_score"));
        const auto init_fn = reinterpret_cast<backend_init_fn_t>(dlsym(handle, "ggml_backend_init"));

        // A variant without an init entry point can never be registered, so it must not
        // displace a usable candidate; treat it like an incompatible (score 0) variant.
        const int score = (score_fn && init_fn) ? score_fn() : 0;

        if (score > best_score) {
            if (best_handle) {
                dlclose(best_handle);  // previous best has been superseded
            }
            best_score = score;
            best_handle = handle;
            best_init = init_fn;
        } else {
            dlclose(handle);  // not better than the current best
        }
    }

    // Register the best variant if one was found (best_init is non-null whenever
    // best_handle is, because candidates without an init function score 0 above).
    if (best_handle) {
        ggml_backend_reg_t cpu_backend = best_init();
        if (cpu_backend) {
            ggml_backend_register(cpu_backend);
            // The handle is intentionally left open: the registered backend's code
            // lives inside this library.
            return;
        }
        dlclose(best_handle);
    }

    // Fallback: builds without CPU variants ship a single libggml-cpu.so.
    void* base_handle = dlopen("libggml-cpu.so", RTLD_LAZY | RTLD_LOCAL);
    if (!base_handle) {
        return;
    }

    const auto base_score_fn = reinterpret_cast<backend_score_fn_t>(dlsym(base_handle, "ggml_backend_score"));
    const auto base_init_fn = reinterpret_cast<backend_init_fn_t>(dlsym(base_handle, "ggml_backend_init"));

    // The score function is optional here; when present, a score of 0 means the
    // library reports itself as unsupported on this device.
    const bool base_supported = base_init_fn && (!base_score_fn || base_score_fn() > 0);

    ggml_backend_reg_t base_backend = base_supported ? base_init_fn() : nullptr;
    if (base_backend) {
        ggml_backend_register(base_backend);  // handle stays open, as above
    } else {
        dlclose(base_handle);
    }
#endif
}
|
|
115
|
+
|
|
116
|
+
#ifdef __ANDROID__
// Opens `library_name` with dlopen(), calls its "ggml_backend_init" entry point and
// registers the returned backend, unless a backend called `registry_name` is already
// registered. The library handle is closed again whenever nothing was registered
// (missing entry point or null result), so failed attempts do not leak handles.
static void register_android_backend_library(const char* registry_name, const char* library_name) {
    if (ggml_backend_reg_by_name(registry_name)) {
        return;  // avoid duplicate registration
    }

    void* handle = dlopen(library_name, RTLD_LAZY | RTLD_LOCAL);
    if (!handle) {
        return;  // library not packaged or not loadable on this device
    }

    using backend_init_fn_t = ggml_backend_reg_t (*)();
    const auto init_fn = reinterpret_cast<backend_init_fn_t>(dlsym(handle, "ggml_backend_init"));

    ggml_backend_reg_t backend = init_fn ? init_fn() : nullptr;
    if (backend) {
        ggml_backend_register(backend);
        // Handle intentionally left open: the registered backend's code lives in it.
        return;
    }

    dlclose(handle);
}
#endif

// Manually loads all Android backends (no-op on every other platform).
//
// On Android, ggml_backend_load_best() uses filesystem iteration which doesn't work
// with APK-packaged libraries, so each backend library is opened with dlopen() using
// just the library name - Android's linker finds them in the APK.
//
// Load order: Hexagon ("HTP"), OpenCL, Vulkan, then the CPU backend.
static void load_android_backends() {
#ifdef __ANDROID__
    // Hexagon first (Snapdragon DSP) - more performant than Vulkan on Snapdragon devices
    register_android_backend_library("HTP", "libggml-hexagon.so");

    // OpenCL backend
    register_android_backend_library("OpenCL", "libggml-opencl.so");

    // Vulkan backend (disabled by default on Android due to emulator crashes, but try anyway)
    register_android_backend_library("Vulkan", "libggml-vulkan.so");

    // CPU backend (scoring system selects the best compatible variant)
    load_android_cpu_backends();
#endif
}
|
|
170
|
+
|
|
48
171
|
// Factory method implementation
|
|
49
172
|
std::shared_ptr<TurboModule> PureCppImpl::create(std::shared_ptr<CallInvoker> jsInvoker) {
|
|
50
173
|
return std::make_shared<PureCppImpl>(std::move(jsInvoker));
|
|
@@ -97,48 +220,20 @@ jsi::Value PureCppImpl::loadLlamaModelInfo(jsi::Runtime &runtime, jsi::String mo
|
|
|
97
220
|
|
|
98
221
|
// Load all available backends (CPU is dynamically loaded when GGML_BACKEND_DL is enabled)
|
|
99
222
|
// With GGML_BACKEND_DL=ON, ALL backends (CPU + GPU) are dynamically loaded
|
|
100
|
-
//
|
|
101
|
-
//
|
|
223
|
+
// When GGML_CPU_ALL_VARIANTS is enabled, CPU backend variants are:
|
|
224
|
+
// libggml-cpu-android_armv8.0_1.so (baseline - emulator compatible)
|
|
225
|
+
// libggml-cpu-android_armv8.2_1.so (DOTPROD)
|
|
226
|
+
// libggml-cpu-android_armv8.2_2.so (DOTPROD + FP16_VECTOR_ARITHMETIC)
|
|
227
|
+
// libggml-cpu-android_armv8.6_1.so (DOTPROD + FP16_VECTOR_ARITHMETIC + MATMUL_INT8)
|
|
228
|
+
// GPU backends are in libggml-opencl.so, libggml-vulkan.so, libggml-hexagon.so
|
|
229
|
+
// On Android, manually load all backends since filesystem iteration doesn't work
|
|
230
|
+
// with APK-packaged libraries. ggml_backend_load_all() will skip already loaded backends.
|
|
102
231
|
#ifdef __ANDROID__
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
if (!ggml_backend_reg_by_name("CPU")) {
|
|
106
|
-
void* cpu_handle = dlopen("libggml-cpu.so", RTLD_LAZY | RTLD_LOCAL);
|
|
107
|
-
if (cpu_handle) {
|
|
108
|
-
typedef ggml_backend_reg_t (*backend_init_fn_t)();
|
|
109
|
-
backend_init_fn_t backend_init = (backend_init_fn_t)dlsym(cpu_handle, "ggml_backend_init");
|
|
110
|
-
if (backend_init) {
|
|
111
|
-
ggml_backend_reg_t cpu_backend = backend_init();
|
|
112
|
-
if (cpu_backend) {
|
|
113
|
-
ggml_backend_register(cpu_backend);
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
// Load Hexagon backend (Snapdragon DSP) - more performant than Vulkan on Snapdragon devices
|
|
120
|
-
// Load before other GPU backends to give it priority
|
|
121
|
-
// Check if already registered to avoid duplicate registration
|
|
122
|
-
if (!ggml_backend_reg_by_name("HTP")) {
|
|
123
|
-
void* hexagon_handle = dlopen("libggml-hexagon.so", RTLD_LAZY | RTLD_LOCAL);
|
|
124
|
-
if (hexagon_handle) {
|
|
125
|
-
typedef ggml_backend_reg_t (*backend_init_fn_t)();
|
|
126
|
-
backend_init_fn_t backend_init = (backend_init_fn_t)dlsym(hexagon_handle, "ggml_backend_init");
|
|
127
|
-
if (backend_init) {
|
|
128
|
-
ggml_backend_reg_t hexagon_backend = backend_init();
|
|
129
|
-
if (hexagon_backend) {
|
|
130
|
-
ggml_backend_register(hexagon_backend);
|
|
131
|
-
}
|
|
132
|
-
}
|
|
133
|
-
}
|
|
134
|
-
}
|
|
232
|
+
load_android_backends();
|
|
233
|
+
#endif
|
|
135
234
|
|
|
136
|
-
// Load
|
|
137
|
-
// ggml_backend_load_all() will skip backends that are already loaded
|
|
138
|
-
ggml_backend_load_all();
|
|
139
|
-
#else
|
|
235
|
+
// Load any remaining backends (ggml_backend_load_all will skip already loaded ones)
|
|
140
236
|
ggml_backend_load_all();
|
|
141
|
-
#endif
|
|
142
237
|
|
|
143
238
|
// Verify at least CPU backend was loaded
|
|
144
239
|
if (ggml_backend_reg_count() == 0) {
|
|
@@ -389,26 +484,20 @@ jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
|
|
|
389
484
|
|
|
390
485
|
// Load all available backends (CPU is dynamically loaded when GGML_BACKEND_DL is enabled)
|
|
391
486
|
// With GGML_BACKEND_DL=ON, ALL backends (CPU + GPU) are dynamically loaded
|
|
392
|
-
//
|
|
487
|
+
// When GGML_CPU_ALL_VARIANTS is enabled, CPU backend variants are:
|
|
488
|
+
// libggml-cpu-android_armv8.0_1.so (baseline - emulator compatible)
|
|
489
|
+
// libggml-cpu-android_armv8.2_1.so (DOTPROD)
|
|
490
|
+
// libggml-cpu-android_armv8.2_2.so (DOTPROD + FP16_VECTOR_ARITHMETIC)
|
|
491
|
+
// libggml-cpu-android_armv8.6_1.so (DOTPROD + FP16_VECTOR_ARITHMETIC + MATMUL_INT8)
|
|
492
|
+
// GPU backends are in libggml-opencl.so, libggml-vulkan.so, libggml-hexagon.so
|
|
493
|
+
// On Android, manually load all backends since filesystem iteration doesn't work
|
|
494
|
+
// with APK-packaged libraries. ggml_backend_load_all() will skip already loaded backends.
|
|
393
495
|
#ifdef __ANDROID__
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
if (cpu_handle) {
|
|
397
|
-
typedef ggml_backend_reg_t (*backend_init_fn_t)();
|
|
398
|
-
backend_init_fn_t backend_init = (backend_init_fn_t)dlsym(cpu_handle, "ggml_backend_init");
|
|
399
|
-
if (backend_init) {
|
|
400
|
-
ggml_backend_reg_t cpu_backend = backend_init();
|
|
401
|
-
if (cpu_backend) {
|
|
402
|
-
ggml_backend_register(cpu_backend);
|
|
403
|
-
}
|
|
404
|
-
}
|
|
405
|
-
}
|
|
496
|
+
load_android_backends();
|
|
497
|
+
#endif
|
|
406
498
|
|
|
407
|
-
// Load
|
|
408
|
-
ggml_backend_load_all();
|
|
409
|
-
#else
|
|
499
|
+
// Load other backends (OpenCL, Vulkan, etc.) - ggml_backend_load_all will skip already loaded backends
|
|
410
500
|
ggml_backend_load_all();
|
|
411
|
-
#endif
|
|
412
501
|
|
|
413
502
|
// Verify at least CPU backend was loaded
|
|
414
503
|
if (ggml_backend_reg_count() == 0) {
|
|
@@ -549,12 +638,30 @@ jsi::Value PureCppImpl::initLlama(jsi::Runtime &runtime, jsi::Object options) {
|
|
|
549
638
|
// Now assign to the context
|
|
550
639
|
selfPtr->rn_ctx_->params = rn_params;
|
|
551
640
|
|
|
552
|
-
|
|
641
|
+
// Initialize chat templates (matches server.cpp approach)
|
|
642
|
+
// common_chat_templates_init already has try-catch internally for template parsing errors,
|
|
643
|
+
// but exceptions can escape from chat_template constructor during capability detection.
|
|
644
|
+
// We catch all exceptions (not just std::exception) to handle any edge cases.
|
|
553
645
|
try {
|
|
646
|
+
selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, params.chat_template);
|
|
647
|
+
|
|
648
|
+
// Validate template by trying to format an example (catches runtime errors like null lstrip)
|
|
649
|
+
// This is optional - if it fails, we still use the template anyway (it might work in practice)
|
|
650
|
+
try {
|
|
554
651
|
common_chat_format_example(selfPtr->rn_ctx_->chat_templates.get(), params.use_jinja, params.default_template_kwargs);
|
|
555
|
-
|
|
556
|
-
//
|
|
652
|
+
} catch (...) {
|
|
653
|
+
// Template validation failed, but continue anyway - the template might work in practice
|
|
654
|
+
// This preserves backward compatibility for models that were working before
|
|
655
|
+
}
|
|
656
|
+
} catch (...) {
|
|
657
|
+
// Template initialization failed - fallback to chatml (matches server.cpp behavior)
|
|
658
|
+
// Catch all exceptions (not just std::exception) to handle any edge cases
|
|
659
|
+
try {
|
|
557
660
|
selfPtr->rn_ctx_->chat_templates = common_chat_templates_init(selfPtr->rn_ctx_->model, "chatml");
|
|
661
|
+
} catch (...) {
|
|
662
|
+
// Even chatml failed - this should never happen, but handle it gracefully
|
|
663
|
+
// The model will still load, but chat templates won't work
|
|
664
|
+
}
|
|
558
665
|
}
|
|
559
666
|
|
|
560
667
|
// Schedule success callback on JS thread to create JSI objects
|
package/cpp/rn-completion.cpp
CHANGED
|
@@ -379,19 +379,46 @@ CompletionResult run_chat_completion(
|
|
|
379
379
|
chat_msgs = common_chat_msgs_parse_oaicompat(data["messages"]);
|
|
380
380
|
}
|
|
381
381
|
|
|
382
|
-
// Apply template
|
|
382
|
+
// Apply template (matches server.cpp oaicompat_chat_params_parse approach)
|
|
383
383
|
common_chat_templates_inputs template_inputs;
|
|
384
384
|
template_inputs.messages = chat_msgs;
|
|
385
385
|
template_inputs.add_generation_prompt = true;
|
|
386
386
|
template_inputs.use_jinja = rn_ctx->params.use_jinja;
|
|
387
|
-
|
|
388
|
-
|
|
387
|
+
template_inputs.reasoning_format = rn_ctx->params.reasoning_format;
|
|
388
|
+
|
|
389
|
+
// Set chat_template_kwargs from params (matches server.cpp line 712)
|
|
390
|
+
template_inputs.chat_template_kwargs = rn_ctx->params.default_template_kwargs;
|
|
391
|
+
|
|
392
|
+
// Merge any chat_template_kwargs from request body (if present in future)
|
|
393
|
+
// For now, we use the defaults from params
|
|
394
|
+
|
|
395
|
+
// Parse enable_thinking from chat_template_kwargs (matches server.cpp lines 718-725)
|
|
396
|
+
auto enable_thinking_kwarg = template_inputs.chat_template_kwargs.find("enable_thinking");
|
|
397
|
+
if (enable_thinking_kwarg != template_inputs.chat_template_kwargs.end()) {
|
|
398
|
+
const std::string& value = enable_thinking_kwarg->second;
|
|
399
|
+
if (value == "true") {
|
|
400
|
+
template_inputs.enable_thinking = true;
|
|
401
|
+
} else if (value == "false") {
|
|
402
|
+
template_inputs.enable_thinking = false;
|
|
403
|
+
}
|
|
404
|
+
// else: use default (true)
|
|
405
|
+
}
|
|
389
406
|
|
|
390
407
|
// Add grammar if present in options
|
|
391
408
|
if (!options.grammar.empty()) {
|
|
392
409
|
template_inputs.grammar = options.grammar;
|
|
393
410
|
}
|
|
394
411
|
|
|
412
|
+
// Parse json_schema if present (matches server.cpp line 696)
|
|
413
|
+
if (data.contains("json_schema") && !data["json_schema"].is_null()) {
|
|
414
|
+
template_inputs.json_schema = data["json_schema"].dump();
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
// Check for conflicting grammar and json_schema (matches server.cpp lines 570-572)
|
|
418
|
+
if (!template_inputs.json_schema.empty() && !template_inputs.grammar.empty()) {
|
|
419
|
+
throw std::runtime_error("Cannot use both json_schema and grammar");
|
|
420
|
+
}
|
|
421
|
+
|
|
395
422
|
// Parse tools if present
|
|
396
423
|
if (data.contains("tools") && !data["tools"].empty()) {
|
|
397
424
|
template_inputs.tools = common_chat_tools_parse_oaicompat(data["tools"]);
|
|
@@ -407,8 +434,20 @@ CompletionResult run_chat_completion(
|
|
|
407
434
|
? data["tool_choice"].get<std::string>()
|
|
408
435
|
: data["tool_choice"].dump());
|
|
409
436
|
}
|
|
437
|
+
|
|
438
|
+
// Parse parallel_tool_calls if present (matches server.cpp line 699)
|
|
439
|
+
if (data.contains("parallel_tool_calls")) {
|
|
440
|
+
template_inputs.parallel_tool_calls = data["parallel_tool_calls"].get<bool>();
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
// Check for conflicting tools and grammar (matches server.cpp lines 703-706)
|
|
444
|
+
if (!template_inputs.tools.empty() && template_inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
|
|
445
|
+
if (!template_inputs.grammar.empty()) {
|
|
446
|
+
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
|
|
447
|
+
}
|
|
448
|
+
}
|
|
410
449
|
|
|
411
|
-
// Apply template
|
|
450
|
+
// Apply template (matches server.cpp approach - no try-catch, exceptions propagate to outer handler)
|
|
412
451
|
const auto& chat_params = common_chat_templates_apply(rn_ctx->chat_templates.get(), template_inputs);
|
|
413
452
|
|
|
414
453
|
CompletionOptions cmpl_options = options;
|