cui-llama.rn 1.6.0 → 1.7.0

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
@@ -40,15 +40,18 @@ add_library(rnllama SHARED
  ${SOURCE_DIR}/ggml-alloc.c
  ${SOURCE_DIR}/ggml-backend.cpp
  ${SOURCE_DIR}/ggml-backend-reg.cpp
- ${SOURCE_DIR}/ggml-cpu.c
- ${SOURCE_DIR}/ggml-cpu.cpp
- ${SOURCE_DIR}/ops.cpp
- ${SOURCE_DIR}/unary-ops.cpp
- ${SOURCE_DIR}/binary-ops.cpp
- ${SOURCE_DIR}/vec.cpp
- ${SOURCE_DIR}/ggml-cpu-aarch64.cpp
- ${SOURCE_DIR}/ggml-cpu-quants.c
- ${SOURCE_DIR}/ggml-cpu-traits.cpp
+ ${SOURCE_DIR}/ggml-cpu/amx/amx.cpp
+ ${SOURCE_DIR}/ggml-cpu/amx/mmq.cpp
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu.c
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu.cpp
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu-aarch64.cpp
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu-quants.c
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu-traits.cpp
+ ${SOURCE_DIR}/ggml-cpu/unary-ops.cpp
+ ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
+ ${SOURCE_DIR}/ggml-cpu/sgemm.cpp
+ ${SOURCE_DIR}/ggml-cpu/vec.cpp
+ ${SOURCE_DIR}/ggml-cpu/ops.cpp
  ${SOURCE_DIR}/ggml-metal.m
  ${SOURCE_DIR}/ggml-opt.cpp
  ${SOURCE_DIR}/ggml-threading.cpp
@@ -70,6 +73,7 @@ add_library(rnllama SHARED
  ${SOURCE_DIR}/llama.cpp
  ${SOURCE_DIR}/llama-model.cpp
  ${SOURCE_DIR}/llama-model-loader.cpp
+ ${SOURCE_DIR}/llama-model-saver.cpp
  ${SOURCE_DIR}/llama-mmap.cpp
  ${SOURCE_DIR}/llama-vocab.cpp
  ${SOURCE_DIR}/llama-memory.cpp
@@ -78,13 +82,17 @@ add_library(rnllama SHARED
  ${SOURCE_DIR}/sampling.cpp
  ${SOURCE_DIR}/unicode-data.cpp
  ${SOURCE_DIR}/unicode.cpp
- ${SOURCE_DIR}/sgemm.cpp
  ${SOURCE_DIR}/common.cpp
  ${SOURCE_DIR}/chat.cpp
  ${SOURCE_DIR}/json-schema-to-grammar.cpp
  ${SOURCE_DIR}/minja/minja.hpp
  ${SOURCE_DIR}/minja/chat-template.hpp
  ${SOURCE_DIR}/json.hpp
+ # Multimodal support
+ ${SOURCE_DIR}/tools/mtmd/mtmd.cpp
+ ${SOURCE_DIR}/tools/mtmd/mtmd-audio.cpp
+ ${SOURCE_DIR}/tools/mtmd/clip.cpp
+ ${SOURCE_DIR}/tools/mtmd/mtmd-helper.cpp
  ${SOURCE_DIR}/rn-llama.cpp
  )

@@ -92,6 +100,8 @@ add_library(rnllama SHARED
  target_include_directories(rnllama
  PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp>
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/ggml-cpu>
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/tools/mtmd>
  $<INSTALL_INTERFACE:include>
  )

package/ios/RNLlama.h CHANGED
@@ -1,6 +1,12 @@
  #import <React/RCTEventEmitter.h>
  #import <React/RCTBridgeModule.h>

+ #if RNLLAMA_BUILD_FROM_SOURCE
+ #import "json.hpp"
+ #else
+ #import <rnllama/json.hpp>
+ #endif
+
  // TODO: Use RNLlamaSpec (Need to refactor NSDictionary usage)
  @interface RNLlama : RCTEventEmitter <RCTBridgeModule>

package/ios/RNLlama.mm CHANGED
@@ -108,8 +108,13 @@ RCT_EXPORT_METHOD(getFormattedChat:(double)contextId
  } else {
  resolve([context getFormattedChat:messages withChatTemplate:chatTemplate]);
  }
+ } catch (const nlohmann::json_abi_v3_11_3::detail::parse_error& e) {
+ NSString *errorMessage = [NSString stringWithUTF8String:e.what()];
+ reject(@"llama_error", [NSString stringWithFormat:@"JSON parse error in getFormattedChat: %@", errorMessage], nil);
  } catch (const std::exception& e) { // catch cpp exceptions
  reject(@"llama_error", [NSString stringWithUTF8String:e.what()], nil);
+ } catch (...) {
+ reject(@"llama_error", @"Unknown error in getFormattedChat", nil);
  }
  }

@@ -229,6 +234,7 @@ RCT_EXPORT_METHOD(stopCompletion:(double)contextId

  RCT_EXPORT_METHOD(tokenizeASync:(double)contextId
  text:(NSString *)text
+ imagePaths:(NSArray *)imagePaths
  withResolver:(RCTPromiseResolveBlock)resolve
  withRejecter:(RCTPromiseRejectBlock)reject)
  {
@@ -237,9 +243,13 @@ RCT_EXPORT_METHOD(tokenizeASync:(double)contextId
  reject(@"llama_error", @"Context not found", nil);
  return;
  }
- NSMutableArray *tokens = [context tokenize:text];
- resolve(@{ @"tokens": tokens });
- [tokens release];
+ @try {
+ NSMutableDictionary *result = [context tokenize:text imagePaths:imagePaths];
+ resolve(result);
+ [result release];
+ } @catch (NSException *exception) {
+ reject(@"llama_error", exception.reason, nil);
+ }
  }

  RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(tokenizeSync:(double)contextId
@@ -355,6 +365,75 @@ RCT_EXPORT_METHOD(getLoadedLoraAdapters:(double)contextId
  resolve([context getLoadedLoraAdapters]);
  }

+ RCT_EXPORT_METHOD(initMultimodal:(double)contextId
+ withParams:(NSDictionary *)params
+ withResolver:(RCTPromiseResolveBlock)resolve
+ withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+ RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+ if (context == nil) {
+ reject(@"llama_error", @"Context not found", nil);
+ return;
+ }
+ if ([context isPredicting]) {
+ reject(@"llama_error", @"Context is busy", nil);
+ return;
+ }
+
+ @try {
+ bool success = [context initMultimodal:params];
+ resolve(@(success));
+ } @catch (NSException *exception) {
+ reject(@"llama_cpp_error", exception.reason, nil);
+ }
+ }
+
+ RCT_EXPORT_METHOD(isMultimodalEnabled:(double)contextId
+ withResolver:(RCTPromiseResolveBlock)resolve
+ withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+ RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+ if (context == nil) {
+ reject(@"llama_error", @"Context not found", nil);
+ return;
+ }
+
+ resolve(@([context isMultimodalEnabled]));
+ }
+
+ RCT_EXPORT_METHOD(getMultimodalSupport:(double)contextId
+ withResolver:(RCTPromiseResolveBlock)resolve
+ withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+ RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+ if (context == nil) {
+ reject(@"llama_error", @"Context not found", nil);
+ return;
+ }
+
+ if (![context isMultimodalEnabled]) {
+ reject(@"llama_error", @"Multimodal is not enabled", nil);
+ return;
+ }
+
+ NSDictionary *multimodalSupport = [context getMultimodalSupport];
+ resolve(multimodalSupport);
+ }
+
+ RCT_EXPORT_METHOD(releaseMultimodal:(double)contextId
+ withResolver:(RCTPromiseResolveBlock)resolve
+ withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+ RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+ if (context == nil) {
+ reject(@"llama_error", @"Context not found", nil);
+ return;
+ }
+
+ [context releaseMultimodal];
+ resolve(nil);
+ }
+
  RCT_EXPORT_METHOD(releaseContext:(double)contextId
  withResolver:(RCTPromiseResolveBlock)resolve
  withRejecter:(RCTPromiseRejectBlock)reject)
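The four native methods above (initMultimodal, isMultimodalEnabled, getMultimodalSupport, releaseMultimodal) are the iOS half of the new multimodal surface; they are presumably mirrored on the Android side (RNLlama.java, jni.cpp) and exposed through the TypeScript wrapper (src/index.ts, +212 lines). A rough TypeScript sketch of how an app might drive them — method and option names are taken from the native signatures above, not from the TS source, so treat them as assumptions:

```ts
import { initLlama } from 'cui-llama.rn'

async function setupVision(modelPath: string, mmprojPath: string) {
  // Context options and method names mirror the native methods above;
  // treat them as assumptions rather than the published API.
  const context = await initLlama({ model: modelPath, n_ctx: 4096 })

  // Maps to initMultimodal:withParams: ({ path, use_gpu }); resolves to a boolean
  const ok = await context.initMultimodal({ path: mmprojPath, use_gpu: true })
  if (!ok) throw new Error('Failed to load mmproj')

  // Maps to getMultimodalSupport:, which resolves { vision, audio }
  const support = await context.getMultimodalSupport()
  console.log('vision:', support.vision, 'audio:', support.audio)

  // Maps to releaseMultimodal:; release() then frees the context itself
  await context.releaseMultimodal()
  await context.release()
}
```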
@@ -34,9 +34,13 @@
  - (NSDictionary *)modelInfo;
  - (bool)isModelLoaded;
  - (bool)isPredicting;
+ - (bool)initMultimodal:(NSDictionary *)params;
+ - (NSDictionary *)getMultimodalSupport;
+ - (bool)isMultimodalEnabled;
+ - (void)releaseMultimodal;
  - (NSDictionary *)completion:(NSDictionary *)params onToken:(void (^)(NSMutableDictionary *tokenResult))onToken;
  - (void)stopCompletion;
- - (NSArray *)tokenize:(NSString *)text;
+ - (NSDictionary *)tokenize:(NSString *)text imagePaths:(NSArray *)imagePaths;
  - (NSString *)detokenize:(NSArray *)tokens;
  - (NSDictionary *)embedding:(NSString *)text params:(NSDictionary *)params;
  - (NSDictionary *)getFormattedChatWithJinja:(NSString *)messages
@@ -82,7 +82,7 @@
  BOOL isAsset = [params[@"is_model_asset"] boolValue];
  NSString *path = modelPath;
  if (isAsset) path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
- defaultParams.model = {[path UTF8String]};
+ defaultParams.model.path = [path UTF8String];

  NSString *chatTemplate = params[@"chat_template"];
  if (chatTemplate) {
@@ -106,37 +106,27 @@
  NSString *reasonNoMetal = @"";
  defaultParams.n_gpu_layers = 0;
  #ifdef LM_GGML_USE_METAL
- // Check ggml-metal availability
- NSError * error = nil;
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
- id<MTLLibrary> library = [device
- newLibraryWithSource:@"#include <metal_stdlib>\n"
- "using namespace metal;"
- "typedef matrix<bfloat, 4, 4> bfloat4x4;"
- "kernel void test() { simd_sum(0); }"
- options:nil
- error:&error
- ];
- if (error) {
- reasonNoMetal = [error localizedDescription];
+
+ // Check ggml-metal availability
+ BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+ if (@available(iOS 16.0, tvOS 16.0, *)) {
+ supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
+ }
+ if (!supportsGgmlMetal) {
+ reasonNoMetal = @"Metal is not supported in this device";
  skipGpuDevices = true;
- } else {
- id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
- id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
- if (pipeline == nil) {
- reasonNoMetal = [error localizedDescription];
- skipGpuDevices = true;
- } else {
+ }
+
  #if TARGET_OS_SIMULATOR
- // Use the backend, but no layers because not supported fully on simulator
- defaultParams.n_gpu_layers = 0;
- isMetalEnabled = true;
+ // Use the backend, but no layers because not supported fully on simulator
+ defaultParams.n_gpu_layers = 0;
+ isMetalEnabled = true;
  #else
- defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
- isMetalEnabled = true;
+ defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
+ isMetalEnabled = true;
  #endif
- }
- }
+
  device = nil;
  #else
  reasonNoMetal = @"Metal is not enabled in this build";
@@ -158,6 +148,8 @@
  }
  if (cpu_devs.size() > 0) {
  defaultParams.devices = cpu_devs;
+ defaultParams.n_gpu_layers = 0;
+ isMetalEnabled = false;
  }
  }

@@ -184,6 +176,8 @@

  if (params[@"flash_attn"] && [params[@"flash_attn"] boolValue]) defaultParams.flash_attn = true;

+ if (params[@"ctx_shift"]) defaultParams.ctx_shift = [params[@"ctx_shift"] boolValue];
+
  if (params[@"cache_type_k"]) defaultParams.cache_type_k = rnllama::kv_cache_type_from_str([params[@"cache_type_k"] UTF8String]);
  if (params[@"cache_type_v"]) defaultParams.cache_type_v = rnllama::kv_cache_type_from_str([params[@"cache_type_v"] UTF8String]);

@@ -338,6 +332,30 @@
  return llama->is_predicting;
  }

+ - (bool)initMultimodal:(NSDictionary *)params {
+ NSString *mmproj_path = params[@"path"];
+ BOOL use_gpu = params[@"use_gpu"] ? [params[@"use_gpu"] boolValue] : true;
+ return llama->initMultimodal([mmproj_path UTF8String], use_gpu);
+ }
+
+ - (NSDictionary *)getMultimodalSupport {
+ if (!is_model_loaded) return nil;
+ return @{
+ @"vision": @(llama->isMultimodalSupportVision()),
+ @"audio": @(llama->isMultimodalSupportAudio())
+ };
+ }
+
+ - (bool)isMultimodalEnabled {
+ if (!is_model_loaded) return false;
+ return llama->isMultimodalEnabled();
+ }
+
+ - (void)releaseMultimodal {
+ if (!is_model_loaded) return;
+ llama->releaseMultimodal();
+ }
+
  - (NSDictionary *)getFormattedChatWithJinja:(NSString *)messages
  withChatTemplate:(NSString *)chatTemplate
  withJsonSchema:(NSString *)jsonSchema
@@ -566,8 +584,32 @@
  if (!llama->initSampling()) {
  @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to initialize sampling" userInfo:nil];
  }
+
  llama->beginCompletion();
- llama->loadPrompt();
+ try {
+ // Use the unified loadPrompt function with image paths if available
+ NSArray *imagePaths = params[@"media_paths"];
+ if (imagePaths && [imagePaths count] > 0) {
+ // Multiple image paths
+ std::vector<std::string> media_paths_vector;
+ for (NSString *path in imagePaths) {
+ if ([path isKindOfClass:[NSString class]]) {
+ media_paths_vector.push_back([path UTF8String]);
+ }
+ }
+ llama->loadPrompt(media_paths_vector);
+ } else {
+ llama->loadPrompt({});
+ }
+ } catch (const std::exception &e) {
+ llama->endCompletion();
+ @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
+ }
+
+ if (llama->context_full) {
+ llama->endCompletion();
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Context is full" userInfo:nil];
+ }

  size_t sent_count = 0;
  size_t sent_token_probs_index = 0;
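The completion path now forwards params[@"media_paths"] into the unified loadPrompt() and aborts with "Context is full" when llama->context_full is set; the completion result (see the hunk below) also gains a context_full flag. A hedged TypeScript sketch of the corresponding JS call — the exact wrapper signature in src/index.ts is not shown in this excerpt:

```ts
// Assumes `context` came from initLlama() and initMultimodal() as sketched above.
async function describeImage(context: any, imagePath: string) {
  const result = await context.completion(
    {
      messages: [{ role: 'user', content: 'Describe this image.' }],
      media_paths: [imagePath], // forwarded to the native media_paths handling above
      n_predict: 256,
    },
    (data: any) => {
      console.log(data.token) // streaming callback, one partial token at a time
    },
  )

  // New in this release: the result reports whether the context window filled up.
  if (result.context_full) {
    console.warn('Prompt plus generation exceeded the context window')
  }
  return result.text
}
```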
@@ -628,7 +670,7 @@
  }

  llama_perf_context_print(llama->ctx);
- llama->is_predicting = false;
+ llama->endCompletion();

  const auto timings = llama_perf_context(llama->ctx);

@@ -655,7 +697,7 @@
  }];
  }
  } catch (const std::exception &e) {
- // NSLog(@"Error parsing tool calls: %s", e.what());
+ } catch (...) {
  }
  }

@@ -668,6 +710,7 @@
  result[@"tokens_predicted"] = @(llama->num_tokens_predicted);
  result[@"tokens_evaluated"] = @(llama->num_prompt_tokens);
  result[@"truncated"] = @(llama->truncated);
+ result[@"context_full"] = @(llama->context_full);
  result[@"stopped_eos"] = @(llama->stopped_eos);
  result[@"stopped_word"] = @(llama->stopped_word);
  result[@"stopped_limit"] = @(llama->stopped_limit);
@@ -691,13 +734,48 @@
  llama->is_interrupted = true;
  }

- - (NSArray *)tokenize:(NSString *)text {
- const std::vector<llama_token> toks = common_tokenize(llama->ctx, [text UTF8String], false);
- NSMutableArray *result = [[NSMutableArray alloc] init];
- for (llama_token tok : toks) {
- [result addObject:@(tok)];
+ - (NSDictionary *)tokenize:(NSString *)text imagePaths:(NSArray *)imagePaths {
+ std::vector<std::string> media_paths_vector;
+ if (imagePaths && [imagePaths count] > 0) {
+ for (NSString *path in imagePaths) {
+ if ([path isKindOfClass:[NSString class]]) {
+ media_paths_vector.push_back([path UTF8String]);
+ }
+ }
+ }
+ try {
+ rnllama::llama_rn_tokenize_result tokenize_result = llama->tokenize([text UTF8String], media_paths_vector);
+
+ NSMutableDictionary *result = [[NSMutableDictionary alloc] init];
+
+ result[@"tokens"] = [NSMutableArray arrayWithCapacity:tokenize_result.tokens.size()];
+ for (llama_token tok : tokenize_result.tokens) {
+ [result[@"tokens"] addObject:@(tok)];
+ }
+ result[@"has_media"] = @(tokenize_result.has_media);
+
+ NSMutableArray *bitmap_hashes = [[NSMutableArray alloc] init];
+ for (std::string hash : tokenize_result.bitmap_hashes) {
+ [bitmap_hashes addObject:[NSString stringWithUTF8String:hash.c_str()]];
+ }
+ result[@"bitmap_hashes"] = bitmap_hashes;
+
+ NSMutableArray *chunk_pos = [[NSMutableArray alloc] init];
+ for (int pos : tokenize_result.chunk_pos) {
+ [chunk_pos addObject:@(pos)];
+ }
+ result[@"chunk_pos"] = chunk_pos;
+
+ NSMutableArray *chunk_pos_media = [[NSMutableArray alloc] init];
+ for (int pos : tokenize_result.chunk_pos_media) {
+ [chunk_pos_media addObject:@(pos)];
+ }
+ result[@"chunk_pos_media"] = chunk_pos_media;
+
+ return result;
+ } catch (const std::exception &e) {
+ @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
  }
- return result;
  }

  - (NSString *)detokenize:(NSArray *)tokens {
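tokenize now returns a dictionary instead of a flat token array: tokens plus has_media, bitmap_hashes, chunk_pos, and chunk_pos_media. A hedged TypeScript sketch of the result shape and a hypothetical call — field meanings are inferred from the keys built above, and the real types live in src/NativeRNLlama.ts, which this excerpt does not show:

```ts
// Result shape inferred from the NSDictionary keys built in the native method above.
interface TokenizeResult {
  tokens: number[]
  has_media: boolean
  bitmap_hashes: string[]
  chunk_pos: number[]
  chunk_pos_media: number[]
}

// Hypothetical call; the real wrapper signature lives in src/index.ts.
async function inspectPrompt(context: any, text: string, mediaPaths: string[]) {
  const res: TokenizeResult = await context.tokenize(text, { media_paths: mediaPaths })
  console.log(`${res.tokens.length} tokens, media attached: ${res.has_media}`)
  return res
}
```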
@@ -734,7 +812,12 @@
  @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to initialize sampling" userInfo:nil];
  }
  llama->beginCompletion();
- llama->loadPrompt();
+ try {
+ llama->loadPrompt({});
+ } catch (const std::exception &e) {
+ llama->endCompletion();
+ @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
+ }
  llama->doCompletion();

  std::vector<float> result = llama->getEmbedding(embdParams);
@@ -751,7 +834,7 @@
  }
  resultDict[@"prompt_tokens"] = promptTokens;

- llama->is_predicting = false;
+ llama->endCompletion();
  return resultDict;
  }

@@ -769,6 +852,11 @@
  @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to load session" userInfo:nil];
  }
  llama->embd.resize(n_token_count_out);
+ // Find LLAMA_TOKEN_NULL in the tokens and resize the array to the index of the null token
+ auto null_token_iter = std::find(llama->embd.begin(), llama->embd.end(), LLAMA_TOKEN_NULL);
+ if (null_token_iter != llama->embd.end()) {
+ llama->embd.resize(std::distance(llama->embd.begin(), null_token_iter));
+ }
  const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
  return @{
  @"tokens_loaded": @(n_token_count_out),
@@ -781,6 +869,11 @@
  @throw [NSException exceptionWithName:@"LlamaException" reason:@"Session path is empty" userInfo:nil];
  }
  std::vector<llama_token> session_tokens = llama->embd;
+ // Find LLAMA_TOKEN_NULL in the tokens and resize the array to the index of the null token
+ auto null_token_iter = std::find(session_tokens.begin(), session_tokens.end(), LLAMA_TOKEN_NULL);
+ if (null_token_iter != session_tokens.end()) {
+ session_tokens.resize(std::distance(session_tokens.begin(), null_token_iter));
+ }
  int default_size = session_tokens.size();
  int save_size = size > 0 && size <= default_size ? size : default_size;
  if (!llama_state_save_file(llama->ctx, [path UTF8String], session_tokens.data(), save_size)) {
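Both session methods above now truncate the token buffer at the first LLAMA_TOKEN_NULL before detokenizing or writing it to disk. A hedged TypeScript sketch of session persistence from JS — saveSession/loadSession follow llama.rn's public API and may differ slightly in this fork:

```ts
// Hypothetical usage; the native code above trims the token list at the
// first LLAMA_TOKEN_NULL before saving or detokenizing it.
async function persistSession(context: any, sessionPath: string) {
  const tokensSaved = await context.saveSession(sessionPath, { tokenSize: -1 })
  console.log('saved tokens:', tokensSaved)

  const { tokens_loaded, prompt } = await context.loadSession(sessionPath)
  console.log('restored', tokens_loaded, 'tokens; prompt preview:', prompt.slice(0, 80))
}
```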
@@ -3,6 +3,7 @@
  #pragma once

  #include "common.h"
+ #include <chrono>
  #include <string>
  #include <vector>
  #include "minja/chat-template.hpp"
@@ -79,6 +80,7 @@ struct common_chat_templates_inputs {
  common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
  bool parallel_tool_calls = false;
  bool extract_reasoning = true;
+ std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  };

  struct common_chat_params {
@@ -6,6 +6,7 @@

  #include <set>
  #include <string>
+ #include <string_view>
  #include <vector>
  #include <sstream>

@@ -77,7 +78,6 @@ enum llama_example {
  LLAMA_EXAMPLE_COMMON,
  LLAMA_EXAMPLE_SPECULATIVE,
  LLAMA_EXAMPLE_MAIN,
- LLAMA_EXAMPLE_INFILL,
  LLAMA_EXAMPLE_EMBEDDING,
  LLAMA_EXAMPLE_PERPLEXITY,
  LLAMA_EXAMPLE_RETRIEVAL,
@@ -87,7 +87,7 @@ enum llama_example {
  LLAMA_EXAMPLE_SERVER,
  LLAMA_EXAMPLE_CVECTOR_GENERATOR,
  LLAMA_EXAMPLE_EXPORT_LORA,
- LLAMA_EXAMPLE_LLAVA,
+ LLAMA_EXAMPLE_MTMD,
  LLAMA_EXAMPLE_LOOKUP,
  LLAMA_EXAMPLE_PARALLEL,
  LLAMA_EXAMPLE_TTS,
@@ -107,6 +107,7 @@ enum common_sampler_type {
  COMMON_SAMPLER_TYPE_XTC = 8,
  COMMON_SAMPLER_TYPE_INFILL = 9,
  COMMON_SAMPLER_TYPE_PENALTIES = 10,
+ COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
  };

  // dimensionality reduction methods, used by cvector-generator
@@ -172,6 +173,7 @@ struct common_params_sampling {
  std::vector<enum common_sampler_type> samplers = {
  COMMON_SAMPLER_TYPE_PENALTIES,
  COMMON_SAMPLER_TYPE_DRY,
+ COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
  COMMON_SAMPLER_TYPE_TOP_K,
  COMMON_SAMPLER_TYPE_TYPICAL_P,
  COMMON_SAMPLER_TYPE_TOP_P,
@@ -336,17 +338,17 @@ struct common_params {
  bool flash_attn = false; // flash attention
  bool no_perf = false; // disable performance metrics
  bool ctx_shift = true; // context shift on inifinite text generation
+ bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
- bool logits_all = false; // return logits for all tokens in the batch
  bool use_mmap = true; // use mmap for faster loads
  bool use_mlock = false; // use mlock to keep model in memory
  bool verbose_prompt = false; // print prompt tokens before generation
  bool display_prompt = true; // print prompt before generation
- bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
  bool no_kv_offload = false; // disable KV offloading
  bool warmup = true; // warmup run
  bool check_tensors = false; // validate tensor data
+ bool no_op_offload = false; // globally disable offload host tensor operations to device

  bool single_turn = false; // single turn chat conversation

@@ -355,8 +357,10 @@ struct common_params {

  common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

- // multimodal models (see examples/llava)
+ // multimodal models (see tools/mtmd)
  struct common_params_model mmproj;
+ bool mmproj_use_gpu = true; // use GPU for multimodal model
+ bool no_mmproj = false; // explicitly disable multimodal model
  std::vector<std::string> image; // path to image file(s)

  // embedding
@@ -379,6 +383,7 @@ struct common_params {
  bool use_jinja = false; // NOLINT
  bool enable_chat_template = true;
  common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+ bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

  std::vector<std::string> api_keys;

@@ -422,13 +427,14 @@ struct common_params {

  bool process_output = false; // collect data for the output tensor
  bool compute_ppl = true; // whether to compute perplexity
+ bool parse_special = false; // whether to parse special tokens during imatrix tokenization

  // cvector-generator params
  int n_pca_batch = 100;
  int n_pca_iterations = 1000;
  dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+ std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+ std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

  bool spm_infill = false; // suffix/prefix/middle pattern for infill

@@ -437,6 +443,11 @@ struct common_params {

  // common params
  std::string out_file; // output filename for all example programs
+ // optional callback for model loading progress and cancellation:
+ // called with a progress value between 0.0 and 1.0.
+ // return false from callback to abort model loading or true to continue
+ llama_progress_callback load_progress_callback = NULL;
+ void * load_progress_callback_user_data = NULL;
  };

  // call once at the start of a program if it uses libcommon
@@ -514,10 +525,9 @@ static bool string_starts_with(const std::string & str,
  return str.rfind(prefix, 0) == 0;
  }

- static bool string_ends_with(const std::string & str,
- const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
- return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
- }
+ // While we wait for C++20's std::string::ends_with...
+ bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+ size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);

  bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
  void string_process_escapes(std::string & input);
@@ -558,6 +568,8 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
  // clear LoRA adapters from context, then apply new list of adapters
  void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

+ std::string get_model_endpoint();
+
  //
  // Batch utils
  //
@@ -624,16 +636,6 @@ std::string common_detokenize(
  const std::vector<llama_token> & tokens,
  bool special = true);

- //
- // KV cache utils
- //
-
- // Dump the KV cache view with the number of sequences per cell.
- void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
- // Dump the KV cache view showing individual sequences in each cell (long output).
- void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
  //
  // Embedding utils
  //
@@ -675,3 +677,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
  const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

  }
+
+ //
+ // training utils
+ //
+
+ lm_ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);