cui-llama.rn 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/common.h CHANGED
@@ -4,20 +4,9 @@
 
 #include "llama.h"
 
-#include "sampling.h"
-
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
-#include <cmath>
 #include <string>
 #include <vector>
-#include <random>
-#include <thread>
-#include <set>
-#include <unordered_map>
-#include <tuple>
-#include <functional>
+#include <sstream>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -67,11 +56,20 @@ extern char const *LLAMA_BUILD_TARGET;
 // CPU utils
 //
 
+struct cpu_params {
+    int n_threads = -1;
+    bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool mask_valid = false; // Default: any CPU
+    enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false; // Use strict CPU placement
+    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
 
 //
-// CLI argument parsing
+// Common params
 //
 
 enum llama_example {
@@ -89,27 +87,76 @@ enum llama_example {
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
     LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
 
     LLAMA_EXAMPLE_COUNT,
 };
 
+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE = 0,
+    GPT_SAMPLER_TYPE_TOP_K = 1,
+    GPT_SAMPLER_TYPE_TOP_P = 2,
+    GPT_SAMPLER_TYPE_MIN_P = 3,
+    GPT_SAMPLER_TYPE_TFS_Z = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+    GPT_SAMPLER_TYPE_XTC = 7,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };
 
-struct cpu_params {
-    int n_threads = -1;
-    bool cpumask[LM_GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
-    bool mask_valid = false; // Default: any CPU
-    enum lm_ggml_sched_priority priority = LM_GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
-    bool strict_cpu = false; // Use strict CPU placement
-    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+// sampler parameters
+struct gpt_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float tfs_z = 1.00f; // 1.0 = disabled
+    float xtc_t = 0.0f; // 0.0 = disabled
+    float xtc_p = 0.0f;
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
+
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE,
+        GPT_SAMPLER_TYPE_XTC
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
 };
 
 struct gpt_params {
-    enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
 
     bool vocab_only = false;
     int32_t n_predict = -1; // new tokens to predict
@@ -155,23 +202,23 @@ struct gpt_params {
 
     struct gpt_sampler_params sparams;
 
-    std::string model = ""; // model path
-    std::string model_draft = ""; // draft model for speculative decoding
-    std::string model_alias = "unknown"; // model alias
-    std::string model_url = ""; // model url to download
-    std::string hf_token = ""; // HF token
-    std::string hf_repo = ""; // HF repo
-    std::string hf_file = ""; // HF file
-    std::string prompt = "";
-    std::string prompt_file = ""; // store the external prompt file name
-    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix = ""; // string to prefix user inputs with
-    std::string input_suffix = ""; // string to suffix user inputs with
-    std::string logdir = ""; // directory in which to save YAML log files
-    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file = ""; // file for saving *all* logits
-    std::string rpc_servers = ""; // comma separated list of RPC servers
+    std::string model = ""; // model path // NOLINT
+    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+    std::string model_alias = "unknown"; // model alias // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+    std::string hf_token = ""; // HF token // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+    std::string prompt = ""; // NOLINT
+    std::string prompt_file = ""; // store the external prompt file name // NOLINT
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
+    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+    std::string logits_file = ""; // file for saving *all* logits // NOLINT
+    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -201,7 +248,6 @@ struct gpt_params {
 
     bool kl_divergence = false; // compute KL divergence
 
-    std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
     bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
     bool special = false; // enable special token output
@@ -216,6 +262,8 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics
+    bool ctx_shift = true; // context shift on inifinite text generation
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all = false; // return logits for all tokens in the batch
@@ -232,7 +280,7 @@ struct gpt_params {
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
+    std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -248,15 +296,15 @@ struct gpt_params {
     int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname = "127.0.0.1";
-    std::string public_path = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path = ""; // NOLINT
+    std::string chat_template = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
 
-    std::string ssl_file_key = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT
 
     bool endpoint_slots = true;
     bool endpoint_metrics = false;
@@ -311,91 +359,9 @@ struct gpt_params {
     bool batched_bench_output_jsonl = false;
 };
 
-struct llama_arg {
-    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::vector<const char *> args;
-    const char * value_hint = nullptr; // help text or example for arg value
-    const char * value_hint_2 = nullptr; // for second arg value
-    const char * env = nullptr;
-    std::string help;
-    void (*handler_void) (gpt_params & params) = nullptr;
-    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int) (gpt_params & params, int) = nullptr;
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
-        void (*handler)(gpt_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
-
-    // support 2 values for arg
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-
-    llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
-        this->examples = std::move(examples);
-        return *this;
-    }
-
-    llama_arg & set_env(const char * env) {
-        help = help + "\n(env: " + env + ")";
-        this->env = env;
-        return *this;
-    }
-
-    bool in_example(enum llama_example ex) {
-        return examples.find(ex) != examples.end();
-    }
-
-    bool get_value_from_env(std::string & output) const {
-        if (env == nullptr) return false;
-        char * value = std::getenv(env);
-        if (value) {
-            output = value;
-            return true;
-        }
-        return false;
-    }
-
-    bool has_value_from_env() const {
-        return env != nullptr && std::getenv(env);
-    }
-
-    std::string to_string();
-};
-
-// initialize list of options (arguments) that can be used by the current example
-std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
-// optionally, we can provide "print_usage" to print example usage
-std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
-
-// parse input arguments from CLI
-// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
-
-// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
-void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void gpt_init();
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
@@ -432,6 +398,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
 //
 // Filesystem utils
 //
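
Note on the common.h hunks above: the CLI-parsing surface (llama_arg, gpt_params_parse*, gpt_params_print_usage) moves out of this header, the sampler parameters move in as gpt_sampler_params, and gpt_init() becomes the single setup call. A minimal sketch of how a program might use the new surface, assuming behavior matches the declarations shown in this diff (the model path and parameter values below are placeholders, not taken from the package):

#include <cstdio>
#include "common.h"

int main() {
    gpt_init(); // per the new header comment: initializes logging, prints build info

    gpt_params params;
    params.model        = "/path/to/model.gguf"; // hypothetical path
    params.prompt       = "Hello";
    params.n_predict    = 32;
    params.sparams.temp = 0.8f; // gpt_sampler_params now lives in common.h
    params.sparams.samplers = { GPT_SAMPLER_TYPE_TOP_K, GPT_SAMPLER_TYPE_TEMPERATURE };

    // system info string for the chosen parameters
    std::printf("%s\n", gpt_params_get_system_info(params).c_str());
    return 0;
}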
package/cpp/ggml-impl.h CHANGED
@@ -629,8 +629,16 @@ inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
 #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 
+enum lm_ggml_cgraph_eval_order {
+    LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    LM_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    LM_GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
 // bitset
 
+typedef uint32_t lm_ggml_bitset_t;
+
 static_assert(sizeof(lm_ggml_bitset_t) == 4, "bitset_t constants must be updated");
 #define BITSET_SHR 5 // log2(sizeof(lm_ggml_bitset_t)*8)
 #define BITSET_MASK (sizeof(lm_ggml_bitset_t)*8 - 1)
@@ -656,6 +664,12 @@ static inline void lm_ggml_bitset_clear(lm_ggml_bitset_t * bitset, size_t i) {
 #define LM_GGML_HASHSET_FULL ((size_t)-1)
 #define LM_GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
 
+struct lm_ggml_hash_set {
+    size_t size;
+    lm_ggml_bitset_t * used; // whether or not the keys are in use i.e. set
+    struct lm_ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if lm_ggml_bitset_get(used, i)
+};
+
 struct lm_ggml_hash_set lm_ggml_hash_set_new(size_t size);
 void lm_ggml_hash_set_free(struct lm_ggml_hash_set * hash_set);
 
@@ -745,6 +759,24 @@ static size_t lm_ggml_hash_find_or_insert(struct lm_ggml_hash_set * hash_set, st
     LM_GGML_ABORT("fatal error");
 }
 
+// computation graph
+
+struct lm_ggml_cgraph {
+    int size;
+    int n_nodes;
+    int n_leafs;
+
+    struct lm_ggml_tensor ** nodes;
+    struct lm_ggml_tensor ** grads;
+    struct lm_ggml_tensor ** leafs;
+
+    struct lm_ggml_hash_set visited_hash_set;
+
+    enum lm_ggml_cgraph_eval_order order;
+};
+
+struct lm_ggml_cgraph lm_ggml_graph_view(struct lm_ggml_cgraph * cgraph, int i0, int i1);
+
 #ifdef __cplusplus
 }
 #endif
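
The ggml-impl.h hunks above move lm_ggml_cgraph, lm_ggml_hash_set, the bitset typedef, and the eval-order enum into the internal header and expose lm_ggml_graph_view(). A sketch of the kind of node-range splitting this enables for backends (the chunking helper is hypothetical, and the [i0, i1) reading of lm_ggml_graph_view is an assumption based on ggml's usual convention):

#include "ggml-impl.h"

// split a graph into n_chunks contiguous node ranges, e.g. one per command buffer
static void process_in_chunks(struct lm_ggml_cgraph * gf, int n_chunks) {
    const int n_nodes     = gf->n_nodes;
    const int n_per_chunk = (n_nodes + n_chunks - 1) / n_chunks; // ceiling division
    for (int c = 0; c < n_chunks; ++c) {
        const int i0 = c * n_per_chunk;
        const int i1 = (i0 + n_per_chunk < n_nodes) ? (i0 + n_per_chunk) : n_nodes;
        if (i0 >= i1) break;
        struct lm_ggml_cgraph view = lm_ggml_graph_view(gf, i0, i1); // assumed: shallow view over nodes [i0, i1)
        (void) view; // a backend would encode/execute view.n_nodes nodes here
    }
}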
package/cpp/ggml-metal.m CHANGED
@@ -1,7 +1,7 @@
 #import "ggml-metal.h"
 
+#import "ggml-impl.h"
 #import "ggml-backend-impl.h"
-#import "ggml.h"
 
 #import <Foundation/Foundation.h>
 
@@ -13,13 +13,16 @@
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
 #ifdef LM_GGML_METAL_NDEBUG
+#define LM_GGML_METAL_LOG(...)
 #define LM_GGML_METAL_LOG_INFO(...)
 #define LM_GGML_METAL_LOG_WARN(...)
 #define LM_GGML_METAL_LOG_ERROR(...)
 #else
-#define LM_GGML_METAL_LOG_INFO(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_INFO, __VA_ARGS__)
-#define LM_GGML_METAL_LOG_WARN(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define LM_GGML_METAL_LOG(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_NONE, __VA_ARGS__)
+#define LM_GGML_METAL_LOG_INFO(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define LM_GGML_METAL_LOG_WARN(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_WARN, __VA_ARGS__)
 #define LM_GGML_METAL_LOG_ERROR(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LM_GGML_METAL_LOG_DEBUG(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #endif
 
 #define UNUSED(x) (void)(x)
@@ -882,7 +885,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
-    const int n_nodes = gf->n_nodes;
+    const int n_nodes = gf->n_nodes;
     const int n_cb = ctx->n_cb;
     const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
 
@@ -3039,8 +3042,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
         if (status != MTLCommandBufferStatusCompleted) {
             LM_GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
             if (status == MTLCommandBufferStatusError) {
-                NSString * error_code = [command_buffer error].localizedDescription;
-                LM_GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
+                LM_GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
             }
 
             return LM_GGML_STATUS_FAILED;
@@ -3184,7 +3186,7 @@ static void lm_ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_
 #ifndef LM_GGML_METAL_NDEBUG
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        LM_GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
+        LM_GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
                 __func__,
                 size_aligned / 1024.0 / 1024.0,
                 device.currentAllocatedSize / 1024.0 / 1024.0,
@@ -3192,8 +3194,6 @@ static void lm_ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_
 
         if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
             LM_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-        } else {
-            LM_GGML_METAL_LOG_INFO("\n");
         }
     } else {
         LM_GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
@@ -3225,15 +3225,19 @@ LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_type_a
     ctx->n_buffers = 1;
 
     if (ctx->all_data != NULL) {
-        ctx->buffers[0].data = ctx->all_data;
-        ctx->buffers[0].size = size;
-        ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
-                        length:size_aligned
-                        options:MTLResourceStorageModeShared
-                        deallocator:nil];
+        ctx->buffers[0].data = ctx->all_data;
+        ctx->buffers[0].size = size;
+        ctx->buffers[0].metal = nil;
+
+        if (size_aligned > 0) {
+            ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
+                            length:size_aligned
+                            options:MTLResourceStorageModeShared
+                            deallocator:nil];
+        }
     }
 
-    if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
+    if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
         lm_ggml_backend_metal_free_device();
@@ -3310,14 +3314,17 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
-        ctx->buffers[ctx->n_buffers].data = data;
-        ctx->buffers[ctx->n_buffers].size = size;
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+        ctx->buffers[ctx->n_buffers].metal = nil;
 
-        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+        if (size_aligned > 0) {
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
-        if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
-            return false;
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+                return false;
+            }
         }
 
         lm_ggml_backend_metal_log_allocated_size(device, size_aligned);
@@ -3333,14 +3340,17 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void
         for (size_t i = 0; i < size; i += size_step) {
            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
 
-            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
-            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+            ctx->buffers[ctx->n_buffers].metal = nil;
 
-            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+            if (size_step_aligned > 0) {
+                ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
-            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
-                return false;
+                if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                    LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                    return false;
+                }
            }
 
            lm_ggml_backend_metal_log_allocated_size(device, size_step_aligned);
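
The ggml-metal.m buffer hunks above all apply one pattern: initialize the Metal buffer handle to nil, call newBufferWithBytesNoCopy only when the aligned size is non-zero, and treat a nil handle as an allocation failure only for non-zero sizes. A generic C++ sketch of that guard, for illustration only (map_region and its allocator argument are hypothetical, not part of the package):

#include <cstddef>

template <typename AllocFn>
bool map_region(void * data, size_t size_aligned, AllocFn alloc) {
    void * handle = nullptr;                // analogous to ctx->buffers[i].metal = nil
    if (size_aligned > 0) {
        handle = alloc(data, size_aligned); // zero-sized requests are skipped entirely
        if (handle == nullptr) {
            return false;                   // failure only matters for non-zero sizes
        }
    }
    return true;                            // empty regions are accepted without a backing buffer
}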