RubyGems - llama_cpp - Versions diffs - 0.0.1 → 0.0.2 - Mend

llama_cpp 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +2 -0
data/ext/llama_cpp/src/ggml.c +354 -51
data/ext/llama_cpp/src/ggml.h +6 -1
data/ext/llama_cpp/src/llama.cpp +210 -259
data/ext/llama_cpp/src/llama.h +2 -2
data/lib/llama_cpp/version.rb +2 -2
data/lib/llama_cpp.rb +3 -2
metadata +1 -1

data/ext/llama_cpp/src/llama.cpp CHANGED Viewed

@@ -12,6 +12,19 @@
 #include <cassert>
 #include <cstring>
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#else
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#endif
+#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
+#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -142,6 +155,10 @@ struct llama_model {
     // the model memory buffer
     std::vector<uint8_t> buf;
+    // model memory mapped file
+    void * mm_addr = NULL;
+    uint64_t mm_length = 0;
     // tensors
     int n_loaded;
     std::unordered_map<std::string, struct ggml_tensor *> tensors;
@@ -165,6 +182,7 @@ struct llama_context {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
+    bool has_evaluated_once = false;
     int64_t t_sample_us = 0;
     int64_t t_eval_us   = 0;
@@ -206,7 +224,7 @@ struct llama_context {
         }
         if (buf_last >= 0) {
-            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
+            buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
         }
         buf_last = i;
@@ -246,6 +264,7 @@ static bool kv_cache_init(
     struct ggml_init_params params;
     params.mem_size   = cache.buf.size();
     params.mem_buffer = cache.buf.data();
+    params.no_alloc   = false;
     cache.ctx = ggml_init(params);
@@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
 // model loading
 //
+static void *mmap_file(const char *fname, uint64_t *mm_length) {
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+    HANDLE hFile = CreateFileA(fname,
+                               GENERIC_READ,
+                               FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                               NULL,
+                               OPEN_EXISTING,
+                               FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
+                               NULL);
+    if (hFile == INVALID_HANDLE_VALUE) return 0;
+    LARGE_INTEGER fileSize;
+    fileSize.QuadPart = -1;
+    GetFileSizeEx(hFile, &fileSize);
+    int64_t length = fileSize.QuadPart;
+    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+    CloseHandle(hFile);
+    if (!hMapping) return 0;
+    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+    CloseHandle(hMapping);
+    if (!addr) return 0;
+#else
+    int fd = open(fname, O_RDONLY);
+    if (fd == -1) return 0;
+    int64_t length = lseek(fd, 0, SEEK_END);
+    void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
+    close(fd);
+    if (addr == MAP_FAILED) return 0;
+#endif
+    *mm_length = length;
+    return addr;
+}
+static void munmap_file(void * addr, size_t length) {
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+    UnmapViewOfFile(addr);
+#else
+    munmap(addr, length);
+#endif
+}
+static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
+    fprintf(stderr,
+            "%s: invalid model file (bad magic [got %#x want %#x])\n"
+            "\tyou most likely need to regenerate your ggml files\n"
+            "\tthe benefit is you'll get 10-100x faster load times\n"
+            "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
+            "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
+            "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
+            path, got, want);
+    return false;
+}
 static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
@@ -299,22 +370,24 @@ static bool llama_model_load(
         void *progress_callback_user_data) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-    const int64_t t_start_us = ggml_time_us();
-    lctx.t_start_us = t_start_us;
-    std::vector<char> f_buf(1024*1024);
+    lctx.t_start_us = ggml_time_us();
     auto & model = lctx.model;
     auto & vocab = lctx.vocab;
     auto fin = std::ifstream(fname, std::ios::binary);
-    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;
     }
+    std::vector<char> f_buf(1024*1024);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+    fin.seekg(0, fin.end);
+    const size_t file_size = fin.tellg();
+    fin.seekg(0);
     // verify magic
     {
         uint32_t magic;
@@ -325,8 +398,7 @@ static bool llama_model_load(
             return false;
         }
         if (magic != LLAMA_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
+            return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
         }
         uint32_t format_version;
@@ -449,43 +521,24 @@ static bool llama_model_load(
                 }
     }
+    // map model into memory
+    char *mm_addr = NULL;
+    model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
+    if (model.mm_addr == NULL) {
+        fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+    mm_addr = (char *)model.mm_addr;
+    fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
     auto & ctx = model.ctx;
     size_t ctx_size = 0;
     {
-        const auto & hparams = model.hparams;
-        const int n_embd  = hparams.n_embd;
+        const auto &hparams = model.hparams;
         const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
         ctx_size += (5 + 10*n_layer)*256; // object overhead
-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
     }
     // print memory requirements
@@ -495,6 +548,7 @@ static bool llama_model_load(
         // this is the total memory required to run the inference
         const size_t mem_required =
             ctx_size +
+            model.mm_length +
             MEM_REQ_SCRATCH0.at(model.type) +
             MEM_REQ_SCRATCH1.at(model.type) +
             MEM_REQ_EVAL.at    (model.type);
@@ -514,6 +568,7 @@ static bool llama_model_load(
         struct ggml_init_params params = {
             /*.mem_size   =*/ lctx.model.buf.size(),
             /*.mem_buffer =*/ lctx.model.buf.data(),
+            /*.no_alloc   =*/ true,
         };
         model.ctx = ggml_init(params);
@@ -576,234 +631,106 @@ static bool llama_model_load(
         }
     }
-    const size_t file_offset = fin.tellg();
-    fin.close();
     std::vector<uint8_t> tmp;
     if (progress_callback) {
         progress_callback(0.0, progress_callback_user_data);
     }
-    for (int i = 0; i < n_parts; ++i) {
-        const int part_id = i;
-        //const int part_id = n_parts - i - 1;
+    fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
-        std::string fname_part = fname;
-        if (i > 0) {
-            fname_part += "." + std::to_string(i);
-        }
-        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+    // load weights
+    {
+        size_t total_size = 0;
+        model.n_loaded = 0;
-        fin = std::ifstream(fname_part, std::ios::binary);
-        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ftype;
-        fin.seekg(0, fin.end);
-        const size_t file_size = fin.tellg();
+            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-        fin.seekg(file_offset);
+            if (fin.eof()) {
+                break;
+            }
-        // load weights
-        {
-            size_t total_size = 0;
+            int32_t nelements = 1;
+            int32_t ne[2] = { 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                nelements *= ne[i];
+            }
-            model.n_loaded = 0;
+            std::string name(length, 0);
+            fin.read(&name[0], length);
-            fprintf(stderr, "%s: ", __func__);
+            if (model.tensors.find(name.data()) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                return false;
+            }
-            while (true) {
-                int32_t n_dims;
-                int32_t length;
-                int32_t ftype;
+            auto tensor = model.tensors[name.data()];
-                fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-                fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-                fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            if (ggml_nelements(tensor) != nelements) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                return false;
+            }
+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                return false;
+            }
+            if (0) {
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
+            }
-                if (fin.eof()) {
+            switch (ftype) {
+                case 0:  // f32
+                case 1:  // f16
                     break;
-                }
-                int32_t nelements = 1;
-                int32_t ne[2] = { 1, 1 };
-                for (int i = 0; i < n_dims; ++i) {
-                    fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                    nelements *= ne[i];
-                }
-                std::string name(length, 0);
-                fin.read(&name[0], length);
-                if (model.tensors.find(name.data()) == model.tensors.end()) {
-                    fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+                case 2:  // q4_0
+                case 3:  // q4_1
+                    assert(ne[0] % 64 == 0);
+                    break;
+                default:
+                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
                     return false;
-                }
-                // split_type = 0: split by columns
-                // split_type = 1: split by rows
-                int split_type = 0;
-                // split_type = 0:
-                // regex:
-                //   - tok_embeddings.*
-                //   - layers.*.attention.wo.weight
-                //   - layers.*.feed_forward.w2.weight
-                // split_type = 1:
-                // regex:
-                //   - output.*
-                //   - layers.*.attention.wq.weight
-                //   - layers.*.attention.wk.weight
-                //   - layers.*.attention.wv.weight
-                //   - layers.*.feed_forward.w1.weight
-                //   - layers.*.feed_forward.w3.weight
-                if (name.find("tok_embeddings") != std::string::npos) {
-                    split_type = 0;
-                } else if (name.find("layers") != std::string::npos) {
-                    if (name.find("attention.wo.weight") != std::string::npos) {
-                        split_type = 0;
-                    } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
-                        split_type = 0;
-                    } else {
-                        split_type = 1;
-                    }
-                } else if (name.find("output") != std::string::npos) {
-                    split_type = 1;
-                }
-                auto tensor = model.tensors[name.data()];
-                if (n_dims == 1) {
-                    if (ggml_nelements(tensor) != nelements) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                        return false;
-                    }
-                } else {
-                    if (ggml_nelements(tensor)/n_parts != nelements) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                        return false;
-                    }
-                }
-                if (n_dims == 1) {
-                    if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-                        return false;
-                    }
-                } else {
-                    if (split_type == 0) {
-                        if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
-                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                    __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
-                            return false;
-                        }
-                    } else {
-                        if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
-                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                    __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
-                            return false;
-                        }
-                    }
-                }
-                if (0) {
-                    static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                    fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
-                }
-                size_t bpe = 0;
-                switch (ftype) {
-                    case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
-                    case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
-                    case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-                    case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-                    default:
-                            {
-                                fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-                                return false;
-                            }
-                };
-                if (n_dims == 1 || n_parts == 1) {
-                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                                __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                        return false;
-                    }
-                    if (part_id == 0) {
-                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-                    } else {
-                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
-                    }
-                    total_size += ggml_nbytes(tensor);
-                } else {
-                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                                __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
-                        return false;
-                    }
-                    if (split_type == 0) {
-                        const int np0 = ne[0];
-                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-                        assert(row_size == tensor->nb[1]);
-                        for (int i1 = 0; i1 < ne[1]; ++i1) {
-                            const size_t offset_row = i1*row_size;
-                            const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-                            fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
-                        }
-                    } else {
-                        const int np1 = ne[1];
-                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-                        for (int i1 = 0; i1 < ne[1]; ++i1) {
-                            const size_t offset_row = (i1 + part_id*np1)*row_size;
-                            fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
-                        }
-                    }
-                    total_size += ggml_nbytes(tensor)/n_parts;
-                }
-                //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-                model.n_loaded++;
-                // progress
-                if (progress_callback) {
-                    float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
-                    float current_progress = (float(i) + current_file_progress) / float(n_parts);
-                    progress_callback(current_progress, progress_callback_user_data);
-                }
-                if (model.n_loaded % 8 == 0) {
-                    fprintf(stderr, ".");
-                    fflush(stderr);
-                }
-            }
-            fprintf(stderr, " done\n");
+            };
-            fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
-            if (model.n_loaded == 0) {
-                fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
-            } else if (model.n_loaded != (int) model.tensors.size()) {
-                fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
-                return false;
+            // load the tensor data into memory without copying or reading it
+            size_t offset = fin.tellg();
+            size_t tensor_data_size = ggml_nbytes(tensor);
+            offset = (offset + 31) & -32;
+            tensor->data = mm_addr + offset;
+            fin.seekg(offset + tensor_data_size);
+            total_size += tensor_data_size;
+            model.n_loaded++;
+            // progress
+            if (progress_callback) {
+                double current_progress = size_t(fin.tellg()) / double(file_size);
+                progress_callback(current_progress, progress_callback_user_data);
             }
         }
         fin.close();
+        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
+        if (model.n_loaded == 0) {
+            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+        } else if (model.n_loaded != (int) model.tensors.size()) {
+            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+            return false;
+        }
     }
-    lctx.t_load_us = ggml_time_us() - t_start_us;
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
     if (progress_callback) {
         progress_callback(1.0, progress_callback_user_data);
@@ -849,6 +776,7 @@ static bool llama_eval_internal(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size(),
         /*.mem_buffer =*/ buf_compute.data(),
+        /*.no_alloc   =*/ false,
     };
     struct ggml_context * ctx0 = ggml_init(params);
@@ -856,7 +784,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1126,7 +1054,7 @@ struct llama_tokenizer {
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
             sym.text = text.c_str() + offs;
             sym.n = char_len;
             offs += char_len;
@@ -1291,7 +1219,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
     float maxl = -std::numeric_limits<float>::infinity();
     for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
+        maxl = Max(maxl, kv.first);
     }
     // compute probs for the top k tokens
@@ -1385,8 +1313,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             return false;
         }
         if (magic != LLAMA_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
+            return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
         }
         fout.write((char *) &magic, sizeof(magic));
@@ -1444,7 +1371,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             return false;
         }
-        std::string word;
+        std::vector<char> word(32);
         vocab.id_to_token.resize(n_vocab);
         for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
@@ -1452,17 +1379,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             fout.write((char *) &len, sizeof(len));
             word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
+            finp.read ((char *) &word[0], len);
+            fout.write((char *) &word[0], len);
             float score;
             finp.read ((char *) &score, sizeof(score));
             fout.write((char *) &score, sizeof(score));
-            vocab.token_to_id[word] = i;
+            vocab.token_to_id[word.data()] = i;
             auto &tok_score = vocab.id_to_token[i];
-            tok_score.tok = word;
+            tok_score.tok = word.data();
             tok_score.score = score;
         }
     }
@@ -1503,6 +1430,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             std::string name(length, 0);
             finp.read (&name[0], length);
+            {
+                // ensure tensor data is aligned
+                uint64_t offset = finp.tellg();
+                offset = (offset + 31) & -32;
+                finp.seekg(offset);
+            }
             {
                 static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                 printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@@ -1558,6 +1492,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             }
             fout.write(&name[0], length);
+            {
+                // ensure tensor data is aligned
+                uint64_t offset = fout.tellp();
+                offset = (offset + 31) & -32;
+                fout.seekp(offset);
+            }
             if (quantize) {
                 printf("quantizing .. ");
                 work.resize(nelements); // for quantization
@@ -1655,7 +1596,10 @@ struct llama_context * llama_init_from_file(
     if (params.use_mlock) {
         char *err;
-        if (!ggml_mlock(ctx->model.ctx, &err)) {
+        if (!ggml_mlock(ctx->model.ctx,
+                        ctx->model.mm_addr,
+                        ctx->model.mm_length,
+                        &err)) {
             fprintf(stderr, "%s\n", err);
             free(err);
             llama_free(ctx);
@@ -1705,6 +1649,10 @@ void llama_free(struct llama_context * ctx) {
         ggml_free(ctx->model.ctx);
     }
+    if (ctx->model.mm_addr) {
+        munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
+    }
     delete ctx;
 }
@@ -1730,7 +1678,11 @@ int llama_eval(
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
+    // get a more accurate load time, upon first eval
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
     return 0;
 }
@@ -1823,9 +1775,9 @@ llama_token llama_sample_top_p_top_k(
 void llama_print_timings(struct llama_context * ctx) {
     const int64_t t_end_us = ggml_time_us();
-    const int32_t n_sample = std::max(1, ctx->n_sample);
-    const int32_t n_eval   = std::max(1, ctx->n_eval);
-    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+    const int32_t n_sample = Max(1, ctx->n_sample);
+    const int32_t n_eval   = Max(1, ctx->n_eval);
+    const int32_t n_p_eval = Max(1, ctx->n_p_eval);
     fprintf(stderr, "\n");
     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1837,7 +1789,6 @@ void llama_print_timings(struct llama_context * ctx) {
 void llama_reset_timings(struct llama_context * ctx) {
     ctx->t_start_us = ggml_time_us();
     ctx->t_sample_us = ctx->n_sample = 0;
     ctx->t_eval_us   = ctx->n_eval   = 0;
     ctx->t_p_eval_us = ctx->n_p_eval = 0;