npm - cui-llama.rn - Versions diffs - 1.6.1 → 1.7.0 - Mend

cui-llama.rn 1.6.1 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (196) hide show

package/cpp/ggml-opt.cpp CHANGED Viewed

@@ -28,16 +28,19 @@ struct lm_ggml_opt_dataset {
 };
 struct lm_ggml_opt_context {
-    lm_ggml_backend_sched_t    backend_sched        = nullptr;
-    lm_ggml_cgraph           * allocated_graph      = nullptr;
-    lm_ggml_cgraph           * allocated_graph_copy = nullptr;
-    struct lm_ggml_context   * ctx_static           = nullptr;
-    struct lm_ggml_context   * ctx_static_cpu       = nullptr;
-    struct lm_ggml_context   * ctx_compute          = nullptr;
-    struct lm_ggml_context   * ctx_copy             = nullptr;
-    lm_ggml_backend_buffer_t   buf_static           = nullptr;
-    lm_ggml_backend_buffer_t   buf_static_cpu       = nullptr;
-    std::mt19937            rng;
+    lm_ggml_backend_sched_t       backend_sched        = nullptr;
+    lm_ggml_cgraph              * allocated_graph      = nullptr;
+    lm_ggml_cgraph              * allocated_graph_copy = nullptr;
+    struct lm_ggml_context      * ctx_static           = nullptr;
+    struct lm_ggml_context      * ctx_cpu              = nullptr;
+    struct lm_ggml_context      * ctx_compute          = nullptr;
+    struct lm_ggml_context      * ctx_copy             = nullptr;
+    lm_ggml_backend_buffer_t      buf_static           = nullptr;
+    lm_ggml_backend_buffer_t      buf_cpu              = nullptr;
+    std::mt19937               rng;
+    enum lm_ggml_opt_loss_type    loss_type;
+    enum lm_ggml_opt_build_type   build_type;
+    enum lm_ggml_opt_build_type   build_type_alloc;
     struct lm_ggml_tensor * inputs  = nullptr;
     struct lm_ggml_tensor * outputs = nullptr;
@@ -50,6 +53,11 @@ struct lm_ggml_opt_context {
     struct lm_ggml_cgraph * gf      = nullptr;
     struct lm_ggml_cgraph * gb_grad = nullptr;
     struct lm_ggml_cgraph * gb_opt  = nullptr;
+    bool static_graphs           = false;
+    bool eval_ready              = false;
+    std::vector<struct lm_ggml_tensor *> grad_accs;
+    std::vector<struct lm_ggml_tensor *> grad_m;
+    std::vector<struct lm_ggml_tensor *> grad_v;
     int64_t iter               = 1;
     int32_t opt_period         = 1;
@@ -73,7 +81,13 @@ struct lm_ggml_opt_result {
 // ====== Dataset ======
-lm_ggml_opt_dataset_t lm_ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) {
+lm_ggml_opt_dataset_t lm_ggml_opt_dataset_init(
+        enum lm_ggml_type type_data,
+        enum lm_ggml_type type_label,
+        int64_t        ne_datapoint,
+        int64_t        ne_label,
+        int64_t        ndata,
+        int64_t        ndata_shard) {
     LM_GGML_ASSERT(ne_datapoint >  0);
     LM_GGML_ASSERT(ne_label     >= 0);
     LM_GGML_ASSERT(ndata        >  0);
@@ -92,11 +106,11 @@ lm_ggml_opt_dataset_t lm_ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_
         result->ctx = lm_ggml_init(params);
     }
-    result->data = lm_ggml_new_tensor_2d(result->ctx, LM_GGML_TYPE_F32, ne_datapoint, ndata);
+    result->data = lm_ggml_new_tensor_2d(result->ctx, type_data, ne_datapoint, ndata);
     result->nbs_data = lm_ggml_nbytes(result->data) * ndata_shard/ndata;
     if (ne_label > 0) {
-        result->labels = lm_ggml_new_tensor_2d(result->ctx, LM_GGML_TYPE_F32, ne_label, ndata);
+        result->labels = lm_ggml_new_tensor_2d(result->ctx, type_label, ne_label, ndata);
         result->nbs_labels = lm_ggml_nbytes(result->labels) * ndata_shard/ndata;
     } else {
         result->labels = nullptr;
@@ -119,6 +133,10 @@ void lm_ggml_opt_dataset_free(lm_ggml_opt_dataset_t dataset) {
     delete dataset;
 }
+int64_t lm_ggml_opt_dataset_ndata(lm_ggml_opt_dataset_t dataset) {
+    return dataset->ndata;
+}
 struct lm_ggml_tensor * lm_ggml_opt_dataset_data(lm_ggml_opt_dataset_t dataset) {
     return dataset->data;
 }
@@ -144,6 +162,8 @@ void lm_ggml_opt_dataset_get_batch(lm_ggml_opt_dataset_t dataset, struct lm_ggml
     LM_GGML_ASSERT(   data_batch && lm_ggml_is_contiguous(data_batch));
     LM_GGML_ASSERT(!labels_batch || lm_ggml_is_contiguous(labels_batch));
     LM_GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
+    LM_GGML_ASSERT(                   data_batch->type == dataset->data->type);
+    LM_GGML_ASSERT(!labels_batch || labels_batch->type == dataset->labels->type);
     const size_t nb_data_batch = lm_ggml_nbytes(data_batch);
     LM_GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
@@ -171,6 +191,31 @@ void lm_ggml_opt_dataset_get_batch(lm_ggml_opt_dataset_t dataset, struct lm_ggml
     }
 }
+void lm_ggml_opt_dataset_get_batch_host(lm_ggml_opt_dataset_t dataset, void * data_batch, size_t nb_data_batch, void * labels_batch, int64_t ibatch) {
+    LM_GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
+    LM_GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
+    const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data;
+    LM_GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size()));
+    for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
+        const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch];
+        const char * ptr_data       = (const char *) dataset->data->data + ishard      *dataset->nbs_data;
+        char       * ptr_data_batch = (char       *) data_batch          + ishard_batch*dataset->nbs_data;
+        memcpy(ptr_data_batch, ptr_data, dataset->nbs_data);
+        if (!labels_batch) {
+            continue;
+        }
+        const char * ptr_labels       = (const char *) dataset->labels->data + ishard      *dataset->nbs_labels;
+        char       * ptr_labels_batch = (char       *) labels_batch          + ishard_batch*dataset->nbs_labels;
+        memcpy(ptr_labels_batch, ptr_labels, dataset->nbs_labels);
+    }
+}
 // ====== Model / Context ======
 struct lm_ggml_opt_optimizer_params lm_ggml_opt_get_default_optimizer_params(void * userdata) {
@@ -187,17 +232,18 @@ struct lm_ggml_opt_optimizer_params lm_ggml_opt_get_default_optimizer_params(voi
     return result;
 }
+struct lm_ggml_opt_optimizer_params lm_ggml_opt_get_constant_optimizer_params(void * userdata) {
+    return *((struct lm_ggml_opt_optimizer_params *) userdata);
+}
 struct lm_ggml_opt_params lm_ggml_opt_default_params(
         lm_ggml_backend_sched_t      backend_sched,
-        struct lm_ggml_context     * ctx_compute,
-        struct lm_ggml_tensor      * inputs,
-        struct lm_ggml_tensor      * outputs,
         enum lm_ggml_opt_loss_type   loss_type) {
     return {
         /*backend_sched   =*/ backend_sched,
-        /*ctx_compute     =*/ ctx_compute,
-        /*inputs          =*/ inputs,
-        /*logits          =*/ outputs,
+        /*ctx_compute     =*/ nullptr,
+        /*inputs          =*/ nullptr,
+        /*logits          =*/ nullptr,
         /*loss_type       =*/ loss_type,
         /*build_type      =*/ LM_GGML_OPT_BUILD_TYPE_OPT,
         /*opt_period      =*/ 1,
@@ -266,195 +312,246 @@ static lm_ggml_cgraph * dup_graph(lm_ggml_context * ctx, lm_ggml_cgraph * src) {
     return dst;
 }
-static void lm_ggml_opt_alloc_graph(lm_ggml_opt_context_t opt_ctx, lm_ggml_cgraph * graph) {
-    LM_GGML_ASSERT(graph);
-    if (opt_ctx->allocated_graph == graph) {
-        return;
-    }
+static void lm_ggml_opt_build(lm_ggml_opt_context_t opt_ctx) {
+    LM_GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with lm_ggml_opt_prepare_alloc");
+    LM_GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
-    lm_ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
+    const bool accumulate = opt_ctx->build_type_alloc >= LM_GGML_OPT_BUILD_TYPE_GRAD &&
+        !(opt_ctx->static_graphs && opt_ctx->build_type_alloc == LM_GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);
-    {
-        lm_ggml_init_params params = {
-            /*.mem_size   =*/ lm_ggml_tensor_overhead() * LM_GGML_DEFAULT_GRAPH_SIZE,
-            /*.mem_buffer =*/ nullptr,
-            /*.no_alloc   =*/ true,
-        };
-        lm_ggml_free(opt_ctx->ctx_copy);
-        opt_ctx->ctx_copy = lm_ggml_init(params);
-    }
-    opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
-    lm_ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
-    opt_ctx->allocated_graph = graph;
-}
-lm_ggml_opt_context_t lm_ggml_opt_init(struct lm_ggml_opt_params params) {
-    lm_ggml_opt_context_t result = new struct lm_ggml_opt_context;
-    result->backend_sched   = params.backend_sched;
-    result->ctx_compute     = params.ctx_compute;
-    result->inputs          = params.inputs;
-    result->outputs         = params.outputs;
-    result->opt_period      = params.opt_period;
-    result->get_opt_pars    = params.get_opt_pars;
-    result->get_opt_pars_ud = params.get_opt_pars_ud;
-    LM_GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
-    LM_GGML_ASSERT(result->opt_period >= 1);
-    const bool accumulate = params.build_type == LM_GGML_OPT_BUILD_TYPE_GRAD ||
-        (params.build_type == LM_GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
-    lm_ggml_set_input(result->inputs);
-    lm_ggml_set_output(result->outputs);
-    result->gf = lm_ggml_new_graph_custom(result->ctx_compute, LM_GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
-    lm_ggml_build_forward_expand(result->gf, result->outputs);
+    lm_ggml_set_input(opt_ctx->inputs);
+    lm_ggml_set_output(opt_ctx->outputs);
     int n_param = 0;
-    for (int i = 0; i < result->gf->n_nodes; ++i) {
-        if (result->gf->nodes[i]->flags & LM_GGML_TENSOR_FLAG_PARAM) {
+    for (int i = 0; i < opt_ctx->gf->n_nodes; ++i) {
+        const struct lm_ggml_tensor * node = opt_ctx->gf->nodes[i];
+        if (node->flags & LM_GGML_TENSOR_FLAG_PARAM) {
             n_param++;
         }
+        LM_GGML_ASSERT(!(node->flags & LM_GGML_TENSOR_FLAG_LOSS) && "support for extra loss terms not implemented");
     }
-    {
+    if (!opt_ctx->ctx_static) {
         // The static context is used for:
-        //   - gradients (1 tensor per param if using gradient accumulation)
+        //   - gradients (1 per loss, 1 tensor per param if using gradient accumulation)
         //   - optimizer momenta (2 tensors per param)
-        //   - labels
-        //   - loss + its gradient (up to 5 tensors)
-        //   - pred
-        //   - ncorrect (2 tensors).
-        const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == LM_GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
-        const size_t size_meta = (tensors_per_param*n_param + 9) * lm_ggml_tensor_overhead();
+        //   - labels (if using static graphs)
+        //   - loss (if using static graphs, up to 5 tensors)
+        //   - pred (if using static graphs)
+        //   - ncorrect (if using static graphs, 2 tensors).
+        constexpr size_t n_loss = 1;
+        const size_t tensors_per_param = (accumulate ? 1 : 0) +
+            (opt_ctx->build_type_alloc == LM_GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
+        const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
+        const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * lm_ggml_tensor_overhead();
         struct lm_ggml_init_params params = {
             /*.mem_size   =*/ size_meta,
             /*.mem_buffer =*/ nullptr,
             /*.no_alloc   =*/ true,
         };
-        result->ctx_static = lm_ggml_init(params);
+        opt_ctx->ctx_static = lm_ggml_init(params);
     }
+    LM_GGML_ASSERT(opt_ctx->build_type <= opt_ctx->build_type_alloc);
     {
-        // The static cpu context is used for:
-        //   - optimizer parameters (1 for the entire context)
+        // The cpu context is allocated statically if using static graphs, dynamically otherwise.
+        // It is used for:
+        //   - optimizer parameters (1 shared for all optimizer invocations)
         const size_t size_meta = 1 * lm_ggml_tensor_overhead();
         struct lm_ggml_init_params params = {
             /*.mem_size   =*/ size_meta,
             /*.mem_buffer =*/ nullptr,
             /*.no_alloc   =*/ true,
         };
-        result->ctx_static_cpu = lm_ggml_init(params);
+        lm_ggml_free(opt_ctx->ctx_cpu);
+        opt_ctx->ctx_cpu = lm_ggml_init(params);
+        lm_ggml_backend_buffer_free(opt_ctx->buf_cpu);
+        opt_ctx->buf_cpu = nullptr;
     }
+    struct lm_ggml_context * ctx_results = opt_ctx->static_graphs ? opt_ctx->ctx_static : opt_ctx->ctx_compute;
-    switch (params.loss_type) {
+    switch (opt_ctx->loss_type) {
         case LM_GGML_OPT_LOSS_TYPE_MEAN: {
-            result->loss = lm_ggml_sum(result->ctx_static, result->outputs);
-            lm_ggml_set_name(result->loss, "loss_sum");
-            const float scale = 1.0f / (result->opt_period * lm_ggml_nelements(result->outputs));
-            result->loss = lm_ggml_scale(result->ctx_static, result->loss, scale);
-            lm_ggml_set_name(result->loss, "loss_mean");
-            result->loss_per_datapoint = true;
+            opt_ctx->loss = lm_ggml_sum(ctx_results, opt_ctx->outputs);
+            lm_ggml_set_name(opt_ctx->loss, "loss_sum");
+            const float scale = 1.0f / (opt_ctx->opt_period * lm_ggml_nelements(opt_ctx->outputs));
+            opt_ctx->loss = lm_ggml_scale(ctx_results, opt_ctx->loss, scale);
+            lm_ggml_set_name(opt_ctx->loss, "loss_mean");
+            opt_ctx->loss_per_datapoint = true;
             break;
         }
         case LM_GGML_OPT_LOSS_TYPE_SUM: {
-            result->loss = lm_ggml_sum(result->ctx_static, result->outputs);
-            lm_ggml_set_name(result->loss, "loss_sum");
-            result->loss_per_datapoint = false;
+            opt_ctx->loss = lm_ggml_sum(ctx_results, opt_ctx->outputs);
+            lm_ggml_set_name(opt_ctx->loss, "loss_sum");
+            opt_ctx->loss_per_datapoint = false;
             break;
         }
         case LM_GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
-            result->labels = lm_ggml_dup_tensor(result->ctx_static, result->outputs);
-            lm_ggml_set_input(result->labels);
-            lm_ggml_set_name(result->labels, "labels");
-            result->loss = lm_ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
-            lm_ggml_set_name(result->loss, "loss_cross_entropy");
-            if (result->opt_period > 1) {
-                result->loss = lm_ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period);
-                lm_ggml_set_name(result->loss, "loss_cross_entropy_scaled");
+            opt_ctx->labels = lm_ggml_dup_tensor(ctx_results, opt_ctx->outputs);
+            lm_ggml_set_input(opt_ctx->labels);
+            lm_ggml_set_name(opt_ctx->labels, "labels");
+            opt_ctx->loss = lm_ggml_cross_entropy_loss(ctx_results, opt_ctx->outputs, opt_ctx->labels);
+            lm_ggml_set_name(opt_ctx->loss, "loss_cross_entropy");
+            if (opt_ctx->opt_period > 1) {
+                opt_ctx->loss = lm_ggml_scale(ctx_results, opt_ctx->loss, 1.0f / opt_ctx->opt_period);
+                lm_ggml_set_name(opt_ctx->loss, "loss_cross_entropy_scaled");
             }
-            result->loss_per_datapoint = true;
+            opt_ctx->loss_per_datapoint = true;
             break;
         }
         case LM_GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
-            result->labels = lm_ggml_dup_tensor(result->ctx_static, result->outputs);
-            lm_ggml_set_input(result->labels);
-            lm_ggml_set_name(result->labels, "labels");
-            result->loss = lm_ggml_sub(result->ctx_static, result->outputs, result->labels);
-            lm_ggml_set_name(result->loss, "loss_error");
-            result->loss = lm_ggml_sqr(result->ctx_static, result->loss);
-            lm_ggml_set_name(result->loss, "loss_squared_error");
-            result->loss = lm_ggml_sum(result->ctx_static, result->loss);
-            lm_ggml_set_name(result->loss, "loss_sum_squared_error");
-            const float scale = 1.0f / (result->opt_period * lm_ggml_nelements(result->outputs));
-            result->loss = lm_ggml_scale(result->ctx_static, result->loss, scale);
-            lm_ggml_set_name(result->loss, "loss_mean_squared_error");
-            result->loss_per_datapoint = true;
+            opt_ctx->labels = lm_ggml_dup_tensor(ctx_results, opt_ctx->outputs);
+            lm_ggml_set_input(opt_ctx->labels);
+            lm_ggml_set_name(opt_ctx->labels, "labels");
+            opt_ctx->loss = lm_ggml_sub(ctx_results, opt_ctx->outputs, opt_ctx->labels);
+            lm_ggml_set_name(opt_ctx->loss, "loss_error");
+            opt_ctx->loss = lm_ggml_sqr(ctx_results, opt_ctx->loss);
+            lm_ggml_set_name(opt_ctx->loss, "loss_squared_error");
+            opt_ctx->loss = lm_ggml_sum(ctx_results, opt_ctx->loss);
+            lm_ggml_set_name(opt_ctx->loss, "loss_sum_squared_error");
+            const float scale = 1.0f / (opt_ctx->opt_period * lm_ggml_nelements(opt_ctx->outputs));
+            opt_ctx->loss = lm_ggml_scale(ctx_results, opt_ctx->loss, scale);
+            lm_ggml_set_name(opt_ctx->loss, "loss_mean_squared_error");
+            opt_ctx->loss_per_datapoint = true;
             break;
         }
     }
-    lm_ggml_set_output(result->loss);
-    lm_ggml_set_loss(result->loss);
-    lm_ggml_build_forward_expand(result->gf, result->loss);
-    result->pred = lm_ggml_argmax(result->ctx_static, result->outputs);
-    lm_ggml_set_name(result->pred, "pred");
-    lm_ggml_set_output(result->pred);
-    lm_ggml_build_forward_expand(result->gf, result->pred);
+    lm_ggml_set_output(opt_ctx->loss);
+    lm_ggml_set_loss(opt_ctx->loss);
+    lm_ggml_build_forward_expand(opt_ctx->gf, opt_ctx->loss);
+    if (opt_ctx->loss_type == LM_GGML_OPT_LOSS_TYPE_CROSS_ENTROPY) {
+        opt_ctx->pred = lm_ggml_argmax(ctx_results, opt_ctx->outputs);
+        lm_ggml_set_name(opt_ctx->pred, "pred");
+        lm_ggml_set_output(opt_ctx->pred);
+        lm_ggml_build_forward_expand(opt_ctx->gf, opt_ctx->pred);
+        opt_ctx->ncorrect = lm_ggml_count_equal(ctx_results, opt_ctx->pred, lm_ggml_argmax(ctx_results, opt_ctx->labels));
+        lm_ggml_set_name(opt_ctx->ncorrect, "ncorrect");
+        lm_ggml_set_output(opt_ctx->ncorrect);
+        lm_ggml_build_forward_expand(opt_ctx->gf, opt_ctx->ncorrect);
+    }
-    if (result->labels) {
-        result->ncorrect = lm_ggml_count_equal(result->ctx_static, result->pred, lm_ggml_argmax(result->ctx_static, result->labels));
-        lm_ggml_set_name(result->ncorrect, "ncorrect");
-        lm_ggml_set_output(result->ncorrect);
-        lm_ggml_build_forward_expand(result->gf, result->ncorrect);
-    } else {
-        result->ncorrect = nullptr;
+    if (opt_ctx->buf_static) {
+        if (opt_ctx->build_type == LM_GGML_OPT_BUILD_TYPE_FORWARD) {
+            return;
+        }
+    } else if (opt_ctx->build_type_alloc == LM_GGML_OPT_BUILD_TYPE_FORWARD) {
+        opt_ctx->buf_static = lm_ggml_backend_alloc_ctx_tensors(
+            opt_ctx->ctx_static, lm_ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
+        return;
     }
-    if (params.build_type == LM_GGML_OPT_BUILD_TYPE_FORWARD) {
-        result->buf_static = lm_ggml_backend_alloc_ctx_tensors(result->ctx_static, lm_ggml_backend_sched_get_backend(result->backend_sched, 0));
-        return result;
+    if (opt_ctx->grad_accs.empty()) {
+        LM_GGML_ASSERT(opt_ctx->build_type_alloc >= LM_GGML_OPT_BUILD_TYPE_GRAD);
+        const int n_nodes = opt_ctx->gf->n_nodes;
+        opt_ctx->grad_accs.resize(n_nodes);
+        for (int i = 0; i < n_nodes; ++i) {
+            lm_ggml_tensor * node = opt_ctx->gf->nodes[i];
+            if ((accumulate && (node->flags & LM_GGML_TENSOR_FLAG_PARAM)) || (node->flags & LM_GGML_TENSOR_FLAG_LOSS)) {
+                opt_ctx->grad_accs[i] = lm_ggml_new_tensor(opt_ctx->ctx_static, LM_GGML_TYPE_F32, LM_GGML_MAX_DIMS, node->ne);
+            } else {
+                opt_ctx->grad_accs[i] = nullptr;
+            }
+        }
+        if (opt_ctx->build_type_alloc >= LM_GGML_OPT_BUILD_TYPE_OPT) {
+            opt_ctx->grad_m.resize(n_nodes);
+            opt_ctx->grad_v.resize(n_nodes);
+            for (int i = 0; i < n_nodes; ++i) {
+                lm_ggml_tensor * node = opt_ctx->gf->nodes[i];
+                if (node->flags & LM_GGML_TENSOR_FLAG_PARAM) {
+                    opt_ctx->grad_m[i] = lm_ggml_new_tensor(opt_ctx->ctx_static, LM_GGML_TYPE_F32, LM_GGML_MAX_DIMS, node->ne);
+                    opt_ctx->grad_v[i] = lm_ggml_new_tensor(opt_ctx->ctx_static, LM_GGML_TYPE_F32, LM_GGML_MAX_DIMS, node->ne);
+                } else {
+                    opt_ctx->grad_m[i] = nullptr;
+                    opt_ctx->grad_v[i] = nullptr;
+                }
+            }
+        }
     }
     // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
-    result->gb_grad = lm_ggml_graph_dup(result->ctx_compute, result->gf);
-    lm_ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
+    opt_ctx->gb_grad = lm_ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true);
+    lm_ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data());
-    if (params.build_type == LM_GGML_OPT_BUILD_TYPE_GRAD) {
-        result->buf_static = lm_ggml_backend_alloc_ctx_tensors(result->ctx_static, lm_ggml_backend_sched_get_backend(result->backend_sched, 0));
-        lm_ggml_graph_reset(result->gb_grad);
-        return result;
+    if (opt_ctx->buf_static) {
+        if (opt_ctx->build_type == LM_GGML_OPT_BUILD_TYPE_GRAD) {
+            return;
+        }
+    } else if (opt_ctx->build_type_alloc == LM_GGML_OPT_BUILD_TYPE_GRAD) {
+        opt_ctx->buf_static = lm_ggml_backend_alloc_ctx_tensors(opt_ctx->ctx_static, lm_ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
+        lm_ggml_graph_reset(opt_ctx->gb_grad);
     }
-    LM_GGML_ASSERT(params.build_type == LM_GGML_OPT_BUILD_TYPE_OPT);
+    LM_GGML_ASSERT(opt_ctx->build_type_alloc == LM_GGML_OPT_BUILD_TYPE_OPT);
     // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
-    result->gb_opt = lm_ggml_graph_dup(result->ctx_compute, result->gb_grad);
+    opt_ctx->gb_opt = lm_ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
-    result->adamw_params = lm_ggml_new_tensor_1d(result->ctx_static_cpu, LM_GGML_TYPE_F32, 7);
-    lm_ggml_set_input(result->adamw_params);
-    lm_ggml_set_name(result->adamw_params, "adamw_params");
+    opt_ctx->adamw_params = lm_ggml_new_tensor_1d(opt_ctx->ctx_cpu, LM_GGML_TYPE_F32, 7);
+    lm_ggml_set_input(opt_ctx->adamw_params);
+    lm_ggml_set_name(opt_ctx->adamw_params, "adamw_params");
-    for (int i = result->gf->n_nodes-1; i >= 0; --i) {
-        struct lm_ggml_tensor * node = result->gb_opt->nodes[i];
-        struct lm_ggml_tensor * grad = lm_ggml_graph_get_grad(result->gb_opt, node);
+    for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
+        struct lm_ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
+        struct lm_ggml_tensor * grad = lm_ggml_graph_get_grad(opt_ctx->gb_opt, node);
-        if (node->flags & LM_GGML_TENSOR_FLAG_PARAM) {
-            struct lm_ggml_tensor * m        = lm_ggml_dup_tensor(result->ctx_static, node);
-            struct lm_ggml_tensor * v        = lm_ggml_dup_tensor(result->ctx_static, node);
-            struct lm_ggml_tensor * opt_step = lm_ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params);
-            lm_ggml_build_forward_expand(result->gb_opt, opt_step);
+        if (grad && (node->flags & LM_GGML_TENSOR_FLAG_PARAM)) {
+            struct lm_ggml_tensor * m        = opt_ctx->grad_m[i];
+            struct lm_ggml_tensor * v        = opt_ctx->grad_v[i];
+            struct lm_ggml_tensor * opt_step = lm_ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
+            lm_ggml_set_name(m,        (std::string("AdamW m for ")    + std::string(node->name)).c_str());
+            lm_ggml_set_name(v,        (std::string("AdamW v for ")    + std::string(node->name)).c_str());
+            lm_ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
+            lm_ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
         }
     }
-    result->buf_static = lm_ggml_backend_alloc_ctx_tensors(
-        result->ctx_static, lm_ggml_backend_sched_get_backend(result->backend_sched, 0));
+    if (!opt_ctx->buf_static) {
+        opt_ctx->buf_static = lm_ggml_backend_alloc_ctx_tensors(
+            opt_ctx->ctx_static, lm_ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
+        lm_ggml_graph_reset(opt_ctx->gb_opt);
+    }
-    result->buf_static_cpu = lm_ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, lm_ggml_backend_cpu_buffer_type());
+    opt_ctx->buf_cpu = lm_ggml_backend_alloc_ctx_tensors_from_buft(opt_ctx->ctx_cpu, lm_ggml_backend_cpu_buffer_type());
+}
-    lm_ggml_graph_reset(result->gb_opt);
+lm_ggml_opt_context_t lm_ggml_opt_init(struct lm_ggml_opt_params params) {
+    lm_ggml_opt_context_t result = new struct lm_ggml_opt_context;
+    result->backend_sched    = params.backend_sched;
+    result->ctx_compute      = params.ctx_compute;
+    result->loss_type        = params.loss_type;
+    result->build_type       = params.build_type;
+    result->build_type_alloc = params.build_type;
+    result->inputs           = params.inputs;
+    result->outputs          = params.outputs;
+    result->opt_period       = params.opt_period;
+    result->get_opt_pars     = params.get_opt_pars;
+    result->get_opt_pars_ud  = params.get_opt_pars_ud;
+    LM_GGML_ASSERT(result->opt_period >= 1);
+    result->static_graphs = result->ctx_compute;
+    if (!result->static_graphs) {
+        LM_GGML_ASSERT(!result->inputs);
+        LM_GGML_ASSERT(!result->outputs);
+        return result;
+    }
+    LM_GGML_ASSERT(result->inputs);
+    LM_GGML_ASSERT(result->outputs);
+    result->gf = lm_ggml_new_graph_custom(result->ctx_compute, LM_GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
+    lm_ggml_build_forward_expand(result->gf, result->outputs);
+    lm_ggml_opt_build(result);
     return result;
 }
@@ -464,9 +561,9 @@ void lm_ggml_opt_free(lm_ggml_opt_context_t opt_ctx) {
         return;
     }
     lm_ggml_backend_buffer_free(opt_ctx->buf_static);
-    lm_ggml_backend_buffer_free(opt_ctx->buf_static_cpu);
+    lm_ggml_backend_buffer_free(opt_ctx->buf_cpu);
     lm_ggml_free(opt_ctx->ctx_static);
-    lm_ggml_free(opt_ctx->ctx_static_cpu);
+    lm_ggml_free(opt_ctx->ctx_cpu);
     delete opt_ctx;
 }
@@ -479,6 +576,10 @@ void lm_ggml_opt_reset(lm_ggml_opt_context_t opt_ctx, bool optimizer) {
     }
 }
+bool lm_ggml_opt_static_graphs(lm_ggml_opt_context_t opt_ctx) {
+    return opt_ctx->static_graphs;
+}
 struct lm_ggml_tensor * lm_ggml_opt_inputs(lm_ggml_opt_context_t opt_ctx) {
     return opt_ctx->inputs;
 }
@@ -582,8 +683,79 @@ void lm_ggml_opt_result_accuracy(lm_ggml_opt_result_t result, double * accuracy,
 // ====== Computation ======
-static void lm_ggml_opt_eval_graph(lm_ggml_opt_context_t opt_ctx, lm_ggml_cgraph * graph, lm_ggml_opt_result * result) {
-    if (graph != opt_ctx->gf) {
+void lm_ggml_opt_prepare_alloc(
+        lm_ggml_opt_context_t    opt_ctx,
+        struct lm_ggml_context * ctx_compute,
+        struct lm_ggml_cgraph  * gf,
+        struct lm_ggml_tensor  * inputs,
+        struct lm_ggml_tensor  * outputs) {
+    LM_GGML_ASSERT(!opt_ctx->static_graphs);
+    opt_ctx->ctx_compute = ctx_compute;
+    opt_ctx->gf          = gf;
+    opt_ctx->inputs      = inputs;
+    opt_ctx->outputs     = outputs;
+}
+void lm_ggml_opt_alloc(lm_ggml_opt_context_t opt_ctx, bool backward) {
+    LM_GGML_ASSERT(!opt_ctx->eval_ready);
+    if (opt_ctx->build_type == LM_GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) {
+        lm_ggml_graph_reset(opt_ctx->gb_grad);
+    }
+    if (backward) {
+        const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
+        opt_ctx->build_type = opt_i_next == 0 ? LM_GGML_OPT_BUILD_TYPE_OPT : LM_GGML_OPT_BUILD_TYPE_GRAD;
+    } else {
+        opt_ctx->build_type = LM_GGML_OPT_BUILD_TYPE_FORWARD;
+    }
+    if (!opt_ctx->static_graphs) {
+        lm_ggml_opt_build(opt_ctx);
+    }
+    struct lm_ggml_cgraph * graph = nullptr;
+    switch (opt_ctx->build_type) {
+        case LM_GGML_OPT_BUILD_TYPE_FORWARD: {
+            graph = opt_ctx->gf;
+        } break;
+        case LM_GGML_OPT_BUILD_TYPE_GRAD: {
+            graph = opt_ctx->gb_grad;
+        } break;
+        case LM_GGML_OPT_BUILD_TYPE_OPT: {
+            graph = opt_ctx->gb_opt;
+        } break;
+    }
+    LM_GGML_ASSERT(graph);
+    if (opt_ctx->allocated_graph == graph) {
+        opt_ctx->eval_ready = true;
+        return;
+    }
+    lm_ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
+    if (opt_ctx->static_graphs) {
+        lm_ggml_init_params params = {
+            /*.mem_size   =*/ graph->size*lm_ggml_tensor_overhead() + lm_ggml_graph_overhead_custom(graph->size, graph->grads),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        lm_ggml_free(opt_ctx->ctx_copy);
+        opt_ctx->ctx_copy = lm_ggml_init(params);
+        opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
+    } else {
+        opt_ctx->allocated_graph_copy = graph;
+    }
+    lm_ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
+    opt_ctx->allocated_graph = graph;
+    opt_ctx->eval_ready = true;
+}
+void lm_ggml_opt_eval(lm_ggml_opt_context_t opt_ctx, lm_ggml_opt_result_t result) {
+    LM_GGML_ASSERT(opt_ctx->eval_ready);
+    if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
         struct lm_ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
         LM_GGML_ASSERT(opt_pars.adamw.alpha >  0.0f);
@@ -609,9 +781,19 @@ static void lm_ggml_opt_eval_graph(lm_ggml_opt_context_t opt_ctx, lm_ggml_cgraph
         adamw_par_data[6] = beta2h;
     }
-    lm_ggml_opt_alloc_graph(opt_ctx, graph);
     lm_ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
     opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
+    opt_ctx->opt_i = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
+    if (!opt_ctx->static_graphs) {
+        opt_ctx->gf                   = nullptr;
+        opt_ctx->gb_grad              = nullptr;
+        opt_ctx->gb_opt               = nullptr;
+        opt_ctx->allocated_graph      = nullptr;
+        opt_ctx->allocated_graph_copy = nullptr;
+    }
+    opt_ctx->eval_ready = false;
     if (!result) {
         return;
@@ -635,12 +817,14 @@ static void lm_ggml_opt_eval_graph(lm_ggml_opt_context_t opt_ctx, lm_ggml_cgraph
     lm_ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, lm_ggml_nbytes(opt_ctx->loss));
     result->loss.push_back(loss);
-    LM_GGML_ASSERT(opt_ctx->pred->type == LM_GGML_TYPE_I32);
-    std::vector<int32_t> pred(ndata);
-    lm_ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, lm_ggml_nbytes(opt_ctx->pred));
-    result->pred.insert(result->pred.end(), pred.begin(), pred.end());
+    if (opt_ctx->pred) {
+        LM_GGML_ASSERT(opt_ctx->pred->type == LM_GGML_TYPE_I32);
+        std::vector<int32_t> pred(ndata);
+        lm_ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, lm_ggml_nbytes(opt_ctx->pred));
+        result->pred.insert(result->pred.end(), pred.begin(), pred.end());
+    }
-    if (!opt_ctx->labels || result->ncorrect < 0) {
+    if (!opt_ctx->ncorrect || result->ncorrect < 0) {
         result->ncorrect = -1;
         return;
     }
@@ -652,26 +836,6 @@ static void lm_ggml_opt_eval_graph(lm_ggml_opt_context_t opt_ctx, lm_ggml_cgraph
     result->ncorrect += ncorrect;
 }
-void lm_ggml_opt_forward(lm_ggml_opt_context_t opt_ctx, lm_ggml_opt_result * result) {
-    lm_ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result);
-}
-void lm_ggml_opt_forward_backward(lm_ggml_opt_context_t opt_ctx, lm_ggml_opt_result * result) {
-    if (opt_ctx->opt_period == 1) {
-        lm_ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
-        return;
-    }
-    const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
-    if (opt_i_next == 0) {
-        lm_ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
-        lm_ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
-    } else {
-        lm_ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result);
-    }
-    opt_ctx->opt_i = opt_i_next;
-}
 // ====== High-Level Functions ======
 void lm_ggml_opt_epoch(
@@ -682,6 +846,7 @@ void lm_ggml_opt_epoch(
         int64_t                 idata_split,
         lm_ggml_opt_epoch_callback callback_train,
         lm_ggml_opt_epoch_callback callback_eval) {
+    LM_GGML_ASSERT(lm_ggml_opt_static_graphs(opt_ctx) && "lm_ggml_opt_epoch requires static graphs");
     struct lm_ggml_tensor * inputs = lm_ggml_opt_inputs(opt_ctx);
     struct lm_ggml_tensor * labels = lm_ggml_opt_labels(opt_ctx);
     struct lm_ggml_tensor * data   = lm_ggml_opt_dataset_data(dataset);
@@ -700,16 +865,18 @@ void lm_ggml_opt_epoch(
     int64_t ibatch = 0;
     int64_t t_loop_start = lm_ggml_time_us();
     for (; ibatch < ibatch_split; ++ibatch) {
+        lm_ggml_opt_alloc(opt_ctx, /*backward =*/ true);
         lm_ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
-        lm_ggml_opt_forward_backward(opt_ctx, result_train);
+        lm_ggml_opt_eval(opt_ctx, result_train);
         if (callback_train) {
             callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
         }
     }
     t_loop_start = lm_ggml_time_us();
     for (; ibatch < nbatches; ++ibatch) {
+        lm_ggml_opt_alloc(opt_ctx, /*backward =*/ false);
         lm_ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
-        lm_ggml_opt_forward(opt_ctx, result_eval);
+        lm_ggml_opt_eval(opt_ctx, result_eval);
         if (callback_eval) {
             callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
         }
@@ -726,13 +893,26 @@ void lm_ggml_opt_epoch_callback_progress_bar(
         int64_t            t_start_us) {
     fprintf(stderr, "%s[", train ? "train: " : "val:   ");
-    constexpr int64_t bar_length = 25;
+    // The progress bar consists of partially filled blocks, unicode has 8 separate fill levels.
+    constexpr int64_t bar_length = 8;
+    const int64_t ibatch8 = 8 * ibatch;
     for (int64_t j = 0; j < bar_length; ++j) {
-        const int64_t ibatch_j = ibatch_max * j/bar_length;
-        if (ibatch_j < ibatch) {
-            fprintf(stderr, "=");
-        } else if (ibatch_max * (j - 1)/bar_length < ibatch) {
-            fprintf(stderr, ">");
+        if        (ibatch_max * (8*j + 8) / bar_length < ibatch8) {
+            fprintf(stderr, "\u2588"); // full block
+        } else if (ibatch_max * (8*j + 7) / bar_length < ibatch8) {
+            fprintf(stderr, "\u2589"); // 7/8 filled
+        } else if (ibatch_max * (8*j + 6) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258A"); // 6/8 filled
+        } else if (ibatch_max * (8*j + 5) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258B"); // 5/8 filled
+        } else if (ibatch_max * (8*j + 4) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258C"); // 4/8 filled
+        } else if (ibatch_max * (8*j + 3) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258D"); // 3/8 filled
+        } else if (ibatch_max * (8*j + 2) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258E"); // 2/8 filled
+        } else if (ibatch_max * (8*j + 1) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258F"); // 1/8 filled
         } else {
             fprintf(stderr, " ");
         }
@@ -764,8 +944,8 @@ void lm_ggml_opt_epoch_callback_progress_bar(
     const int64_t t_eta_m = t_eta_s / 60;
     t_eta_s -= t_eta_m * 60;
-    fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, "
-            "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r",
+    fprintf(stderr, "] data=%07" PRId64 "/%07" PRId64 " loss=%.5lf±%.5lf acc=%.2lf±%.2lf%% "
+            "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " \r",
             idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
             t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
     if (ibatch == ibatch_max) {
@@ -806,7 +986,10 @@ void lm_ggml_opt_fit(
     int64_t epoch = 1;
-    lm_ggml_opt_params params = lm_ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
+    lm_ggml_opt_params params = lm_ggml_opt_default_params(backend_sched, loss_type);
+    params.ctx_compute     = ctx_compute;
+    params.inputs          = inputs;
+    params.outputs         = outputs;
     params.opt_period      = opt_period;
     params.get_opt_pars    = get_opt_pars;
     params.get_opt_pars_ud = &epoch;