npm - cui-llama.rn - Versions diffs - 1.6.0 → 1.7.0 - Mend

cui-llama.rn 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (285)

package/cpp/llama-context.h CHANGED

@@ -7,6 +7,7 @@
 #include "llama-adapter.h"
 #include "ggml-cpp.h"
+#include "ggml-opt.h"
 #include <map>
 #include <vector>
@@ -27,7 +28,12 @@ struct llama_context {
     void synchronize();
-    const llama_model & get_model() const;
+    const llama_model   & get_model()   const;
+    const llama_cparams & get_cparams() const;
+    lm_ggml_backend_sched_t get_sched() const;
+    lm_ggml_context * get_ctx_compute() const;
     uint32_t n_ctx()         const;
     uint32_t n_ctx_per_seq() const;
@@ -128,6 +134,32 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
+    //
+    // training
+    //
+    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
+    void opt_epoch(
+            lm_ggml_opt_dataset_t      dataset,
+            lm_ggml_opt_result_t       result_train,
+            lm_ggml_opt_result_t       result_eval,
+            int64_t                 idata_split,
+            lm_ggml_opt_epoch_callback callback_train,
+            lm_ggml_opt_epoch_callback callback_eval);
+    void opt_epoch_iter(
+            lm_ggml_opt_dataset_t               dataset,
+            lm_ggml_opt_result_t                result,
+            const std::vector<llama_token> & tokens,
+            const std::vector<llama_token> & labels_sparse,
+            llama_batch                    & batch,
+            lm_ggml_opt_epoch_callback          callback,
+            bool                             train,
+            int64_t                          idata_in_loop,
+            int64_t                          ndata_in_loop,
+            int64_t                          t_loop_start);
 private:
     //
     // output
@@ -137,50 +169,30 @@ private:
     // Returns max number of outputs for which space was reserved.
     int32_t output_reserve(int32_t n_outputs);
-    // make the outputs have the same order they had in the user-provided batch
-    // TODO: maybe remove this
-    void output_reorder();
     //
     // graph
     //
+public:
     int32_t graph_max_nodes() const;
     // zero-out inputs and create the ctx_compute for the compute graph
     lm_ggml_cgraph * graph_init();
+    // returns the result of lm_ggml_backend_sched_graph_compute_async execution
+    lm_ggml_status graph_compute(
+            lm_ggml_cgraph * gf,
+                   bool   batched);
+private:
     llm_graph_result_ptr graph_build(
             lm_ggml_context * ctx,
              lm_ggml_cgraph * gf,
       const llama_ubatch & ubatch,
           llm_graph_type   gtype);
-    // returns the result of lm_ggml_backend_sched_graph_compute_async execution
-    lm_ggml_status graph_compute(
-            lm_ggml_cgraph * gf,
-                   bool   batched);
     llm_graph_cb graph_get_cb() const;
-    // used by kv_self_update()
-    lm_ggml_tensor * build_rope_shift(
-        lm_ggml_context * ctx0,
-        lm_ggml_tensor * cur,
-        lm_ggml_tensor * shift,
-        lm_ggml_tensor * factors,
-              float   freq_base,
-              float   freq_scale,
-        lm_ggml_backend_buffer * bbuf) const;
-    llm_graph_result_ptr build_kv_self_shift(
-            lm_ggml_context * ctx0,
-            lm_ggml_cgraph * gf) const;
-    llm_graph_result_ptr build_kv_self_defrag(
-            lm_ggml_context * ctx0,
-            lm_ggml_cgraph * gf) const;
     // TODO: read/write lora adapters and cvec
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i  & io);
@@ -197,14 +209,10 @@ private:
     llama_cparams       cparams;
     llama_adapter_cvec  cvec;
     llama_adapter_loras loras;
-    llama_sbatch        sbatch;
     llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
-    std::unique_ptr<llama_kv_cache_unified> kv_self;
-    // TODO: remove
-    bool logits_all = false;
+    std::unique_ptr<llama_memory_i> memory;
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t  logits_size = 0; // capacity (of floats) for logits
@@ -231,6 +239,9 @@ private:
     lm_ggml_context_ptr ctx_compute;
+    // training
+    lm_ggml_opt_context_t opt_ctx = nullptr;
     lm_ggml_threadpool_t threadpool       = nullptr;
     lm_ggml_threadpool_t threadpool_batch = nullptr;

package/cpp/llama-cparams.h CHANGED

@@ -30,6 +30,7 @@ struct llama_cparams {
     bool flash_attn;
     bool no_perf;
     bool warmup;
+    bool op_offload;
     enum llama_pooling_type pooling_type;