cui-llama.rn 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +2 -0
  2. package/android/src/main/CMakeLists.txt +2 -2
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +39 -0
  5. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +5 -0
  6. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +5 -0
  7. package/cpp/common.cpp +36 -1
  8. package/cpp/common.h +5 -1
  9. package/cpp/ggml-aarch64.c +2 -11
  10. package/cpp/ggml-alloc.h +1 -1
  11. package/cpp/ggml-backend-impl.h +151 -78
  12. package/cpp/{ggml-backend.c → ggml-backend.cpp} +565 -269
  13. package/cpp/ggml-backend.h +147 -62
  14. package/cpp/ggml-impl.h +15 -0
  15. package/cpp/ggml-metal.h +8 -9
  16. package/cpp/ggml-metal.m +2428 -2111
  17. package/cpp/ggml-quants.c +2 -2
  18. package/cpp/ggml-quants.h +0 -4
  19. package/cpp/ggml.c +799 -1121
  20. package/cpp/ggml.h +79 -72
  21. package/cpp/llama-vocab.cpp +189 -106
  22. package/cpp/llama-vocab.h +18 -9
  23. package/cpp/llama.cpp +736 -341
  24. package/cpp/llama.h +9 -4
  25. package/cpp/unicode-data.cpp +6 -4
  26. package/cpp/unicode-data.h +4 -4
  27. package/cpp/unicode.cpp +14 -7
  28. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  29. package/lib/commonjs/index.js +4 -0
  30. package/lib/commonjs/index.js.map +1 -1
  31. package/lib/module/NativeRNLlama.js.map +1 -1
  32. package/lib/module/index.js +3 -0
  33. package/lib/module/index.js.map +1 -1
  34. package/lib/typescript/NativeRNLlama.d.ts +6 -0
  35. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  36. package/lib/typescript/index.d.ts +2 -1
  37. package/lib/typescript/index.d.ts.map +1 -1
  38. package/package.json +1 -1
  39. package/src/NativeRNLlama.ts +7 -0
  40. package/src/index.ts +5 -0
package/README.md CHANGED
@@ -12,6 +12,8 @@ The following features have been added for Android:
  - tokenizeSync: non-blocking, synchronous tokenizer function
  - Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)
  - XTC sampling
+ - Progress callback
+ - Retrieving CPU Features to check for i8mm and dotprod flags

  Original repo README.md below.
 
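Note: the CPU-feature check added in this release (see the RNLlama.java change below) resolves a map of three booleans: armv8, i8mm and dotprod. A minimal sketch of how an app might consume it, assuming the method is re-exported from the package's index as getCpuFeatures (the exact JS export is not shown in this diff):

    // Sketch only: the export name and result keys are taken from the native
    // getCpuFeatures() method in RNLlama.java; verify against src/index.ts.
    import { getCpuFeatures } from 'cui-llama.rn';

    async function logCpuSupport(): Promise<void> {
      const { armv8, i8mm, dotprod } = await getCpuFeatures();
      // i8mm and dotprod indicate whether the optimized arm64 kernels can be used.
      console.log(`arm64-v8a: ${armv8}, i8mm: ${i8mm}, dotprod: ${dotprod}`);
    }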
package/android/src/main/CMakeLists.txt CHANGED
@@ -16,7 +16,7 @@ set(

  ${RNLLAMA_LIB_DIR}/ggml-aarch64.c
  ${RNLLAMA_LIB_DIR}/ggml-alloc.c
- ${RNLLAMA_LIB_DIR}/ggml-backend.c
+ ${RNLLAMA_LIB_DIR}/ggml-backend.cpp
  ${RNLLAMA_LIB_DIR}/ggml.c
  ${RNLLAMA_LIB_DIR}/ggml-quants.c
  ${RNLLAMA_LIB_DIR}/common.cpp
@@ -55,7 +55,7 @@ function(build_library target_name cpu_flags)
  # NOTE: If you want to debug the native code, you can uncomment if and endif
  # Note that it will be extremely slow
  # if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
- target_compile_options(${target_name} PRIVATE -O3 -DNDEBUG)
+ target_compile_options(${target_name} PRIVATE -O3 -DNDEBUG -DRNLLAMA_USE_FD_FILE)
  target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
  target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
 
package/android/src/main/java/com/rnllama/LlamaContext.java CHANGED
@@ -10,6 +10,8 @@ import com.facebook.react.modules.core.DeviceEventManagerModule;

  import android.util.Log;
  import android.os.Build;
+ import android.os.ParcelFileDescriptor;
+ import android.net.Uri;
  import android.content.res.AssetManager;

  import java.lang.StringBuilder;
@@ -17,6 +19,7 @@ import java.io.BufferedReader;
  import java.io.FileReader;
  import java.io.File;
  import java.io.IOException;
+ import java.io.InputStream;
  import java.io.FileInputStream;

  public class LlamaContext {
@@ -31,11 +34,17 @@ public class LlamaContext {

  private byte[] ggufHeader = {0x47, 0x47, 0x55, 0x46};

- private boolean isGGUF(final String filepath) {
+ private boolean isGGUF(final String filepath, final ReactApplicationContext reactContext) {
  byte[] fileHeader = new byte[4];
- FileInputStream fis = null;
+ InputStream fis = null;
  try {
- fis = new FileInputStream(filepath);
+ if (filepath.startsWith("content")) {
+ fis = reactContext.getApplicationContext().getContentResolver().openInputStream(Uri.parse(filepath));
+ } else {
+ fis = new FileInputStream(filepath);
+ }
+
+
  int bytesRead = fis.read(fileHeader);
  if(bytesRead < 4) {
  return false;
@@ -52,7 +61,7 @@ public class LlamaContext {
  try {
  fis.close();
  } catch (Exception e) {
- Log.d(NAME, "Closing FileInputStream failed.");
+ Log.d(NAME, "Closing InputStream failed.");
  }
  }
  }
@@ -65,16 +74,29 @@ public class LlamaContext {
  if (!params.hasKey("model")) {
  throw new IllegalArgumentException("Missing required parameter: model");
  }
- // Check if file has GGUF magic numbers
- if(!isGGUF(params.getString("model"))) {
+
+ String modelName = params.getString("model");
+
+ if(!isGGUF(modelName, reactContext)) {
  throw new IllegalArgumentException("File is not in GGUF format");
  }

+ if ( modelName.startsWith("content://")) {
+ Uri uri = Uri.parse(modelName);
+ try {
+ ParcelFileDescriptor pfd = reactContext.getApplicationContext().getContentResolver().openFileDescriptor(uri, "r");
+ modelName = "" + pfd.getFd();
+ } catch (Exception e) {
+ Log.e(NAME, "Failed to convert to FD!");
+ }
+ }
+
+ // Check if file has GGUF magic numbers
  this.id = id;
  eventEmitter = reactContext.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter.class);
  this.context = initContext(
  // String model,
- params.getString("model"),
+ modelName,
  // boolean embedding,
  params.hasKey("embedding") ? params.getBoolean("embedding") : false,
  // int n_ctx,
@@ -325,7 +347,7 @@ public class LlamaContext {
  }
  }

- private static boolean isArm64V8a() {
+ public static boolean isArm64V8a() {
  return Build.SUPPORTED_ABIS[0].equals("arm64-v8a");
  }

@@ -333,7 +355,7 @@ public class LlamaContext {
  return Build.SUPPORTED_ABIS[0].equals("x86_64");
  }

- private static String getCpuFeatures() {
+ public static String getCpuFeatures() {
  File file = new File("/proc/cpuinfo");
  StringBuilder stringBuilder = new StringBuilder();
  try {
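Note: with the change above, the model parameter may now be a Storage Access Framework content:// URI; the native side opens it through the ContentResolver and passes the resulting file descriptor to the loader. A sketch of the JavaScript side, assuming initLlama and its model/n_ctx parameters follow the upstream llama.rn API:

    // Sketch only: the content:// URI would come from a document picker;
    // initLlama and its parameter names are assumed from the upstream llama.rn API.
    import { initLlama } from 'cui-llama.rn';

    async function loadModelFromContentUri(uri: string) {
      // e.g. a URI returned by the system document picker
      const context = await initLlama({
        model: uri, // native code detects the content:// scheme and converts it to an fd
        n_ctx: 2048,
      });
      return context;
    }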
package/android/src/main/java/com/rnllama/RNLlama.java CHANGED
@@ -294,7 +294,46 @@ public class RNLlama implements LifecycleEventListener {
  return context.tokenize(text);
  }

+ public void getCpuFeatures(Promise promise) {
+ AsyncTask task = new AsyncTask<Void, Void, WritableMap>() {
+ private Exception exception;
+ @Override
+ protected WritableMap doInBackground(Void... voids) {
+ try {
+ WritableMap result = Arguments.createMap();
+ boolean isV8 = LlamaContext.isArm64V8a();
+ result.putBoolean("armv8", isV8);
+
+ if(isV8) {
+ String cpuFeatures = LlamaContext.getCpuFeatures();
+ boolean hasDotProd = cpuFeatures.contains("dotprod") || cpuFeatures.contains("asimddp");
+ boolean hasInt8Matmul = cpuFeatures.contains("i8mm");
+ result.putBoolean("i8mm", hasInt8Matmul);
+ result.putBoolean("dotprod", hasDotProd);
+ } else {
+ result.putBoolean("i8mm", false);
+ result.putBoolean("dotprod", false);
+ }
+ return result;
+ } catch (Exception e) {
+ exception = e;
+ return null;
+ }
+ }

+ @Override
+ protected void onPostExecute(WritableMap result) {
+ if (exception != null) {
+ promise.reject(exception);
+ return;
+ }
+ promise.resolve(result);
+ tasks.remove(this);
+ }
+ }.executeOnExecutor(AsyncTask.THREAD_POOL_EXECUTOR);
+ tasks.put(task, "getCPUFeatures");
+ }
+
  public void detokenize(double id, final ReadableArray tokens, final Promise promise) {
  final int contextId = (int) id;
  AsyncTask task = new AsyncTask<Void, Void, String>() {
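Note: the resolved map mirrors the keys written above. A TypeScript shape for the result; the interface name is illustrative and not taken from NativeRNLlama.ts:

    // Illustrative only: the keys come from RNLlama.getCpuFeatures() above;
    // the name CpuFeatures is not from the package.
    interface CpuFeatures {
      armv8: boolean;   // running on an arm64-v8a ABI
      i8mm: boolean;    // /proc/cpuinfo reports i8mm
      dotprod: boolean; // /proc/cpuinfo reports dotprod or asimddp
    }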
package/android/src/newarch/java/com/rnllama/RNLlamaModule.java CHANGED
@@ -78,6 +78,11 @@ public class RNLlamaModule extends NativeRNLlamaSpec {
  return rnllama.tokenizeSync(id, text);
  }

+ @ReactMethod
+ public void getCpuFeatures(final Promise promise) {
+ rnllama.getCpuFeatures(promise);
+ }
+
  @ReactMethod
  public void detokenize(double id, final ReadableArray tokens, final Promise promise) {
  rnllama.detokenize(id, tokens, promise);
package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java CHANGED
@@ -79,6 +79,11 @@ public class RNLlamaModule extends ReactContextBaseJavaModule {
  return rnllama.tokenizeSync(id, text);
  }

+ @ReactMethod
+ public void getCpuFeatures(final Promise promise) {
+ rnllama.getCpuFeatures(promise);
+ }
+
  @ReactMethod
  public void detokenize(double id, final ReadableArray tokens, final Promise promise) {
  rnllama.detokenize(id, tokens, promise);
package/cpp/common.cpp CHANGED
@@ -844,6 +844,31 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  return iparams;
  }

+ if (params.reranking) {
+ bool ok = true;
+
+ if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+ ok = false;
+ }
+
+ if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+ ok = false;
+ }
+
+ if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
+ LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+ ok = false;
+ }
+
+ if (!ok) {
+ llama_free_model(model);
+
+ return iparams;
+ }
+ }
+
  auto cparams = llama_context_params_from_gpt_params(params);

  llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -861,6 +886,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  if (cvec.n_embd == -1) {
  llama_free(lctx);
  llama_free_model(model);
+
  return iparams;
  }

@@ -873,6 +899,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  if (err) {
  llama_free(lctx);
  llama_free_model(model);
+
  return iparams;
  }
  }
@@ -895,7 +922,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
  llama_lora_adapters_apply(lctx, iparams.lora_adapters);
  }

- if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
+ if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
  LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
  params.sparams.ignore_eos = false;
  }
@@ -936,6 +963,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

  iparams.model = model;
  iparams.context = lctx;
+
  return iparams;
  }

@@ -1033,6 +1061,11 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
  cparams.flash_attn = params.flash_attn;
  cparams.no_perf = params.no_perf;

+ if (params.reranking) {
+ cparams.embeddings = true;
+ cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
+ }
+
  cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
  cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

@@ -1442,6 +1475,8 @@ void llama_batch_add(
  llama_pos pos,
  const std::vector<llama_seq_id> & seq_ids,
  bool logits) {
+ LM_GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
+
  batch.token [batch.n_tokens] = id;
  batch.pos [batch.n_tokens] = pos;
  batch.n_seq_id[batch.n_tokens] = seq_ids.size();
package/cpp/common.h CHANGED
@@ -290,6 +290,7 @@ struct gpt_params {
  int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
  std::string embd_sep = "\n"; // separator of embendings
+ bool reranking = false; // enable reranking support on server

  // server params
  int32_t port = 8080; // server listens on this network port
@@ -308,7 +309,10 @@ struct gpt_params {
  std::string ssl_file_key = ""; // NOLINT
  std::string ssl_file_cert = ""; // NOLINT

- bool endpoint_slots = true;
+ // "advanced" endpoints are disabled by default for better security
+ bool webui = true;
+ bool endpoint_slots = false;
+ bool endpoint_props = false; // only control POST requests, not GET
  bool endpoint_metrics = false;

  bool log_json = false;
package/cpp/ggml-aarch64.c CHANGED
@@ -598,15 +598,6 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
  return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
  }

- // Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
- static int sve_lane_count(void) {
- #if defined(__ARM_FEATURE_SVE)
- return lm_ggml_sve_cnt_b;
- #else
- return 0;
- #endif
- }
-
  void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
  const int qk = QK8_0;
  const int nb = n / qk;
@@ -843,7 +834,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void

  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
  #if defined(__ARM_FEATURE_SVE)
- if (lm_ggml_cpu_has_sve() && sve_lane_count() == QK8_0) {
+ if (lm_ggml_cpu_has_sve() && lm_ggml_cpu_get_sve_cnt() == QK8_0) {
  const void * b_ptr = vx;
  const void * a_ptr = vy;
  float * res_ptr = s;
@@ -2020,7 +2011,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void

  #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
  #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (lm_ggml_cpu_has_sve() && lm_ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) {
+ if (lm_ggml_cpu_has_sve() && lm_ggml_cpu_has_matmul_int8() && lm_ggml_cpu_get_sve_cnt() == QK8_0) {
  const void * b_ptr = vx;
  const void * a_ptr = vy;
  float * res_ptr = s;
package/cpp/ggml-alloc.h CHANGED
@@ -24,7 +24,7 @@ LM_GGML_API void lm_ggml_tallocr_alloc(struct lm_ggml_tallocr * t
  // Graph allocator
  /*
  Example usage:
- lm_ggml_gallocr_t galloc = lm_ggml_gallocr_new(lm_ggml_bacckend_cpu_buffer_type());
+ lm_ggml_gallocr_t galloc = lm_ggml_gallocr_new(lm_ggml_backend_cpu_buffer_type());

  // optional: create a worst-case graph and reserve the buffers to avoid reallocations
  lm_ggml_gallocr_reserve(galloc, build_graph(max_batch));
package/cpp/ggml-backend-impl.h CHANGED
@@ -9,145 +9,218 @@ extern "C" {
  #endif

  //
- // Backend buffer
+ // Backend buffer type
  //

- // buffer type
- typedef void * lm_ggml_backend_buffer_type_context_t;
-
  struct lm_ggml_backend_buffer_type_i {
- const char * (*LM_GGML_CALL get_name) (lm_ggml_backend_buffer_type_t buft);
+ const char * (*get_name) (lm_ggml_backend_buffer_type_t buft);
  // allocate a buffer of this type
- lm_ggml_backend_buffer_t (*LM_GGML_CALL alloc_buffer) (lm_ggml_backend_buffer_type_t buft, size_t size);
+ lm_ggml_backend_buffer_t (*alloc_buffer) (lm_ggml_backend_buffer_type_t buft, size_t size);
  // tensor alignment
- size_t (*LM_GGML_CALL get_alignment) (lm_ggml_backend_buffer_type_t buft);
- // max buffer size that can be allocated
- size_t (*LM_GGML_CALL get_max_size) (lm_ggml_backend_buffer_type_t buft);
- // data size needed to allocate the tensor, including padding
- size_t (*LM_GGML_CALL get_alloc_size) (lm_ggml_backend_buffer_type_t buft, const struct lm_ggml_tensor * tensor);
- // check if tensor data is in host memory
- bool (*LM_GGML_CALL is_host) (lm_ggml_backend_buffer_type_t buft);
+ size_t (*get_alignment) (lm_ggml_backend_buffer_type_t buft);
+ // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
+ size_t (*get_max_size) (lm_ggml_backend_buffer_type_t buft);
+ // (optional) data size needed to allocate the tensor, including padding (defaults to lm_ggml_nbytes)
+ size_t (*get_alloc_size)(lm_ggml_backend_buffer_type_t buft, const struct lm_ggml_tensor * tensor);
+ // (optional) check if tensor data is in host memory (defaults to false)
+ bool (*is_host) (lm_ggml_backend_buffer_type_t buft);
  };

  struct lm_ggml_backend_buffer_type {
  struct lm_ggml_backend_buffer_type_i iface;
- lm_ggml_backend_buffer_type_context_t context;
+ lm_ggml_backend_dev_t device;
+ void * context;
  };

- // buffer
- typedef void * lm_ggml_backend_buffer_context_t;
+ //
+ // Backend buffer
+ //

  struct lm_ggml_backend_buffer_i {
- const char * (*LM_GGML_CALL get_name) (lm_ggml_backend_buffer_t buffer);
- void (*LM_GGML_CALL free_buffer) (lm_ggml_backend_buffer_t buffer);
- void * (*LM_GGML_CALL get_base) (lm_ggml_backend_buffer_t buffer);
- void (*LM_GGML_CALL init_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor);
- void (*LM_GGML_CALL memset_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
- void (*LM_GGML_CALL set_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*LM_GGML_CALL get_tensor) (lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*LM_GGML_CALL cpy_tensor) (lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
- void (*LM_GGML_CALL clear) (lm_ggml_backend_buffer_t buffer, uint8_t value);
- void (*LM_GGML_CALL reset) (lm_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+ const char * (*get_name) (lm_ggml_backend_buffer_t buffer);
+ // (optional) free the buffer
+ void (*free_buffer) (lm_ggml_backend_buffer_t buffer);
+ // base address of the buffer
+ void * (*get_base) (lm_ggml_backend_buffer_t buffer);
+ // (optional) initialize a tensor in the buffer (eg. add tensor extras)
+ void (*init_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor);
+ // tensor data access
+ void (*memset_tensor)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
+ void (*set_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*get_tensor) (lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
+ bool (*cpy_tensor) (lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst);
+ // clear the entire buffer
+ void (*clear) (lm_ggml_backend_buffer_t buffer, uint8_t value);
+ // (optional) reset any internal state due to tensor initialization, such as tensor extras
+ void (*reset) (lm_ggml_backend_buffer_t buffer);
  };

  struct lm_ggml_backend_buffer {
  struct lm_ggml_backend_buffer_i iface;
  lm_ggml_backend_buffer_type_t buft;
- lm_ggml_backend_buffer_context_t context;
+ void * context;
  size_t size;
  enum lm_ggml_backend_buffer_usage usage;
  };

- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
- lm_ggml_backend_buffer_type_t buft,
- struct lm_ggml_backend_buffer_i iface,
- lm_ggml_backend_buffer_context_t context,
- size_t size);
+ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
+ lm_ggml_backend_buffer_type_t buft,
+ struct lm_ggml_backend_buffer_i iface,
+ void * context,
+ size_t size);

  // do not use directly, use lm_ggml_backend_tensor_copy instead
  bool lm_ggml_backend_buffer_copy_tensor(const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst);

+ // multi-buffer
  // buffer that contains a collection of buffers
- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers);
- LM_GGML_CALL bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer);
- LM_GGML_CALL void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage);
+ lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers);
+ bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer);
+ void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage);

  //
- // Backend
+ // Backend (stream)
  //

- typedef void * lm_ggml_backend_context_t;
-
  struct lm_ggml_backend_i {
- const char * (*LM_GGML_CALL get_name)(lm_ggml_backend_t backend);
+ const char * (*get_name)(lm_ggml_backend_t backend);

- void (*LM_GGML_CALL free)(lm_ggml_backend_t backend);
+ void (*free)(lm_ggml_backend_t backend);

+ // Will be moved to the device interface
  // buffer allocation
- lm_ggml_backend_buffer_type_t (*LM_GGML_CALL get_default_buffer_type)(lm_ggml_backend_t backend);
+ lm_ggml_backend_buffer_type_t (*get_default_buffer_type)(lm_ggml_backend_t backend);

  // (optional) asynchronous tensor data access
- void (*LM_GGML_CALL set_tensor_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*LM_GGML_CALL get_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*LM_GGML_CALL cpy_tensor_async)(lm_ggml_backend_t backend_src, lm_ggml_backend_t backend_dst, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst);
+ void (*set_tensor_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*get_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ bool (*cpy_tensor_async)(lm_ggml_backend_t backend_src, lm_ggml_backend_t backend_dst, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst);

  // (optional) complete all pending operations
- void (*LM_GGML_CALL synchronize)(lm_ggml_backend_t backend);
+ void (*synchronize)(lm_ggml_backend_t backend);

- // compute graph with a plan (not used currently)
- // create a new plan for a graph
- lm_ggml_backend_graph_plan_t (*LM_GGML_CALL graph_plan_create) (lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph);
- void (*LM_GGML_CALL graph_plan_free) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan);
+ // (optional) compute graph with a plan (not used currently)
+ lm_ggml_backend_graph_plan_t (*graph_plan_create) (lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph);
+ void (*graph_plan_free) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan);
  // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
- void (*LM_GGML_CALL graph_plan_update) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan, const struct lm_ggml_cgraph * cgraph);
+ void (*graph_plan_update) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan, const struct lm_ggml_cgraph * cgraph);
  // compute the graph with the plan
- enum lm_ggml_status (*LM_GGML_CALL graph_plan_compute)(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan);
-
- // compute graph without a plan (async)
- enum lm_ggml_status (*LM_GGML_CALL graph_compute) (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph);
-
- // check if the backend can compute an operation
- bool (*LM_GGML_CALL supports_op)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op);
+ enum lm_ggml_status (*graph_plan_compute)(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan);

- // check if the backend can use tensors allocated in a buffer type
- bool (*LM_GGML_CALL supports_buft)(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft);
+ // compute graph (always async if supported by the backend)
+ enum lm_ggml_status (*graph_compute) (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph);

- // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
- // these should be expensive operations with large batch sizes that may benefit from running on this backend
- // even if the weight has to be copied from the CPU temporarily
- bool (*LM_GGML_CALL offload_op)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op);
+ // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
+ // new backends should implement the device interface instead
+ // These functions are being moved to the device interface
+ bool (*supports_op) (lm_ggml_backend_t backend, const struct lm_ggml_tensor * op);
+ bool (*supports_buft)(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft);
+ bool (*offload_op) (lm_ggml_backend_t backend, const struct lm_ggml_tensor * op);

  // (optional) event synchronization
- // create a new event that can record events on this backend instance
- lm_ggml_backend_event_t (*LM_GGML_CALL event_new) (lm_ggml_backend_t backend);
- void (*LM_GGML_CALL event_free) (lm_ggml_backend_event_t event);
- // record an event on the backend instance that created it
- void (*LM_GGML_CALL event_record) (lm_ggml_backend_event_t event);
- // wait for an event on on a different backend instance
- void (*LM_GGML_CALL event_wait) (lm_ggml_backend_t backend, lm_ggml_backend_event_t event);
- // block until an event is recorded
- void (*LM_GGML_CALL event_synchronize) (lm_ggml_backend_event_t event);
+ // record an event on this stream
+ void (*event_record)(lm_ggml_backend_t backend, lm_ggml_backend_event_t event);
+ // wait for an event on on a different stream
+ void (*event_wait) (lm_ggml_backend_t backend, lm_ggml_backend_event_t event);
  };

  struct lm_ggml_backend {
  lm_ggml_guid_t guid;
-
  struct lm_ggml_backend_i iface;
- lm_ggml_backend_context_t context;
+ lm_ggml_backend_dev_t device;
+ void * context;
  };

  struct lm_ggml_backend_event {
- lm_ggml_backend_t backend;
+ struct lm_ggml_backend_device * device;
+ void * context;
+ };
+
+ //
+ // Backend device
+ //
+
+ // Note: if additional properties are needed, we should add a struct with all of them
+ // the current functions to obtain the properties can remain, since they are more convenient for often used properties
+ struct lm_ggml_backend_device_i {
+ // device name: short identifier for this device, such as "CPU" or "CUDA0"
+ const char * (*get_name)(lm_ggml_backend_dev_t dev);
+
+ // device description: short informative description of the device, could be the model name
+ const char * (*get_description)(lm_ggml_backend_dev_t dev);
+
+ // device memory in bytes
+ void (*get_memory)(lm_ggml_backend_dev_t dev, size_t * free, size_t * total);
+
+ // device type
+ enum lm_ggml_backend_dev_type (*get_type)(lm_ggml_backend_dev_t dev);
+
+ // device properties
+ void (*get_props)(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props);
+
+ // backend (stream) initialization
+ lm_ggml_backend_t (*init_backend)(lm_ggml_backend_dev_t dev, const char * params);
+
+ // preferred buffer type
+ lm_ggml_backend_buffer_type_t (*get_buffer_type)(lm_ggml_backend_dev_t dev);
+
+ // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
+ lm_ggml_backend_buffer_type_t (*get_host_buffer_type)(lm_ggml_backend_dev_t dev);
+
+ // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
+ lm_ggml_backend_buffer_t (*buffer_from_host_ptr)(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
+
+ // check if the backend can compute an operation
+ bool (*supports_op)(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op);
+
+ // check if the backend can use tensors allocated in a buffer type
+ bool (*supports_buft)(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft);
+
+ // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
+ // these should be expensive operations that may benefit from running on this backend instead of the CPU backend
+ bool (*offload_op)(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op);
+
+ // (optional) event synchronization
+ lm_ggml_backend_event_t (*event_new) (lm_ggml_backend_dev_t dev);
+ void (*event_free) (lm_ggml_backend_dev_t dev, lm_ggml_backend_event_t event);
+ void (*event_synchronize) (lm_ggml_backend_dev_t dev, lm_ggml_backend_event_t event);
+ };
+
+ struct lm_ggml_backend_device {
+ struct lm_ggml_backend_device_i iface;
+ lm_ggml_backend_reg_t reg;
  void * context;
  };

  //
- // Backend registry
+ // Backend (reg)
  //

- typedef lm_ggml_backend_t (*LM_GGML_CALL lm_ggml_backend_init_fn)(const char * params, void * user_data);
+ struct lm_ggml_backend_reg_i {
+ const char * (*get_name)(lm_ggml_backend_reg_t reg);
+
+ // enumerate available devices
+ size_t (*get_device_count)(lm_ggml_backend_reg_t reg);
+ lm_ggml_backend_dev_t (*get_device)(lm_ggml_backend_reg_t reg, size_t index);
+
+ // (optional) get a pointer to a function in the backend
+ // backends can add custom functions that are not part of the standard ggml-backend interface
+ void * (*get_proc_address)(lm_ggml_backend_reg_t reg, const char * name);
+ };
+
+ struct lm_ggml_backend_reg {
+ // int api_version; // TODO: for dynamic loading
+ struct lm_ggml_backend_reg_i iface;
+ void * context;
+ };
+

- LM_GGML_CALL void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+ // Internal backend registry API
+ void lm_ggml_backend_register(lm_ggml_backend_reg_t reg);
+ void lm_ggml_backend_device_register(lm_ggml_backend_dev_t device);
+ // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
+ // typedef lm_ggml_backend_register_t * (*lm_ggml_backend_init)(void);

  #ifdef __cplusplus
  }