whispercpp 1.2.0.2 → 1.3.1

Files changed (135)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
data/ext/extconf.rb CHANGED
@@ -1,13 +1,10 @@
 require 'mkmf'
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
-
 
 # need to use c++ compiler flags
-$CXXFLAGS << ' -std=c++11'
+$CXXFLAGS << ' -std=c++17'
+
+$LDFLAGS << ' -lstdc++'
+
 # Set to true when building binary gems
 if enable_config('static-stdlib', false)
   $LDFLAGS << ' -static-libgcc -static-libstdc++'
@@ -18,4 +15,185 @@ if enable_config('march-tune-native', false)
   $CXXFLAGS << ' -march=native -mtune=native'
 end
 
+if ENV['WHISPER_METAL']
+  $GGML_METAL ||= true
+  $DEPRECATE_WARNING ||= true
+end
+
+$UNAME_S = `uname -s`.chomp
+$UNAME_P = `uname -p`.chomp
+$UNAME_M = `uname -m`.chomp
+
+if $UNAME_S == 'Darwin'
+  unless ENV['GGML_NO_METAL']
+    $GGML_METAL ||= true
+  end
+  $GGML_NO_OPENMP ||= true
+end
+
+if $GGML_METAL
+  $GGML_METAL_EMBED_LIBRARY = true
+end
+
+$MK_CPPFLAGS = '-Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -Iexamples'
+$MK_CFLAGS = '-std=c11 -fPIC'
+$MK_CXXFLAGS = '-std=c++17 -fPIC'
+$MK_NVCCFLAGS = '-std=c++17'
+$MK_LDFLAGS = ''
+
+$OBJ_GGML = []
+$OBJ_WHISPER = []
+$OBJ_COMMON = []
+$OBJ_SDL = []
+
+$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
+
+if $UNAME_S == 'Linux'
+  $MK_CPPFLAGS << ' -D_GNU_SOURCE'
+end
+
+if $UNAME_S == 'Darwin'
+  $MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
+end
+
+if ENV['WHISPER_DEBUG']
+  $MK_CFLAGS << ' -O0 -g'
+  $MK_CXXFLAGS << ' -O0 -g'
+  $MK_LDFLAGS << ' -g'
+  $MK_NVCCFLAGS << ' -O0 -g'
+else
+  $MK_CPPFLAGS << ' -DNDEBUG'
+  $MK_CFLAGS << ' -O3'
+  $MK_CXXFLAGS << ' -O3'
+  $MK_NVCCFLAGS << ' -O3'
+end
+
+$WARN_FLAGS =
+  ' -Wall' <<
+  ' -Wextra' <<
+  ' -Wpedantic' <<
+  ' -Wcast-qual' <<
+  ' -Wno-unused-function'
+
+$MK_CFLAGS <<
+  $WARN_FLAGS <<
+  ' -Wshadow' <<
+  ' -Wstrict-prototypes' <<
+  ' -Wpointer-arith' <<
+  ' -Wmissing-prototypes' <<
+  ' -Werror=implicit-int' <<
+  ' -Werror=implicit-function-declaration'
+
+$MK_CXXFLAGS <<
+  $WARN_FLAGS <<
+  ' -Wmissing-declarations' <<
+  ' -Wmissing-noreturn'
+
+unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
+  $MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
+end
+
+if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
+  $MK_CFLAGS << ' -pthread'
+  $MK_CXXFLAGS << ' -pthread'
+end
+
+unless $_WIN32
+  $DSO_EXT = '.so'
+else
+  $DSO_EXT = '.dll'
+end
+
+unless ENV['RISCV']
+  if %w[x86_64 i686 amd64].include? $UNAME_M
+    $HOST_CXXFLAGS ||= ''
+
+    $MK_CFLAGS << ' -march=native -mtune=native'
+    $HOST_CXXFLAGS << ' -march=native -mtune=native'
+  end
+else
+  $MK_CFLAGS << ' -march=rv64gcv -mabi=lp64d'
+  $MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
+end
+
+unless ENV['GGML_NO_ACCELERATE']
+  if $UNAME_S == 'Darwin'
+    $MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE'
+    $MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
+    $MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
+    $MK_LDFLAGS << ' -framework Accelerate'
+    $OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
+  end
+end
+
+if ENV['GGML_OPENBLAS']
+  $MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
+  $MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas`.chomp}"
+  $MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
+  $OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
+end
+
+if ENV['GGML_OPENBLAS64']
+  $MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
+  $MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64`.chomp}"
+  $MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
+  $OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
+end
+
+if $GGML_METAL
+  $MK_CPPFLAGS << ' -DGGML_USE_METAL'
+  $MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
+  $OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal.o'
+
+  if ENV['GGML_METAL_NDEBUG']
+    $MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
+  end
+
+  if $GGML_METAL_EMBED_LIBRARY
+    $MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
+    $OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal-embed.o'
+  end
+end
+
+$OBJ_GGML <<
+  'ggml/src/ggml.o' <<
+  'ggml/src/ggml-alloc.o' <<
+  'ggml/src/ggml-backend.o' <<
+  'ggml/src/ggml-backend-reg.o' <<
+  'ggml/src/ggml-opt.o' <<
+  'ggml/src/ggml-quants.o' <<
+  'ggml/src/ggml-threading.o' <<
+  'ggml/src/ggml-cpu/ggml-cpu.o' <<
+  'ggml/src/ggml-cpu/ggml-cpu-cpp.o' <<
+  'ggml/src/ggml-cpu/ggml-cpu-aarch64.o' <<
+  'ggml/src/ggml-cpu/ggml-cpu-hbm.o' <<
+  'ggml/src/ggml-cpu/ggml-cpu-quants.o' <<
+  'ggml/src/ggml-cpu/ggml-cpu-traits.o'
+
+$OBJ_WHISPER <<
+  'src/whisper.o'
+
+$objs = $OBJ_GGML + $OBJ_WHISPER + $OBJ_COMMON + $OBJ_SDL
+$objs << "ruby_whisper.o"
+
+$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
+$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
+$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
+$CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
+$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
+$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
+
 create_makefile('whisper')
+
+File.open 'Makefile', 'a' do |file|
+  file.puts 'include scripts/get-flags.mk'
+  file.puts 'include cpu.mk'
+
+  if $GGML_METAL
+    file.puts 'include metal.mk'
+
+    if $GGML_METAL_EMBED_LIBRARY
+      file.puts 'include metal-embed.mk'
+    end
+  end
+end
data/ext/ggml/include/ggml-alloc.h ADDED
@@ -0,0 +1,76 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;
+
+// Tensor allocator
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
+
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
+
+// Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+
+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten
+
+typedef struct ggml_gallocr * ggml_gallocr_t;
+
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
+
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+//   call with a worst-case graph to avoid buffer reallocations
+//   not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+//   returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
+
+// automatic reallocation if the topology changes when using a single buffer
+//   returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
+
+#ifdef __cplusplus
+}
+#endif
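
Editor's note: the example in the header comment above leans on a pre-existing build_graph helper. For orientation, here is a minimal, self-contained C sketch of the same flow — not part of the gem, assuming only the ggml headers vendored in this release and the CPU backend; the shapes, values, and mem_size arithmetic are illustrative:

// Sketch: allocate a tiny one-op graph with ggml_gallocr and run it on the CPU backend.
#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"

int main(void) {
    // context that holds only tensor/graph metadata; tensor data lives in a backend buffer
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // defer data allocation to the graph allocator
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, c);

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_gallocr_t galloc  = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_alloc_graph(galloc, graph); // places a, b, c in one compute buffer

    float av[4] = {1, 2, 3, 4}, bv[4] = {10, 20, 30, 40}, cv[4];
    ggml_backend_tensor_set(a, av, 0, sizeof(av));
    ggml_backend_tensor_set(b, bv, 0, sizeof(bv));

    ggml_backend_graph_compute(backend, graph);
    ggml_backend_tensor_get(c, cv, 0, sizeof(cv));
    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
    printf("c[0] = %f\n", cv[0]); // 11.0

    ggml_gallocr_free(galloc);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}

The key design point is no_alloc = true: the ggml_context holds only metadata, while ggml_gallocr packs the actual tensor data into a single backend compute buffer.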
data/ext/ggml/include/ggml-backend.h ADDED
@@ -0,0 +1,352 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend_event * ggml_backend_event_t;
+typedef struct ggml_backend * ggml_backend_t;
+typedef void * ggml_backend_graph_plan_t;
+typedef struct ggml_backend_reg * ggml_backend_reg_t;
+typedef struct ggml_backend_device * ggml_backend_dev_t;
+
+
+//
+// Backend buffer type
+//
+
+GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
+GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
+GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
+GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
+GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
+
+//
+// Backend buffer
+//
+
+enum ggml_backend_buffer_usage {
+    GGML_BACKEND_BUFFER_USAGE_ANY     = 0,
+    GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+    GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
+};
+
+GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+GGML_API void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
+GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+
+// tensor copy between different backends
+GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+//
+// Backend (stream)
+//
+
+GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
+GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+GGML_API void         ggml_backend_free(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
+
+GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+// "offset" refers to the offset in tensor->data for setting/getting data
+GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
+
+GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+// NOTE: will be removed, use device version instead
+GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
+
+// asynchronous copy
+// the copy is performed after all the currently queued operations in backend_src
+// backend_dst will wait for the copy to complete before performing other operations
+// automatic fallback to sync copy if async is not supported
+GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
+
+//
+// Events
+//
+
+GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
+GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
+GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
+GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
+
+//
+// Backend device
+//
+
+enum ggml_backend_dev_type {
+    // CPU device using system memory
+    GGML_BACKEND_DEVICE_TYPE_CPU,
+    // GPU device using dedicated memory
+    GGML_BACKEND_DEVICE_TYPE_GPU,
+    // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+    GGML_BACKEND_DEVICE_TYPE_ACCEL
+};
+
+// functionality supported by the device
+struct ggml_backend_dev_caps {
+    // asynchronous operations
+    bool async;
+    // pinned host buffer
+    bool host_buffer;
+    // creating buffers from host ptr
+    bool buffer_from_host_ptr;
+    // event synchronization
+    bool events;
+};
+
+// all the device properties
+struct ggml_backend_dev_props {
+    const char * name;
+    const char * description;
+    size_t memory_free;
+    size_t memory_total;
+    enum ggml_backend_dev_type type;
+    struct ggml_backend_dev_caps caps;
+};
+
+GGML_API const char *               ggml_backend_dev_name(ggml_backend_dev_t device);
+GGML_API const char *               ggml_backend_dev_description(ggml_backend_dev_t device);
+GGML_API void                       ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
+GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
+GGML_API void                       ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+GGML_API ggml_backend_reg_t         ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+GGML_API ggml_backend_t             ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+GGML_API ggml_backend_buffer_t      ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+
+GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+
+//
+// Backend (reg)
+//
+
+GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
+GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+
+// Common functions that may be obtained using ggml_backend_reg_get_proc_address
+
+// Split buffer type for tensor parallelism
+typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+// Set the number of threads for the backend
+typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+// Get additional buffer types provided by the device (returns a NULL-terminated array)
+typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+// Set the abort callback for the backend
+typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+struct ggml_backend_feature {
+    const char * name;
+    const char * value;
+};
+typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
+
+//
+// Backend registry
+//
+
+// Backend (reg) enumeration
+GGML_API size_t             ggml_backend_reg_count(void);
+GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
+
+// Device enumeration
+GGML_API size_t             ggml_backend_dev_count(void);
+GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
+
+// Direct backend (stream) initialization
+// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
+GGML_API ggml_backend_t ggml_backend_init_best(void);
+
+// Load a backend from a dynamic library and register it
+GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+// Unload a backend if loaded dynamically and unregister it
+GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
+// Load all known backends from dynamic libraries
+GGML_API void ggml_backend_load_all(void);
+GGML_API void ggml_backend_load_all_from_path(const char * dir_path);
+
+//
+// Backend scheduler
+//
+
+// The backend scheduler allows for multiple backend devices to be used together
+// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+// The backends are selected based on:
+// - the backend that supports the operation
+// - the location of the pre-allocated tensors (e.g. the weights)
+/*
+  Example usage:
+
+    // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
+    // preferably to run on the same backend as the buffer
+    ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+    sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+
+    // initialize buffers from a max size graph (optional)
+    reserve_graph = build_graph(sched, max_batch_size);
+
+    // manually assign nodes to a backend (optional, should not be needed in most cases)
+    struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+    ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
+
+    ggml_backend_sched_reserve(sched, reserve_graph);
+
+    // compute
+    graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+    for (int i = 0; i < 10; ++i) {
+        ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+    }
+
+    // if there are graph inputs:
+    graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
+    ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+    ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+    ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+    ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+
+    // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+    // allocate them statically via ggml_backend_alloc_ctx_tensors
+*/
+
+typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+// Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
+// when ask == true, the scheduler wants to know if the user wants to observe this node
+// this allows the scheduler to batch nodes together in order to evaluate them in a single call
+//
+// when ask == false, the scheduler is passing the node tensor to the user for observation
+// if the user returns false, the scheduler will cancel the graph compute
+//
+typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
+// Initialize a backend scheduler, backends with low index are given priority over backends with high index
+GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+// Initialize backend buffers from a measure graph
+GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
+
+GGML_API int            ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
+// Get the number of splits of the last graph
+GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
+
+GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
+GGML_API void           ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+
+// Allocate and compute graph on the backend scheduler
+GGML_API bool             ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
+GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+GGML_API void             ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
+
+// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+// The correct way to use this API is to discard the deallocated tensors and create new ones.
+GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
+
+// Set a callback to be called for each resulting node during graph compute
+GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
+//
+// Utils
+//
+
+struct ggml_backend_graph_copy {
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx_allocated;
+    struct ggml_context * ctx_unallocated;
+    struct ggml_cgraph * graph;
+};
+
+// Copy a graph to a different backend
+GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+// Compare the output of two backends
+GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+// Tensor initialization
+GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
+
+// CPU buffer types are always available
+GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+#ifdef __cplusplus
}
+#endif
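
Editor's note: most of this header is the device/registry API that whisper.cpp uses to pick a backend at runtime. As an illustration (not part of the gem; assumes a build where at least the static CPU backend is registered), here is a sketch that lists the registered devices and initializes the best available one:

// Sketch: enumerate registered backend devices, then pick GPU if present, else CPU.
#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    ggml_backend_load_all(); // load any dynamically built backends; a no-op for static builds

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        size_t mem_free, mem_total;
        ggml_backend_dev_memory(dev, &mem_free, &mem_total);
        printf("device %zu: %s (%s), %zu/%zu MB free\n",
               i,
               ggml_backend_dev_name(dev),
               ggml_backend_dev_description(dev),
               mem_free / 1024 / 1024, mem_total / 1024 / 1024);
    }

    // per the comment above: GPU device if one is registered, otherwise CPU
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == NULL) {
        fprintf(stderr, "no backend available\n");
        return 1;
    }
    printf("selected: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}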
data/ext/ggml/include/ggml-blas.h ADDED
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
+
+// number of threads used for conversion to float
+// for openblas and blis, this will also set the number of threads used for blas operations
+GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
+
+
+#ifdef __cplusplus
+}
+#endif
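
Editor's note: a short sketch of how this backend is typically brought up (not part of the gem; assumes a build compiled with BLAS support, e.g. via the GGML_OPENBLAS path in extconf.rb above):

// Sketch: initialize the BLAS backend and bound its thread count.
#include <stdio.h>
#include "ggml-blas.h"

int main(void) {
    ggml_backend_t blas = ggml_backend_blas_init();
    if (blas == NULL) {
        fprintf(stderr, "BLAS backend unavailable in this build\n");
        return 1;
    }
    printf("is_blas: %d\n", ggml_backend_is_blas(blas));
    // per the header comment: for OpenBLAS/BLIS this also sets the
    // thread count used by the BLAS operations themselves
    ggml_backend_blas_set_n_threads(blas, 4);
    ggml_backend_free(blas); // from ggml-backend.h, included above
    return 0;
}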