@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/ggml/src/ggml-blas.cpp

@@ -1,3 +1,4 @@
+#include "ggml-impl.h"
 #include "ggml-blas.h"
 #include "ggml-backend-impl.h"
 
@@ -234,25 +235,25 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
 
 // backend interface
 
-GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
+static const char * ggml_backend_blas_name(ggml_backend_t backend) {
     return "BLAS";
 
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
+static void ggml_backend_blas_free(ggml_backend_t backend) {
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
     delete ctx;
     delete backend;
 }
 
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
+static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_cpu_buffer_type();
 
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -284,7 +285,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     const struct ggml_tensor * src0 = op->src[0];
     const struct ggml_tensor * src1 = op->src[1];
 
@@ -299,7 +300,7 @@ GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, cons
     GGML_UNUSED(backend);
 }
 
-GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
     return ggml_backend_buft_is_host(buft);
 
     GGML_UNUSED(backend);
@@ -321,11 +322,8 @@ static struct ggml_backend_i blas_backend_i = {
     /* .supports_op         = */ ggml_backend_blas_supports_op,
     /* .supports_buft       = */ ggml_backend_blas_supports_buft,
     /* .offload_op          = */ NULL,
-    /* .event_new           = */ NULL,
-    /* .event_free          = */ NULL,
     /* .event_record        = */ NULL,
     /* .event_wait          = */ NULL,
-    /* .event_synchronize   = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_blas_guid(void) {
@@ -339,6 +337,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
     ggml_backend_t backend = new ggml_backend {
         /* .guid      = */ ggml_backend_blas_guid(),
         /* .interface = */ blas_backend_i,
+        /* .device    = */ nullptr,
         /* .context   = */ ctx,
     };
 
@@ -355,7 +354,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
     return backend;
 }
 
-GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
+bool ggml_backend_is_blas(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
 }
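
The ggml-blas.cpp hunks above drop the GGML_CALL annotations, remove the retired event hooks from the backend interface table, and add a /* .device */ field to the backend struct. As a rough, hedged sketch only (not part of the package), this is how the two public entry points shown in this file are typically exercised; the program name and message strings are illustrative, and it assumes the ggml-backend.h / ggml-blas.h headers from this source tree are on the include path:

    // blas_backend_check.cpp - hypothetical smoke test, not shipped with the package
    #include "ggml-backend.h"
    #include "ggml-blas.h"
    #include <cstdio>

    int main() {
        // ggml_backend_blas_init() builds the backend struct shown in the hunk above.
        ggml_backend_t backend = ggml_backend_blas_init();
        if (backend == NULL) {
            std::fprintf(stderr, "BLAS backend unavailable\n");
            return 1;
        }

        // ggml_backend_is_blas() compares against the GUID installed by the init call.
        std::printf("is BLAS backend: %s\n", ggml_backend_is_blas(backend) ? "yes" : "no");

        // ggml_backend_free() dispatches to ggml_backend_blas_free() through the interface table.
        ggml_backend_free(backend);
        return 0;
    }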
package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp

@@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
             return ACL_INT16;
         case GGML_TYPE_I32:
             return ACL_INT32;
+        case GGML_TYPE_Q4_0:
+            return ACL_INT4;
+        case GGML_TYPE_Q8_0:
+            return ACL_INT8;
         default:
             return ACL_DT_UNDEFINED;
     }
@@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
     return false;
 }
 
-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   size_t type_size, int64_t* ne, size_t* nb,
-                                   int64_t dims, aclFormat format,
-                                   size_t offset) {
-    int64_t tmp_ne[GGML_MAX_DIMS * 2];
-    int64_t tmp_stride[GGML_MAX_DIMS * 2];
-
-    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
-    for (int i = 0; i < dims; i++) {
-        tmp_stride[i] = nb[i] / type_size;
-    }
-
-    std::reverse(tmp_ne, tmp_ne + dims);
-    std::reverse(tmp_stride, tmp_stride + dims);
-
-    int64_t acl_storage_len = 0;
-    for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
-    }
-
-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
-
-    return acl_tensor;
-}
-
 int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
                                   const ggml_tensor* src1,
                                   int64_t* bcast_src0_ne,
package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h

@@ -23,6 +23,9 @@
 #ifndef CANN_ACL_TENSOR_H
 #define CANN_ACL_TENSOR_H
 
+#include <algorithm>
+#include <cstring>
+
 #include <aclnn/aclnn_base.h>
 #include "common.h"
 
@@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
                                    size_t offset = 0);
 
 /**
- * @brief
+ * @brief Template for creating an ACL tensor from provided parameters. typename TYPE
+ * should be size_t or float.
  *
  * @details This function creates an ACL tensor using the provided data pointer,
  * data type, dimensions, strides, format, offset, and additional parameters.
@@ -83,10 +87,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
  * @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
  * @return Pointer to the created ACL tensor.
  */
+template<typename TYPE>
 aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   size_t type_size, int64_t* ne, size_t* nb,
-                                   int64_t dims, aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0);
+                                   TYPE type_size, int64_t* ne, TYPE* nb,
+                                   int64_t dims,
+                                   aclFormat format = ACL_FORMAT_ND,
+                                   size_t offset = 0) {
+    int64_t tmp_ne[GGML_MAX_DIMS * 2];
+    int64_t tmp_stride[GGML_MAX_DIMS * 2];
+
+    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
+    for (int i = 0; i < dims; i++) {
+        tmp_stride[i] = nb[i] / type_size;
+    }
+
+    std::reverse(tmp_ne, tmp_ne + dims);
+    std::reverse(tmp_stride, tmp_stride + dims);
+
+    int64_t acl_storage_len = 0;
+    for (int i = 0; i < dims; i++) {
+        acl_storage_len += (ne[i] - 1) * nb[i];
+    }
+
+    aclTensor* acl_tensor =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
+                        format, &acl_storage_len, 1, data_ptr);
+
+    return acl_tensor;
+}
 
 /**
  * @brief Checks if tensors require broadcasting based on their shapes.
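
The header change above replaces the pointer-based ggml_cann_create_tensor declaration with an inline template, so callers can pass strides either as size_t (regular element types) or as float (sub-byte Q4_0 elements). A minimal, hedged usage sketch; the helper function and device buffer below are hypothetical, and it assumes the CANN/ACL toolkit headers that acl_tensor.h already depends on are available:

    #include "acl_tensor.h"
    #include <cstdint>

    // Hypothetical helper: wrap a contiguous 4x4 fp16 matrix that already lives
    // in device memory as an aclTensor. TYPE is deduced as size_t here; the
    // quantized mul_mat path in aclnn_ops.cpp instead passes float strides for Q4_0 data.
    aclTensor* wrap_fp16_4x4(void* device_buf) {
        int64_t ne[] = {4, 4};                                    // elements per dimension
        size_t  nb[] = {sizeof(uint16_t), 4 * sizeof(uint16_t)};  // byte strides per dimension
        return ggml_cann_create_tensor(device_buf, ACL_FLOAT16, sizeof(uint16_t),
                                       ne, nb, 2);
    }

The returned tensor is released with aclDestroyTensor(), as in the hunks above.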
package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp

@@ -464,9 +464,11 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src = ggml_cann_create_tensor(src);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
-    const float eps = 1e-6f; // TODO: make this a parameter
     int n_groups = dst->op_params[0];
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
@@ -910,6 +912,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->ne);
             return;
         }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
         if (dst->type == GGML_TYPE_F16) {
             if (ggml_are_same_shape(src, dst)) {
                 cann_copy(ctx, acl_src, acl_dst);
@@ -971,6 +980,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->ne);
             return;
         }
+        if (dst->type == GGML_TYPE_Q4_0) {
+            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
+                24, ctx.stream(), src->data, dst->data,
+                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+                ((ggml_tensor*)dst->extra)->ne);
+            return;
+        }
         if (dst->type == GGML_TYPE_F32) {
             if (ggml_are_same_shape(src, dst)) {
                 cann_copy(ctx, acl_src, acl_dst);
@@ -1312,6 +1328,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
 #ifdef __cplusplus
 }
 #endif
+
+static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
+                                             ggml_tensor* dst,
+                                             ggml_tensor* src1,
+                                             aclTensor* tmp_cast_tensor,
+                                             aclTensor* tmp_im2col_tensor) {
+    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
+    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
+    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+static void ggml_cann_im2col_1d_post_process(
+    ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
+    aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
+    const std::vector<int64_t>& im2col_op_params) {
+    // get params
+    const int64_t KH = im2col_op_params[0];
+    const int64_t KW = im2col_op_params[1];
+    const int64_t IW = im2col_op_params[2];
+    const int64_t IC = im2col_op_params[3];
+    const int64_t N = im2col_op_params[4];
+    const int64_t OH = im2col_op_params[5];
+    const int64_t OW = im2col_op_params[6];
+    const int64_t s0 = im2col_op_params[7];
+    const int64_t p0 = im2col_op_params[8];
+    const int64_t d0 = im2col_op_params[9];
+    const int64_t n_bytes_factor = im2col_op_params[10];
+
+    // Permute: [N, IC * KH * KW, OW * OH] ->
+    // [N, OW * OH * n_bytes_factor, IC * KH * KW]
+    aclTensor* tmp_permute_tensor = nullptr;
+    ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
+    tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+    void* tmp_permute_buffer = tmp_permute_allocator.get();
+
+    int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
+    size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
+    tmp_permute_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+        tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
+    }
+
+    tmp_permute_tensor = ggml_cann_create_tensor(
+        tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
+        GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
+                      3);
+    }
+
+    // number of times the kernel moves in W dimension
+    const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
+    size_t offset;
+    void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
+
+    // memory copy with offset to restore 1D im2col from 2d
+    if (IC > 1) {
+        offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
+        size_t size_cpy = KH * KW * ggml_type_size(dst->type);
+
+        for (int c = 0; c < IC; c++) {
+            cur_permute_buffer = (char*)tmp_permute_buffer + offset +
+                                 KH * KW * c * ggml_type_size(dst->type);
+            cur_dst_buffer = (char*)dst->data +
+                             c * KH * KW * n_step_w * ggml_type_size(dst->type);
+
+            for (int i = 0; i < n_step_w; i++) {
+                ACL_CHECK(aclrtMemcpyAsync(
+                    cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                cur_dst_buffer =
+                    (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
+                cur_permute_buffer = (char*)cur_permute_buffer +
+                                     KH * KW * IC * ggml_type_size(dst->type);
+            }
+        }
+    } else {
+        offset = KH * KW * n_step_w *
+                 ggml_type_size(dst->type);  // equal to ggml_nbytes(dst)
+        ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
+                                   (char*)tmp_permute_buffer + offset, offset,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
+}
+
 void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
@@ -1320,21 +1441,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
 
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
+    // im2col and do post-processing to restore it to 1D.
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
     const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
     const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+    const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
 
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int64_t N = is_2D ? ne13 : ne12;
-    const int64_t IC = is_2D ? ne12 : ne11;
-
-    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t N = ne13;
+    const int64_t IC = ne12;
+    const int64_t KH = ne01;
     const int64_t KW = ne00;
+    const int64_t IW = ne10;
 
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
@@ -1342,9 +1465,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));
 
-    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
+    // memory allocated increased to 3x when is_2D == false
+    const int64_t n_bytes_factor = is_2D ? 1 : 3;
+
+    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
-    int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
+    int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
     size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
 
     tmp_im2col_nb[0] = ggml_type_size(src1->type);
@@ -1356,8 +1482,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
     // dst.elemcount.
     ggml_cann_pool_alloc im2col_allocator(
-        ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1));
+        ctx.pool(),
+        ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
     void* tmp_im2col_buffer = im2col_allocator.get();
+
     aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
         tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
         ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
@@ -1380,8 +1508,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                   paddings, strides, tmp_im2col_tensor,
                                   &workspaceSize, &executor));
 
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
     if (workspaceSize > 0) {
-        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspace_allocator.alloc(workspaceSize);
         workspaceAddr = workspace_allocator.get();
     }
 
@@ -1391,9 +1520,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // Cast if dst is f16.
     aclTensor* tmp_cast_tensor = nullptr;
     ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
+    void* tmp_cast_buffer = nullptr;
     if (src1->type != dst->type) {
-        tmp_cast_allocator.alloc(ggml_nbytes(dst));
-        void* tmp_cast_buffer = tmp_cast_allocator.get();
+        tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+        tmp_cast_buffer = tmp_cast_allocator.get();
         size_t temp_cast_nb[GGML_MAX_DIMS - 1];
         temp_cast_nb[0] = ggml_type_size(dst->type);
         for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
@@ -1408,24 +1538,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                 ggml_cann_type_mapping(dst->type));
     }
 
-    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
-    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
-    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
-    aclTensor* acl_dst =
-        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
-
-    int64_t permute_dim[] = {0, 2, 1};
-    if (src1->type != dst->type) {
-        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+    // post-processing
+    if (is_2D) {
+        ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor);
     } else {
-        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+        std::vector<int64_t> im2col_op_params = {
+            KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
+        ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor, im2col_op_params);
     }
 
     // release
     ACL_CHECK(aclDestroyTensor(acl_src1));
     ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
     ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_dst));
     ACL_CHECK(aclDestroyIntArray(kernel_size));
     ACL_CHECK(aclDestroyIntArray(dilations));
     ACL_CHECK(aclDestroyIntArray(paddings));
@@ -2352,21 +2479,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
  * @param dst The destination tensor where the result of the matrix
  * multiplication will be stored.
  */
-static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
-                                   ggml_tensor* dst) {
+static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
+                                    ggml_tensor* dst,
+                                    const enum ggml_type type) {
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
     // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
     // is regarded as batch. weight need transpose.
     int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
-    size_t weight_elem_size = sizeof(uint8_t);
-    size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+    float weight_elem_size;
+    if (type == GGML_TYPE_Q4_0) {
+        weight_elem_size = float(sizeof(uint8_t)) / 2;
+    }
+    else if (type == GGML_TYPE_Q8_0) {
+        weight_elem_size = float(sizeof(uint8_t));
+    }
+    else {
+        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+    }
+    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+
     // size of one matrix is element_size * height * width.
     size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
     size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
 
     // scale stored at the end of weight. Also need transpose.
+    GGML_ASSERT(QK4_0 == QK8_0);
     int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
     size_t scale_elem_size = sizeof(uint16_t);
     size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
@@ -2381,10 +2520,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
     size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
     size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
 
+    ggml_cann_pool_alloc input_alloctor(ctx.pool());
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        ggml_cann_pool_alloc input_alloctor(
-            ctx.pool(), ggml_nelements(src1) * input_elem_size);
+        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
         input_buffer = input_alloctor.get();
 
         int64_t* input_cast_ne = src1->ne;
@@ -2430,8 +2569,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
                 (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                 input_elem_size, input_ne, input_nb, 2);
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
-                (char*)src0->data + batch0 * weight_stride, ACL_INT8,
-                weight_elem_size, weight_ne, weight_nb, 2);
+                (char*)src0->data + batch0 * weight_stride,
+                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+                weight_nb, 2);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
                 scale_offset + batch0 * scale_stride, ACL_FLOAT16,
                 scale_elem_size, scale_ne, scale_nb, 2);
@@ -2485,11 +2625,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         case GGML_TYPE_F16:
            ggml_cann_mat_mul_fp(ctx, dst);
            break;
-        // case GGML_TYPE_Q4_0:
-        //     ggml_cann_mul_mat_q4_0(ctx, dst);
-        //     break;
+        case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
-            ggml_cann_mul_mat_q8_0(ctx, dst);
+            ggml_cann_mul_mat_quant(ctx, dst, type);
             break;
         default:
             GGML_ABORT("fatal error");
@@ -2743,7 +2881,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
                              beta_slow, corr_dims);
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
     // init cos/sin cache
     ggml_cann_pool_alloc sin_allocator(
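
One detail worth noting in the ggml_cann_mul_mat_quant hunk above is that weight_elem_size is now fractional: 0.5 bytes per element for Q4_0 and 1 byte for Q8_0, with the fp16 scales stored separately after the packed weights. A small, self-contained arithmetic check of that sizing, using hypothetical 4096x4096 dimensions that are not taken from this package:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical weight matrix dimensions (src0->ne[0] x src0->ne[1]).
        const int64_t ne0 = 4096, ne1 = 4096;

        // Per-element sizes used by the diff: Q4_0 packs two elements per byte.
        const float q4_0_elem_size = 0.5f;
        const float q8_0_elem_size = 1.0f;

        // Mirrors: weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
        const size_t q4_0_stride = (size_t)(q4_0_elem_size * ne0 * ne1);
        const size_t q8_0_stride = (size_t)(q8_0_elem_size * ne0 * ne1);

        std::printf("Q4_0 matrix bytes: %zu\n", q4_0_stride);  // 8388608
        std::printf("Q8_0 matrix bytes: %zu\n", q8_0_stride);  // 16777216
        return 0;
    }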
package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt

@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
     get_row_q8_0.cpp
     quantize_f32_q8_0.cpp
     quantize_f16_q8_0.cpp
+    quantize_float_to_q4_0.cpp
     dup.cpp
 )
 
@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
-#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
+# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h

@@ -8,6 +8,8 @@
 
 #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
 #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
+#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
+#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
 
 #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"