@fugood/llama.node 0.0.1-alpha.4 → 0.2.0
This diff shows the content of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- package/CMakeLists.txt +42 -7
- package/README.md +10 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.js +1 -1
- package/lib/binding.ts +16 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/DetokenizeWorker.cpp +22 -0
- package/src/DetokenizeWorker.h +19 -0
- package/src/EmbeddingWorker.cpp +46 -0
- package/src/EmbeddingWorker.h +23 -0
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +80 -1
- package/src/LlamaContext.h +3 -0
- package/src/TokenizeWorker.cpp +26 -0
- package/src/TokenizeWorker.h +23 -0
- package/src/common.hpp +12 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
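The new TokenizeWorker, DetokenizeWorker, and EmbeddingWorker sources, together with the lib/binding.ts changes, indicate that 0.2.0 adds tokenize, detokenize, and embedding support to the binding. A minimal usage sketch follows, assuming promise-returning methods named after the workers and the package's loadModel entry point; the exact method names and result shapes are assumptions and should be checked against lib/binding.ts:

```ts
// Hypothetical usage of the APIs implied by the new worker files.
// Method names and result shapes are assumptions, not confirmed signatures.
import { loadModel } from "@fugood/llama.node";

async function demo() {
  const context = await loadModel({ model: "./model.gguf" }); // placeholder model path

  const { tokens } = await context.tokenize("Hello world");   // backed by TokenizeWorker
  const text = await context.detokenize(tokens);              // backed by DetokenizeWorker
  const { embedding } = await context.embedding(text);        // backed by EmbeddingWorker

  console.log(tokens.length, "tokens;", embedding.length, "dims;", text);
}

demo().catch(console.error);
```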
package/src/llama.cpp/CMakeLists.txt

@@ -43,11 +43,7 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()
 
-if (WIN32)
-    set(LLAMA_LLAMAFILE_DEFAULT OFF)
-else()
-    set(LLAMA_LLAMAFILE_DEFAULT ON)
-endif()
+set(LLAMA_LLAMAFILE_DEFAULT ON)
 
 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)
@@ -107,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
+option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
+
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@@ -407,12 +405,16 @@ if (LLAMA_CUDA)
         list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
 
         add_compile_definitions(GGML_USE_CUDA)
+        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
         if (LLAMA_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
         endif()
         if (LLAMA_CUDA_FORCE_MMQ)
             add_compile_definitions(GGML_CUDA_FORCE_MMQ)
         endif()
+        if (LLAMA_CUDA_NO_VMM)
+            add_compile_definitions(GGML_CUDA_NO_VMM)
+        endif()
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -429,7 +431,7 @@ if (LLAMA_CUDA)
 
         if (LLAMA_STATIC)
             if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
                 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
             else ()
                 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -438,7 +440,11 @@ if (LLAMA_CUDA)
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
         endif()
 
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+        if (LLAMA_CUDA_NO_VMM)
+            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+        else()
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+        endif()
 
         if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
             # 52 == lowest CUDA 12 standard