@fugood/llama.node 0.3.2 → 0.3.3

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
package/src/llama.cpp/common/common.h

@@ -24,12 +24,12 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

-struct llama_lora_adapter_info {
+struct common_lora_adapter_info {
     std::string path;
     float scale;
 };

-struct llama_lora_adapter_container : llama_lora_adapter_info {
+struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };

@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;

-struct llama_control_vector_load_info;
+struct common_control_vector_load_info;

 //
 // CPU utils
@@ -82,14 +82,17 @@ enum llama_example {
     LLAMA_EXAMPLE_COUNT,
 };

-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE        = 0,
+    COMMON_SAMPLER_TYPE_DRY         = 1,
+    COMMON_SAMPLER_TYPE_TOP_K       = 2,
+    COMMON_SAMPLER_TYPE_TOP_P       = 3,
+    COMMON_SAMPLER_TYPE_MIN_P       = 4,
+  //COMMON_SAMPLER_TYPE_TFS_Z       = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC         = 8,
+    COMMON_SAMPLER_TYPE_INFILL      = 9,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -99,38 +102,47 @@ enum dimre_method {
 };

 // sampler parameters
-struct gpt_sampler_params {
+struct common_sampler_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
-    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range    = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.00f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = false; // consider newlines as a repeatable token
-    bool    ignore_eos        = false;
-    bool    no_perf           = false; // disable performance metrics
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE
+    int32_t n_prev             = 64;    // number of previous tokens to remember
+    int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k              = 40;    // <= 0 to use vocab size
+    float   top_p              = 0.95f; // 1.0 = disabled
+    float   min_p              = 0.05f; // 0.0 = disabled
+    float   xtc_probability    = 0.00f; // 0.0 = disabled
+    float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
+    float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
+    float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range     = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat     = 1.00f; // 1.0 = disabled
+    float   penalty_freq       = 0.00f; // 0.0 = disabled
+    float   penalty_present    = 0.00f; // 0.0 = disabled
+    float   dry_multiplier     = 0.0f;  // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
+    float   dry_base           = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau       = 5.00f; // target entropy
+    float   mirostat_eta       = 0.10f; // learning rate
+    bool    penalize_nl        = false; // consider newlines as a repeatable token
+    bool    ignore_eos         = false;
+    bool    no_perf            = false; // disable performance metrics
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY
+
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
     };

     std::string grammar; // optional BNF-like grammar to constrain sampling
@@ -141,9 +153,9 @@ struct gpt_sampler_params {
     std::string print() const;
 };

-struct gpt_params {
+struct common_params {
     int32_t n_predict             =    -1; // new tokens to predict
-    int32_t n_ctx                 =     0; // context size
+    int32_t n_ctx                 =  4096; // context size
     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                =     0; // number of tokens to keep from initial prompt
@@ -166,7 +178,7 @@ struct gpt_params {
     float   yarn_beta_fast        =  32.0f; // YaRN low correction dim
     float   yarn_beta_slow        =   1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx         =      0; // YaRN original context length
-    float   defrag_thold          =  -1.0f; // KV cache defragmentation threshold
+    float   defrag_thold          =   0.1f; // KV cache defragmentation threshold

     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -183,7 +195,7 @@ struct gpt_params {
     enum llama_pooling_type pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type  = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct gpt_sampler_params sparams;
+    struct common_sampler_params sparams;

     std::string model                = ""; // model path // NOLINT
     std::string model_draft          = ""; // draft model for speculative decoding // NOLINT
@@ -197,7 +209,6 @@ struct gpt_params {
     std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix         = ""; // string to prefix user inputs with // NOLINT
     std::string input_suffix         = ""; // string to suffix user inputs with // NOLINT
-    std::string logdir               = ""; // directory in which to save YAML log files // NOLINT
     std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file          = ""; // file for saving *all* logits // NOLINT
@@ -208,9 +219,9 @@ struct gpt_params {
     std::vector<llama_model_kv_override> kv_overrides;

     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

-    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

     int32_t verbosity                  = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -268,21 +279,21 @@ struct gpt_params {

     // embedding
     bool embedding         = false; // get only sentence embedding
-    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+    int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep   = "\n";  // separator of embendings
+    std::string embd_sep   = "\n";  // separator of embeddings
     bool reranking         = false; // enable reranking support on server

     // server params
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting

     std::string hostname      = "127.0.0.1";
     std::string public_path   = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;

     std::vector<std::string> api_keys;
@@ -290,7 +301,10 @@ struct gpt_params {
     std::string ssl_file_key  = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT

-    bool endpoint_slots   = true;
+    // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
+    bool endpoint_slots   = false;
+    bool endpoint_props   = false; // only control POST requests, not GET
     bool endpoint_metrics = false;

     bool log_json = false;
@@ -345,20 +359,31 @@ struct gpt_params {

 // call once at the start of a program if it uses libcommon
 // initializes the logging system and prints info about the build
-void gpt_init();
+void common_init();

-std::string gpt_params_get_system_info(const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);

-bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);

 //
 // String utils
 //

-std::vector<std::string> string_split(std::string input, char separator);
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
@@ -367,6 +392,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::

 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
     std::vector<T> values;
     std::istringstream str_stream(str);
     std::string token;
@@ -379,6 +405,22 @@ static std::vector<T> string_split(const std::string & str, char delim) {
     return values;
 }

+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

@@ -401,29 +443,29 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

-struct llama_init_result {
+struct common_init_result {
     struct llama_model   * model   = nullptr;
     struct llama_context * context = nullptr;
-    std::vector<llama_lora_adapter_container> lora_adapters;
+    std::vector<common_lora_adapter_container> lora_adapters;
 };

-struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+struct common_init_result common_init_from_params(common_params & params);

-struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

 // Batch utils

-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);

-void llama_batch_add(
+void common_batch_add(
                  struct llama_batch & batch,
                         llama_token   id,
                           llama_pos   pos,
@@ -436,13 +478,13 @@ void llama_batch_add(

 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
   const struct llama_context * ctx,
            const std::string & text,
                         bool   add_special,
                         bool   parse_special = false);

-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
            const std::string & text,
                         bool   add_special,
@@ -450,7 +492,7 @@ std::vector<llama_token> llama_tokenize(

 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
           const struct llama_context * ctx,
                        llama_token   token,
                        bool          special = true);
@@ -458,7 +500,7 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_detokenize(
         llama_context * ctx,
         const std::vector<llama_token> & tokens,
         bool   special = true);
@@ -468,31 +510,31 @@ std::string llama_detokenize(
 //

 // same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
+struct common_chat_msg {
     std::string role;
     std::string content;
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl);

 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & chat,
+        const std::vector<common_chat_msg> & chat,
         bool add_ass);

 // Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & past_msg,
-        const llama_chat_msg & new_msg,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
         bool add_ass);

 // Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
         const std::string & tmpl);

 //
@@ -500,31 +542,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //

 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

 //
 // Control vector utils
 //

-struct llama_control_vector_data {
+struct common_control_vector_data {
     int n_embd;

     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };

-struct llama_control_vector_load_info {
+struct common_control_vector_load_info {
     float strength;

     std::string fname;
@@ -532,7 +574,7 @@ struct llama_control_vector_load_info {

 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);

 //
 // Split utils
@@ -541,15 +583,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 static const char * const LLM_KV_SPLIT_NO            = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
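For orientation, the common.h diff above is largely a mechanical rename of the gpt_*/llama_* helpers to a common_* prefix, plus new DRY/XTC sampler parameters and the removal of the YAML logging utilities. The sketch below shows roughly how a downstream call site adapts to the renamed API; it is only an illustration built from the declarations shown above, not code from this package, and the model path, prompt, batch size and error handling are placeholder assumptions.

#include "common.h"
#include "llama.h"

#include <cstddef>
#include <vector>

int main() {
    common_params params;                 // was: gpt_params
    params.model  = "model.gguf";         // placeholder model path
    params.prompt = "Hello";

    common_init();                        // was: gpt_init()
    llama_backend_init();

    // load model + context in one step (was: llama_init_from_gpt_params)
    common_init_result init = common_init_from_params(params);
    llama_model   * model = init.model;
    llama_context * ctx   = init.context;
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // tokenize the prompt (was: llama_tokenize)
    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, /*add_special=*/true);

    // build and decode a batch with the renamed helpers
    // (was: llama_batch_clear / llama_batch_add)
    llama_batch batch = llama_batch_init(512, 0, 1);
    common_batch_clear(batch);
    for (size_t i = 0; i < tokens.size(); ++i) {
        common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, i == tokens.size() - 1);
    }
    llama_decode(ctx, batch);

    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

The defaults also shift in this version (n_ctx 0 → 4096, defrag_thold -1.0 → 0.1, endpoint_slots now off by default), so callers that relied on the old values may want to set them explicitly.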
package/src/llama.cpp/common/json-schema-to-grammar.cpp

@@ -611,7 +611,7 @@ private:
            }
            return join_seq();
        };
-        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
    }

    /*