bigdl-core-npu 2.5.0-cp311-cp311-win_amd64.whl → 2.6.0-cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. bigdl-core-npu/__init__.py +0 -0
  2. bigdl-core-npu/common.lib +0 -0
  3. bigdl-core-npu/ggml.dll +0 -0
  4. bigdl-core-npu/ggml.lib +0 -0
  5. bigdl-core-npu/include/llamacpp/arg.h +77 -0
  6. bigdl-core-npu/include/llamacpp/common.h +563 -0
  7. bigdl-core-npu/include/llamacpp/ggml-alloc.h +76 -0
  8. bigdl-core-npu/include/llamacpp/ggml-backend.h +241 -0
  9. bigdl-core-npu/include/llamacpp/ggml.h +2679 -0
  10. bigdl-core-npu/include/llamacpp/llama.h +1234 -0
  11. bigdl-core-npu/include/llamacpp/log.h +92 -0
  12. bigdl-core-npu/include/npu/npu_common.h +119 -0
  13. bigdl-core-npu/include/npu/npu_llm.h +77 -0
  14. bigdl-core-npu/llama-cli-npu.exe +0 -0
  15. bigdl-core-npu/llama.dll +0 -0
  16. bigdl-core-npu/llama.lib +0 -0
  17. bigdl-core-npu/llm-cli.exe +0 -0
  18. bigdl-core-npu/npu_llm.dll +0 -0
  19. bigdl-core-npu/npu_llm.lib +0 -0
  20. bigdl-core-npu/zlib1.dll +0 -0
  21. bigdl_core_npu-2.6.0.data/scripts/init-llama-cpp.bat +29 -0
  22. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/METADATA +12 -3
  23. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/RECORD +146 -96
  24. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/WHEEL +1 -1
  25. {bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/top_level.txt +1 -0
  26. intel_npu_acceleration_library/_version.py +1 -1
  27. intel_npu_acceleration_library/backend/base.py +39 -4
  28. intel_npu_acceleration_library/backend/bindings.py +109 -5
  29. intel_npu_acceleration_library/backend/factory.py +264 -47
  30. intel_npu_acceleration_library/backend/ops.py +2 -1
  31. intel_npu_acceleration_library/backend/qlinear.py +8 -4
  32. intel_npu_acceleration_library/backend/runtime.py +7 -2
  33. intel_npu_acceleration_library/backend/tensor.py +73 -3
  34. intel_npu_acceleration_library/bigdl-core-npu/cache.json +113732 -0
  35. intel_npu_acceleration_library/bigdl-core-npu/openvino.dll +0 -0
  36. intel_npu_acceleration_library/bigdl-core-npu/openvino_auto_batch_plugin.dll +0 -0
  37. intel_npu_acceleration_library/bigdl-core-npu/openvino_auto_plugin.dll +0 -0
  38. intel_npu_acceleration_library/bigdl-core-npu/openvino_c.dll +0 -0
  39. intel_npu_acceleration_library/bigdl-core-npu/openvino_hetero_plugin.dll +0 -0
  40. intel_npu_acceleration_library/bigdl-core-npu/openvino_intel_cpu_plugin.dll +0 -0
  41. intel_npu_acceleration_library/bigdl-core-npu/openvino_intel_gpu_plugin.dll +0 -0
  42. intel_npu_acceleration_library/bigdl-core-npu/openvino_intel_npu_plugin.dll +0 -0
  43. intel_npu_acceleration_library/bigdl-core-npu/openvino_ir_frontend.dll +0 -0
  44. intel_npu_acceleration_library/bigdl-core-npu/openvino_onnx_frontend.dll +0 -0
  45. intel_npu_acceleration_library/bigdl-core-npu/openvino_paddle_frontend.dll +0 -0
  46. intel_npu_acceleration_library/bigdl-core-npu/openvino_pytorch_frontend.dll +0 -0
  47. intel_npu_acceleration_library/bigdl-core-npu/openvino_tensorflow_frontend.dll +0 -0
  48. intel_npu_acceleration_library/bigdl-core-npu/openvino_tensorflow_lite_frontend.dll +0 -0
  49. intel_npu_acceleration_library/bigdl-core-npu/tbb12.dll +0 -0
  50. intel_npu_acceleration_library/bigdl-core-npu/tbb12_debug.dll +0 -0
  51. intel_npu_acceleration_library/bigdl-core-npu/tbbbind_2_5.dll +0 -0
  52. intel_npu_acceleration_library/bigdl-core-npu/tbbbind_2_5_debug.dll +0 -0
  53. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc.dll +0 -0
  54. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc_debug.dll +0 -0
  55. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc_proxy.dll +0 -0
  56. intel_npu_acceleration_library/bigdl-core-npu/tbbmalloc_proxy_debug.dll +0 -0
  57. intel_npu_acceleration_library/device.py +2 -2
  58. intel_npu_acceleration_library/dtypes.py +34 -1
  59. intel_npu_acceleration_library/external/openvino/__init__.py +1 -0
  60. intel_npu_acceleration_library/external/openvino/_offline_transformations/__init__.py +1 -0
  61. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp310-win_amd64.pyd +0 -0
  62. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp311-win_amd64.pyd +0 -0
  63. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp312-win_amd64.pyd +0 -0
  64. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp38-win_amd64.pyd +0 -0
  65. intel_npu_acceleration_library/external/openvino/_pyopenvino.cp39-win_amd64.pyd +0 -0
  66. intel_npu_acceleration_library/external/openvino/experimental/__init__.py +14 -0
  67. intel_npu_acceleration_library/external/openvino/frontend/jax/__init__.py +15 -0
  68. intel_npu_acceleration_library/external/openvino/frontend/jax/jaxpr_decoder.py +293 -0
  69. intel_npu_acceleration_library/external/openvino/frontend/jax/passes.py +65 -0
  70. intel_npu_acceleration_library/external/openvino/frontend/jax/utils.py +182 -0
  71. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp310-win_amd64.pyd +0 -0
  72. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp311-win_amd64.pyd +0 -0
  73. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp312-win_amd64.pyd +0 -0
  74. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp38-win_amd64.pyd +0 -0
  75. intel_npu_acceleration_library/external/openvino/frontend/onnx/py_onnx_frontend.cp39-win_amd64.pyd +0 -0
  76. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp310-win_amd64.pyd +0 -0
  77. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp311-win_amd64.pyd +0 -0
  78. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp312-win_amd64.pyd +0 -0
  79. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp38-win_amd64.pyd +0 -0
  80. intel_npu_acceleration_library/external/openvino/frontend/paddle/py_paddle_frontend.cp39-win_amd64.pyd +0 -0
  81. intel_npu_acceleration_library/external/openvino/frontend/pytorch/fx_decoder.py +37 -19
  82. intel_npu_acceleration_library/external/openvino/frontend/pytorch/gptq.py +47 -6
  83. intel_npu_acceleration_library/external/openvino/frontend/pytorch/patch_model.py +28 -8
  84. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp310-win_amd64.pyd +0 -0
  85. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp311-win_amd64.pyd +0 -0
  86. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp312-win_amd64.pyd +0 -0
  87. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp38-win_amd64.pyd +0 -0
  88. intel_npu_acceleration_library/external/openvino/frontend/pytorch/py_pytorch_frontend.cp39-win_amd64.pyd +0 -0
  89. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/backend.py +17 -5
  90. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/op_support.py +1 -0
  91. intel_npu_acceleration_library/external/openvino/frontend/pytorch/torchdynamo/partition.py +55 -47
  92. intel_npu_acceleration_library/external/openvino/frontend/pytorch/ts_decoder.py +95 -63
  93. intel_npu_acceleration_library/external/openvino/frontend/pytorch/utils.py +12 -10
  94. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp310-win_amd64.pyd +0 -0
  95. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp311-win_amd64.pyd +0 -0
  96. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp312-win_amd64.pyd +0 -0
  97. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp38-win_amd64.pyd +0 -0
  98. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/py_tensorflow_frontend.cp39-win_amd64.pyd +0 -0
  99. intel_npu_acceleration_library/external/openvino/frontend/tensorflow/utils.py +31 -10
  100. intel_npu_acceleration_library/external/openvino/helpers/packing.py +4 -4
  101. intel_npu_acceleration_library/external/openvino/preprocess/__init__.py +2 -0
  102. intel_npu_acceleration_library/external/openvino/preprocess/torchvision/requirements.txt +1 -0
  103. intel_npu_acceleration_library/external/openvino/properties/__init__.py +1 -0
  104. intel_npu_acceleration_library/external/openvino/runtime/ie_api.py +1 -1
  105. intel_npu_acceleration_library/external/openvino/runtime/op/__init__.py +1 -0
  106. intel_npu_acceleration_library/external/openvino/runtime/opset1/ops.py +2 -1
  107. intel_npu_acceleration_library/external/openvino/runtime/opset13/ops.py +5 -6
  108. intel_npu_acceleration_library/external/openvino/runtime/opset15/__init__.py +7 -0
  109. intel_npu_acceleration_library/external/openvino/runtime/opset15/ops.py +193 -2
  110. intel_npu_acceleration_library/external/openvino/runtime/opset6/ops.py +69 -43
  111. intel_npu_acceleration_library/external/openvino/runtime/opset8/ops.py +4 -0
  112. intel_npu_acceleration_library/external/openvino/runtime/properties/__init__.py +2 -0
  113. intel_npu_acceleration_library/external/openvino/runtime/utils/data_helpers/data_dispatcher.py +21 -3
  114. intel_npu_acceleration_library/external/openvino/runtime/utils/decorators.py +88 -2
  115. intel_npu_acceleration_library/external/openvino/tools/benchmark/utils/inputs_filling.py +9 -9
  116. intel_npu_acceleration_library/external/openvino/tools/ovc/convert_impl.py +16 -2
  117. intel_npu_acceleration_library/external/openvino/tools/ovc/main.py +5 -0
  118. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/jax_frontend_utils.py +19 -0
  119. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pipeline.py +68 -16
  120. intel_npu_acceleration_library/external/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +69 -60
  121. intel_npu_acceleration_library/external/openvino/tools/ovc/utils.py +90 -3
  122. intel_npu_acceleration_library/external/openvino/utils.py +17 -0
  123. intel_npu_acceleration_library/lib/Release/intel_npu_acceleration_library.dll +0 -0
  124. intel_npu_acceleration_library/lib/Release/openvino.dll +0 -0
  125. intel_npu_acceleration_library/lib/Release/openvino_auto_batch_plugin.dll +0 -0
  126. intel_npu_acceleration_library/lib/Release/openvino_auto_plugin.dll +0 -0
  127. intel_npu_acceleration_library/lib/Release/openvino_c.dll +0 -0
  128. intel_npu_acceleration_library/lib/Release/openvino_hetero_plugin.dll +0 -0
  129. intel_npu_acceleration_library/lib/Release/openvino_intel_cpu_plugin.dll +0 -0
  130. intel_npu_acceleration_library/lib/Release/openvino_intel_gpu_plugin.dll +0 -0
  131. intel_npu_acceleration_library/lib/Release/openvino_intel_npu_plugin.dll +0 -0
  132. intel_npu_acceleration_library/lib/Release/openvino_ir_frontend.dll +0 -0
  133. intel_npu_acceleration_library/lib/Release/openvino_onnx_frontend.dll +0 -0
  134. intel_npu_acceleration_library/lib/Release/openvino_paddle_frontend.dll +0 -0
  135. intel_npu_acceleration_library/lib/Release/openvino_pytorch_frontend.dll +0 -0
  136. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_frontend.dll +0 -0
  137. intel_npu_acceleration_library/lib/Release/openvino_tensorflow_lite_frontend.dll +0 -0
  138. intel_npu_acceleration_library/lib/Release/tbb12.dll +0 -0
  139. intel_npu_acceleration_library/lib/Release/tbb12_debug.dll +0 -0
  140. intel_npu_acceleration_library/lib/Release/tbbbind_2_5.dll +0 -0
  141. intel_npu_acceleration_library/lib/Release/tbbbind_2_5_debug.dll +0 -0
  142. intel_npu_acceleration_library/lib/Release/tbbmalloc.dll +0 -0
  143. intel_npu_acceleration_library/lib/Release/tbbmalloc_debug.dll +0 -0
  144. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy.dll +0 -0
  145. intel_npu_acceleration_library/lib/Release/tbbmalloc_proxy_debug.dll +0 -0
  146. intel_npu_acceleration_library/nn/module.py +17 -17
@@ -0,0 +1,77 @@
+ #pragma once
+
+ #include "common.h"
+
+ #include <set>
+ #include <string>
+ #include <vector>
+
+ //
+ // CLI argument parsing
+ //
+
+ struct llama_arg {
+     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+     std::vector<const char *> args;
+     const char * value_hint = nullptr; // help text or example for arg value
+     const char * value_hint_2 = nullptr; // for second arg value
+     const char * env = nullptr;
+     std::string help;
+     bool is_sparam = false; // is current arg a sampling param?
+     void (*handler_void) (gpt_params & params) = nullptr;
+     void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+     void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+     void (*handler_int) (gpt_params & params, int) = nullptr;
+
+     llama_arg(
+         const std::initializer_list<const char *> & args,
+         const char * value_hint,
+         const std::string & help,
+         void (*handler)(gpt_params & params, const std::string &)
+     ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+     llama_arg(
+         const std::initializer_list<const char *> & args,
+         const char * value_hint,
+         const std::string & help,
+         void (*handler)(gpt_params & params, int)
+     ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+     llama_arg(
+         const std::initializer_list<const char *> & args,
+         const std::string & help,
+         void (*handler)(gpt_params & params)
+     ) : args(args), help(help), handler_void(handler) {}
+
+     // support 2 values for arg
+     llama_arg(
+         const std::initializer_list<const char *> & args,
+         const char * value_hint,
+         const char * value_hint_2,
+         const std::string & help,
+         void (*handler)(gpt_params & params, const std::string &, const std::string &)
+     ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+     llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
+     llama_arg & set_env(const char * env);
+     llama_arg & set_sparam();
+     bool in_example(enum llama_example ex);
+     bool get_value_from_env(std::string & output);
+     bool has_value_from_env();
+     std::string to_string();
+ };
+
+ struct gpt_params_context {
+     enum llama_example ex = LLAMA_EXAMPLE_COMMON;
+     gpt_params & params;
+     std::vector<llama_arg> options;
+     void(*print_usage)(int, char **) = nullptr;
+     gpt_params_context(gpt_params & params) : params(params) {}
+ };
+
+ // parse input arguments from CLI
+ // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+
+ // function to be used by test-arg-parser
+ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
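The arg.h hunk above only declares the parser. For orientation, here is a minimal sketch of how a llama.cpp-style tool could drive it; the LLAMA_EXAMPLE_NPU value and the gpt_params fields are taken from this diff, while the print_my_usage helper and the surrounding main() are illustrative assumptions rather than code shipped in the wheel.

// Hypothetical driver for the parser declared in arg.h (sketch only).
#include "arg.h"
#include "common.h"

#include <cstdio>

// Illustrative usage callback; the real tools install their own.
static void print_my_usage(int /*argc*/, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;

    // Parse the CLI options registered for the NPU example; on an invalid value
    // the parser prints the usage text of the offending argument and fails.
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_NPU, print_my_usage)) {
        return 1;
    }

    printf("model: %s, low_bit: %s, max_context_len: %d\n",
           params.model.c_str(), params.low_bit.c_str(), params.max_context_len);
    return 0;
}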
@@ -0,0 +1,563 @@
+ // Various helper functions and utilities
+
+ #pragma once
+
+ #include "llama.h"
+
+ #include <string>
+ #include <vector>
+ #include <sstream>
+
+ #ifdef _WIN32
+ #define DIRECTORY_SEPARATOR '\\'
+ #else
+ #define DIRECTORY_SEPARATOR '/'
+ #endif // _WIN32
+
+ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+ #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+ #define print_build_info() do { \
+     fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
+ } while(0)
+
+ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+
+ struct llama_lora_adapter_info {
+     std::string path;
+     float scale;
+ };
+
+ struct llama_lora_adapter_container : llama_lora_adapter_info {
+     struct llama_lora_adapter * adapter;
+ };
+
+ // build info
+ extern int LLAMA_BUILD_NUMBER;
+ extern char const * LLAMA_COMMIT;
+ extern char const * LLAMA_COMPILER;
+ extern char const * LLAMA_BUILD_TARGET;
+
+ struct llama_control_vector_load_info;
+
+ //
+ // CPU utils
+ //
+
+ struct cpu_params {
+     int n_threads = -1;
+     bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+     bool mask_valid = false; // Default: any CPU
+     enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+     bool strict_cpu = false; // Use strict CPU placement
+     uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+ };
+
+ int32_t cpu_get_num_physical_cores();
+ int32_t cpu_get_num_math();
+
+ //
+ // Common params
+ //
+
+ enum llama_example {
+     LLAMA_EXAMPLE_COMMON,
+     LLAMA_EXAMPLE_SPECULATIVE,
+     LLAMA_EXAMPLE_MAIN,
+     LLAMA_EXAMPLE_INFILL,
+     LLAMA_EXAMPLE_EMBEDDING,
+     LLAMA_EXAMPLE_PERPLEXITY,
+     LLAMA_EXAMPLE_RETRIEVAL,
+     LLAMA_EXAMPLE_PASSKEY,
+     LLAMA_EXAMPLE_IMATRIX,
+     LLAMA_EXAMPLE_BENCH,
+     LLAMA_EXAMPLE_SERVER,
+     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+     LLAMA_EXAMPLE_EXPORT_LORA,
+     LLAMA_EXAMPLE_LLAVA,
+     LLAMA_EXAMPLE_LOOKUP,
+     LLAMA_EXAMPLE_PARALLEL,
+     LLAMA_EXAMPLE_NPU,
+
+     LLAMA_EXAMPLE_COUNT,
+ };
+
+ enum gpt_sampler_type {
+     GPT_SAMPLER_TYPE_NONE = 0,
+     GPT_SAMPLER_TYPE_TOP_K = 1,
+     GPT_SAMPLER_TYPE_TOP_P = 2,
+     GPT_SAMPLER_TYPE_MIN_P = 3,
+     GPT_SAMPLER_TYPE_TFS_Z = 4,
+     GPT_SAMPLER_TYPE_TYPICAL_P = 5,
+     GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+ };
+
+ // dimensionality reduction methods, used by cvector-generator
+ enum dimre_method {
+     DIMRE_METHOD_PCA,
+     DIMRE_METHOD_MEAN,
+ };
+
+ // sampler parameters
+ struct gpt_sampler_params {
+     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+     int32_t n_prev = 64; // number of previous tokens to remember
+     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+     int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+     int32_t top_k = 40; // <= 0 to use vocab size
+     float top_p = 0.95f; // 1.0 = disabled
+     float min_p = 0.05f; // 0.0 = disabled
+     float tfs_z = 1.00f; // 1.0 = disabled
+     float typ_p = 1.00f; // typical_p, 1.0 = disabled
+     float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+     float dynatemp_range = 0.00f; // 0.0 = disabled
+     float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+     int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+     float penalty_repeat = 1.00f; // 1.0 = disabled
+     float penalty_freq = 0.00f; // 0.0 = disabled
+     float penalty_present = 0.00f; // 0.0 = disabled
+     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+     float mirostat_tau = 5.00f; // target entropy
+     float mirostat_eta = 0.10f; // learning rate
+     bool penalize_nl = false; // consider newlines as a repeatable token
+     bool ignore_eos = false;
+     bool no_perf = false; // disable performance metrics
+
+     std::vector<enum gpt_sampler_type> samplers = {
+         GPT_SAMPLER_TYPE_TOP_K,
+         GPT_SAMPLER_TYPE_TFS_Z,
+         GPT_SAMPLER_TYPE_TYPICAL_P,
+         GPT_SAMPLER_TYPE_TOP_P,
+         GPT_SAMPLER_TYPE_MIN_P,
+         GPT_SAMPLER_TYPE_TEMPERATURE
+     };
+
+     std::string grammar; // optional BNF-like grammar to constrain sampling
+
+     std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+     // print the parameters into a string
+     std::string print() const;
+ };
+
+ struct gpt_params {
+     int32_t n_predict = -1; // new tokens to predict
+     int32_t n_ctx = 0; // context size
+     int32_t n_batch = 4096; // logical batch size for prompt processing (must be >=32 to use BLAS)
+     int32_t n_ubatch = 4096; // physical batch size for prompt processing (must be >=32 to use BLAS)
+     int32_t n_keep = 0; // number of tokens to keep from initial prompt
+     int32_t n_draft = 5; // number of tokens to draft during speculative decoding
+     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+     int32_t n_parallel = 1; // number of parallel sequences to decode
+     int32_t n_sequences = 1; // number of sequences to decode
+     float p_split = 0.1f; // speculative decoding split probability
+     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+     int32_t grp_attn_n = 1; // group-attention factor
+     int32_t grp_attn_w = 512; // group-attention width
+     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+     float rope_freq_base = 0.0f; // RoPE base frequency
+     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+     float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+     float yarn_beta_fast = 32.0f; // YaRN low correction dim
+     float yarn_beta_slow = 1.0f; // YaRN high correction dim
+     int32_t yarn_orig_ctx = 0; // YaRN original context length
+     float defrag_thold = -1.0f; // KV cache defragmentation threshold
+
+     struct cpu_params cpuparams;
+     struct cpu_params cpuparams_batch;
+     struct cpu_params draft_cpuparams;
+     struct cpu_params draft_cpuparams_batch;
+
+     ggml_backend_sched_eval_callback cb_eval = nullptr;
+     void * cb_eval_user_data = nullptr;
+
+     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
+
+     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+
+     struct gpt_sampler_params sparams;
+
+     std::string model = ""; // model path // NOLINT
+     std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+     std::string model_alias = "unknown"; // model alias // NOLINT
+     std::string model_url = ""; // model url to download // NOLINT
+     std::string hf_token = ""; // HF token // NOLINT
+     std::string hf_repo = ""; // HF repo // NOLINT
+     std::string hf_file = ""; // HF file // NOLINT
+     std::string prompt = ""; // NOLINT
+     std::string prompt_file = ""; // store the external prompt file name // NOLINT
+     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+     std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+     std::string logdir = ""; // directory in which to save YAML log files // NOLINT
+     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+     std::string logits_file = ""; // file for saving *all* logits // NOLINT
+     std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
+
+     std::vector<std::string> in_files; // all input files
+     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
+     std::vector<llama_model_kv_override> kv_overrides;
+
+     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+     std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+
+     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+     int32_t verbosity = 0;
+     int32_t control_vector_layer_start = -1; // layer range for control vector
+     int32_t control_vector_layer_end = -1; // layer range for control vector
+
+     int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                                  // (which is more convenient to use for plotting)
+                                  //
+     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+
+     bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+     size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
+     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
+     bool kl_divergence = false; // compute KL divergence
+
+     bool usage = false; // print usage
+     bool use_color = false; // use color to distinguish generations and inputs
+     bool special = false; // enable special token output
+     bool interactive = false; // interactive mode
+     bool interactive_first = false; // wait for user input immediately
+     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
+     bool prompt_cache_all = false; // save user input and generations to prompt cache
+     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
+
+     bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+     bool multiline_input = false; // reverse the usage of `\`
+     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
+     bool cont_batching = true; // insert new sequences for decoding on-the-fly
+     bool flash_attn = false; // flash attention
+     bool no_perf = false; // disable performance metrics
+     bool ctx_shift = true; // context shift on inifinite text generation
+
+     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
+     bool logits_all = false; // return logits for all tokens in the batch
+     bool use_mmap = true; // use mmap for faster loads
+     bool use_mlock = false; // use mlock to keep model in memory
+     bool verbose_prompt = false; // print prompt tokens before generation
+     bool display_prompt = true; // print prompt before generation
+     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
+     bool no_kv_offload = false; // disable KV offloading
+     bool warmup = true; // warmup run
+     bool check_tensors = false; // validate tensor data
+
+     std::string cache_type_k = "f16"; // KV cache data type for the K
+     std::string cache_type_v = "f16"; // KV cache data type for the V
+
+     // multimodal models (see examples/llava)
+     std::string mmproj = ""; // path to multimodal projector // NOLINT
+     std::vector<std::string> image; // path to image file(s)
+
+     // embedding
+     bool embedding = false; // get only sentence embedding
+     int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
+     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
+     std::string embd_sep = "\n"; // separator of embendings
+     bool reranking = false; // enable reranking support on server
+
+     // server params
+     int32_t port = 8080; // server listens on this network port
+     int32_t timeout_read = 600; // http read timeout in seconds
+     int32_t timeout_write = timeout_read; // http write timeout in seconds
+     int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+
+     std::string hostname = "127.0.0.1";
+     std::string public_path = ""; // NOLINT
+     std::string chat_template = ""; // NOLINT
+     std::string system_prompt = ""; // NOLINT
+     bool enable_chat_template = true;
+
+     std::vector<std::string> api_keys;
+
+     std::string ssl_file_key = ""; // NOLINT
+     std::string ssl_file_cert = ""; // NOLINT
+
+     bool endpoint_slots = true;
+     bool endpoint_metrics = false;
+
+     bool log_json = false;
+
+     std::string slot_save_path;
+
+     float slot_prompt_similarity = 0.5f;
+
+     // batched-bench params
+     bool is_pp_shared = false;
+
+     std::vector<int32_t> n_pp;
+     std::vector<int32_t> n_tg;
+     std::vector<int32_t> n_pl;
+
+     // retrieval params
+     std::vector<std::string> context_files; // context files to embed
+
+     int32_t chunk_size = 64; // chunk size for context embedding
+
+     std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+     // passkey params
+     int32_t n_junk = 250; // number of times to repeat the junk text
+     int32_t i_pos = -1; // position of the passkey in the junk text
+
+     // imatrix params
+     std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+     int32_t i_chunk = 0; // start processing from this chunk
+
+     bool process_output = false; // collect data for the output tensor
+     bool compute_ppl = true; // whether to compute perplexity
+
+     // cvector-generator params
+     int n_pca_batch = 100;
+     int n_pca_iterations = 1000;
+     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
+     std::string cvector_outfile = "control_vector.gguf";
+     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
+     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+     bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+     // batched-bench params
+     bool batched_bench_output_jsonl = false;
+
+     // npu convert
+     std::string low_bit = "Q4_0";
+     int32_t quantization_group_size = 0;
+     int32_t max_context_len = 1024;
+     int32_t max_prompt_len = 512;
+     std::string npu_outfile = "NPU_MODEL";
+ };
+
+ // call once at the start of a program if it uses libcommon
+ // initializes the logging system and prints info about the build
+ void gpt_init();
+
+ std::string gpt_params_get_system_info(const gpt_params & params);
+
+ bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+ bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+ bool set_process_priority(enum ggml_sched_priority prio);
+
+ //
+ // String utils
+ //
+
+ std::vector<std::string> string_split(std::string input, char separator);
+
+ std::string string_strip(const std::string & str);
+ std::string string_get_sortable_timestamp();
+
+ void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
+ template<class T>
+ static std::vector<T> string_split(const std::string & str, char delim) {
+     std::vector<T> values;
+     std::istringstream str_stream(str);
+     std::string token;
+     while (std::getline(str_stream, token, delim)) {
+         T value;
+         std::istringstream token_stream(token);
+         token_stream >> value;
+         values.push_back(value);
+     }
+     return values;
+ }
+
+ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+ void string_process_escapes(std::string & input);
+
+ std::string string_from(bool value);
+ std::string string_from(const std::vector<int> & values);
+ std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
+ //
+ // Filesystem utils
+ //
+
+ bool fs_validate_filename(const std::string & filename);
+ bool fs_create_directory_with_parents(const std::string & path);
+
+ std::string fs_get_cache_directory();
+ std::string fs_get_cache_file(const std::string & filename);
+
+ //
+ // Model utils
+ //
+
+ struct llama_init_result {
+     struct llama_model * model = nullptr;
+     struct llama_context * context = nullptr;
+     std::vector<llama_lora_adapter_container> lora_adapters;
+ };
+
+ struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+
+ struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+ struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
+
+ struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+ struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+
+ // clear LoRA adapters from context, then apply new list of adapters
+ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
+ // Batch utils
+
+ void llama_batch_clear(struct llama_batch & batch);
+
+ void llama_batch_add(
+     struct llama_batch & batch,
+     llama_token id,
+     llama_pos pos,
+     const std::vector<llama_seq_id> & seq_ids,
+     bool logits);
+
+ //
+ // Vocab utils
+ //
+
+ // tokenizes a string into a vector of tokens
+ // should work similar to Python's `tokenizer.encode`
+ std::vector<llama_token> llama_tokenize(
+     const struct llama_context * ctx,
+     const std::string & text,
+     bool add_special,
+     bool parse_special = false);
+
+ std::vector<llama_token> llama_tokenize(
+     const struct llama_model * model,
+     const std::string & text,
+     bool add_special,
+     bool parse_special = false);
+
+ // tokenizes a token into a piece, optionally renders special/control tokens
+ // should work similar to Python's `tokenizer.id_to_piece`
+ std::string llama_token_to_piece(
+     const struct llama_context * ctx,
+     llama_token token,
+     bool special = true);
+
+ // detokenizes a vector of tokens into a string
+ // should work similar to Python's `tokenizer.decode`
+ // optionally renders special/control tokens
+ std::string llama_detokenize(
+     llama_context * ctx,
+     const std::vector<llama_token> & tokens,
+     bool special = true);
+
+ //
+ // Chat template utils
+ //
+
+ // same with llama_chat_message, but uses std::string
+ struct llama_chat_msg {
+     std::string role;
+     std::string content;
+ };
+
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+ bool llama_chat_verify_template(const std::string & tmpl);
+
+ // CPP wrapper for llama_chat_apply_template
+ // If the built-in template is not supported, we default to chatml
+ // If the custom "tmpl" is not supported, we throw an error
+ std::string llama_chat_apply_template(const struct llama_model * model,
+         const std::string & tmpl,
+         const std::vector<llama_chat_msg> & chat,
+         bool add_ass);
+
+ // Format single message, while taking into account the position of that message in chat history
+ std::string llama_chat_format_single(const struct llama_model * model,
+         const std::string & tmpl,
+         const std::vector<llama_chat_msg> & past_msg,
+         const llama_chat_msg & new_msg,
+         bool add_ass);
+
+ // Returns an example of formatted chat
+ std::string llama_chat_format_example(const struct llama_model * model,
+         const std::string & tmpl);
+
+ //
+ // KV cache utils
+ //
+
+ // Dump the KV cache view with the number of sequences per cell.
+ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+
+ // Dump the KV cache view showing individual sequences in each cell (long output).
+ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+ //
+ // Embedding utils
+ //
+
+ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+
+ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+
+ //
+ // Control vector utils
+ //
+
+ struct llama_control_vector_data {
+     int n_embd;
+
+     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+     std::vector<float> data;
+ };
+
+ struct llama_control_vector_load_info {
+     float strength;
+
+     std::string fname;
+ };
+
+ // Load control vectors, scale each by strength, and add them together.
+ // On error, returns {-1, empty}
+ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+
+ //
+ // Split utils
+ //
+
+ static const char * const LLM_KV_SPLIT_NO = "split.no";
+ static const char * const LLM_KV_SPLIT_COUNT = "split.count";
+ static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+ //
+ // YAML utils
+ //
+
+ void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
+ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
+ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+ void yaml_dump_non_result_info(
+     FILE * stream, const gpt_params & params, const llama_context * lctx,
+     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
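The common.h hunk above declares the glue between gpt_params and the llama.cpp C API. A minimal sketch of how these helpers are typically chained is given below; it mirrors the flow of llama.cpp's examples and is an assumption rather than code contained in this wheel. llama_batch_init and llama_decode come from the bundled llama.h, and the prompt-staging loop here is illustrative only.

// Sketch of a typical consumer of the helpers declared in arg.h / common.h.
#include "arg.h"
#include "common.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    gpt_init(); // initialize logging and print build info

    // Load the model and create a context from the parsed parameters.
    llama_init_result init = llama_init_from_gpt_params(params);
    if (init.model == nullptr || init.context == nullptr) {
        return 1;
    }

    // Tokenize the prompt and stage it in a batch for decoding.
    std::vector<llama_token> tokens = llama_tokenize(init.context, params.prompt, /*add_special=*/true);
    llama_batch batch = llama_batch_init((int) tokens.size(), 0, 1);
    for (size_t i = 0; i < tokens.size(); ++i) {
        // request logits only for the last prompt token
        llama_batch_add(batch, tokens[i], (llama_pos) i, {0}, i == tokens.size() - 1);
    }

    // ... llama_decode(init.context, batch), sampling, and cleanup would follow here.
    return 0;
}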