PyPI - bigdl-core-npu - Versions diffs - 2.5.0__cp311-cp311-win_amd64.whl → 2.6.0__cp311-cp311-win_amd64.whl - Mend

bigdl-core-npu 2.5.0__cp311-cp311-win_amd64.whl → 2.6.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (146) hide show

bigdl-core-npu/include/llamacpp/log.h ADDED Viewed

@@ -0,0 +1,92 @@
+#pragma once
+#include "ggml.h" // for ggml_log_level
+#ifndef __GNUC__ // compilers that do not define __GNUC__: the annotation expands to nothing
+#    define LOG_ATTRIBUTE_FORMAT(...)
+#elif defined(__MINGW32__) // MinGW: request the gnu_printf format archetype
+#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else // any other compiler that defines __GNUC__: request the printf format archetype
+#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#define LOG_DEFAULT_DEBUG 1 // verbosity that LOG_DBG passes to LOG_TMPL
+#define LOG_DEFAULT_LLAMA 0 // not referenced anywhere else in this header
+// read by the LOG_TMPL macro so that log arguments are not evaluated when a message's
+// verbosity is above the threshold; set via gpt_log_set_verbosity_thold()
+extern int gpt_log_verbosity_thold;
+void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+// the gpt_log uses an internal worker thread to print/write log messages
+// when the worker thread is paused, incoming log messages are discarded
+struct gpt_log; // opaque handle: the struct is only declared here, never defined
+struct gpt_log * gpt_log_init(); // NOTE(review): presumably creates a separate logger that the caller releases with gpt_log_free - confirm
+struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
+void             gpt_log_pause (struct gpt_log * log); // pause  the worker thread, not thread-safe
+void             gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
+void             gpt_log_free  (struct gpt_log * log); // NOTE(review): presumably the counterpart of gpt_log_init - confirm
+LOG_ATTRIBUTE_FORMAT(3, 4) // fmt is parameter 3; the variadic arguments start at parameter 4
+void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
+// defaults: file = NULL, colors = false, prefix = false, timestamps = false
+//
+// regular log output:
+//
+//   ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
+//   llm_load_tensors: ggml ctx size =    0.27 MiB
+//   llm_load_tensors: offloading 32 repeating layers to GPU
+//   llm_load_tensors: offloading non-repeating layers to GPU
+//
+// with prefix = true, timestamps = true, the log output will look like this:
+//
+//   0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
+//   0.00.035.064 I llm_load_tensors: ggml ctx size =    0.27 MiB
+//   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
+//   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
+//
+// I - info    (stdout, V = 0)
+// W - warning (stderr, V = 0)
+// E - error   (stderr, V = 0)
+// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
+//
+void gpt_log_set_file      (struct gpt_log * log, const char * file);       // not thread-safe; documented default is file = NULL
+void gpt_log_set_colors    (struct gpt_log * log,       bool   colors);     // not thread-safe; documented default is false
+void gpt_log_set_prefix    (struct gpt_log * log,       bool   prefix);     // whether to prepend a prefix to each log line; documented default is false
+void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // whether to include timestamps in the prefix; documented default is false
+// helper macros for logging
+// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
+//
+// for example:
+//
+//   LOG_DBG("this is a debug message: %d\n", expensive_function());
+//
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
+//
+#define LOG_TMPL(level, verbosity, ...) \
+    do { \
+        if ((verbosity) <= gpt_log_verbosity_thold) { /* the log arguments are evaluated only when this check passes */ \
+            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
+        } \
+    } while (0)
+#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__) // level NONE, fixed verbosity 0
+#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) // level NONE, caller-chosen verbosity
+#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__) // level INFO, fixed verbosity 0
+#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__) // level WARN, fixed verbosity 0
+#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__) // level ERROR, fixed verbosity 0
+#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__) // level DEBUG, verbosity LOG_DEFAULT_DEBUG
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__) // level CONT, fixed verbosity 0
+#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__) // level INFO, caller-chosen verbosity
+#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__) // level WARN, caller-chosen verbosity
+#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__) // level ERROR, caller-chosen verbosity
+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__) // level DEBUG, caller-chosen verbosity
+#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__) // level CONT, caller-chosen verbosity

bigdl-core-npu/include/npu/npu_common.h ADDED Viewed

@@ -0,0 +1,119 @@
+//
+// Copyright 2016 The BigDL Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#pragma once
+#include <string>
+#include <vector>
+#include <sstream>
+#include <chrono>
+#ifdef __linux__ // Linux: C linkage only
+#define EXPORT_API extern "C"
+#else // every non-Linux target: C linkage plus __declspec(dllexport)
+#define EXPORT_API extern "C" __declspec(dllexport) // NOTE(review): no dllimport variant is provided for consumers of the library - confirm this is intended
+#endif
+struct common_params { // every member has a default member initializer
+    int32_t n_predict             =    -1; // new tokens to predict
+    char* model                = ""; // model path; NOTE(review): binds a string literal to a non-const char*, which ISO C++11 and later do not allow - consider const char* // NOLINT
+    std::string prompt               = "";                                                                  // NOLINT
+    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
+    std::string cache_type_k = "f16"; // KV cache data type for the K
+    std::string cache_type_v = "f16"; // KV cache data type for the V
+};
+struct npu_model_params { // no member has a default initializer: the int32_t and bool members are indeterminate in a default-initialized object
+    int32_t kv_len;
+    int32_t max_prompt_len;
+    int32_t num_head;
+    int32_t head_dim;
+    int32_t num_layers;
+    int32_t vocab_size;
+    int32_t hidden_size;
+    int32_t intermediate_size;
+    int32_t group_size;
+    int32_t fused_layers_num;
+    int32_t fused_layers;
+    int32_t weight_num;
+    int32_t weight_idx;
+    int32_t n_splits_linear;
+    int32_t n_splits_down_proj;
+    int32_t max_position_embeddings;
+    bool embedding_post;
+    std::string model_dir;
+    std::string model_weight_dir;
+    std::string model_name;
+    std::string prefill_layer_blob_name;
+    std::string lmhead_blob_name;
+    std::string embedding_post_prefill_blob_name;
+    std::string embedding_post_blob_name;
+    std::string prefill_layer_ir_name;
+    std::string lmhead_ir_name;
+    std::string embedding_post_prefill_ir_name;
+    std::string embedding_post_ir_name;
+    std::string config;
+    std::string low_bit;
+    std::string lm_head_low_bit;
+    bool const_parameter;
+    std::string model_type;
+    bool transpose_value_cache;
+    bool qkv_bias;
+    bool use_prefill_sdp;
+    bool cos_sin_input;
+    bool use_level_zero;
+};
+struct tokenizer_params { // no default member initializers: bos_token_id is indeterminate in a default-initialized object
+    std::string tokenizer_file;
+    int32_t bos_token_id;
+    std::vector<int32_t> eos_token_id; // a vector, so more than one id can be stored
+};
+struct npu_generation_params { // no default member initializers: both members are indeterminate in a default-initialized object
+    // may add more later when dealing with more cases
+    float repetition_penalty;
+    int32_t max_new_token;
+};
+struct llm_perf_data { // no default member initializers: the double and uint32_t members are indeterminate in a default-initialized object
+    std::chrono::time_point<std::chrono::high_resolution_clock> t_start;
+    double t_load_ms; // NOTE(review): the _ms suffix on these three members suggests milliseconds - confirm
+    double t_p_eval_ms;
+    double t_eval_ms;
+    uint32_t n_p_eval;
+    uint32_t n_eval;
+};
+#ifndef BASE64_H // separate guard for the base64 declarations, in addition to the #pragma once of this header
+#define BASE64_H
+namespace base64 { // declarations only; the definitions are not in this header
+    std::string encode(const std::string &data);
+    std::string decode(const std::string &data);
+}
+#endif // BASE64_H (base64 encode and decode declarations)
+#ifdef __cplusplus
+extern "C" { // NOTE(review): this linkage block is empty - nothing is declared between the braces
+#endif
+#ifdef __cplusplus
+}
+#endif

bigdl-core-npu/include/npu/npu_llm.h ADDED Viewed

@@ -0,0 +1,77 @@
+//
+// Copyright 2016 The BigDL Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#pragma once
+#include <string>
+#include <vector>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <memory>
+#include <vector>
+#include "npu_common.h"
+using namespace std; // lets the declarations below write vector unqualified; also takes effect in every file that includes this header
+#ifdef __linux__ // same EXPORT_API definition as in npu_common.h, which is included above
+#define EXPORT_API extern "C"
+#else
+#define EXPORT_API extern "C" __declspec(dllexport)
+#endif
+class NPUModel; // forward declaration; none of the declarations in this header mention it (models are passed as void*)
+#ifdef __cplusplus
+extern "C" { // EXPORT_API already expands to extern "C", so the linkage is specified twice for each declaration
+#endif
+    EXPORT_API void load_tokenizer(tokenizer_params &tok_params, std::string model_str); // tok_params is a non-const reference; NOTE(review): presumably filled in by the call - confirm
+    EXPORT_API vector<int32_t> llm_tokenize(std::string prompt, bool add_special); // returns a C++ vector by value despite the C linkage
+    EXPORT_API std::string llm_decode(vector<int32_t> tokens); // returns a C++ string by value despite the C linkage
+    EXPORT_API void* load_model_from_file(const char* model_path); // NOTE(review): the returned void* is presumably the void_model handle taken by the run_* functions - confirm
+    EXPORT_API void load_config_from_file(npu_model_params &model_params, const char* model_path); // model_params is a non-const reference
+    EXPORT_API void load_generation_config_from_file(npu_generation_params &generation_params, const char* model_path); // generation_params is a non-const reference
+    EXPORT_API std::string add_chat_template(npu_model_params model_params, std::string input_prompt); // model_params is passed by value, so the whole struct is copied
+    EXPORT_API float* run_prefill(void* void_model, void* embd_inp_ptr, int32_t embd_inp_size, float repetition_penalty, bool skip_embd=false); // skip_embd defaults to false; NOTE(review): ownership of the returned float* is not stated here - confirm
+    EXPORT_API float* run_decode(void* void_model, int32_t input_token, float repetition_penalty); // NOTE(review): ownership of the returned float* is not stated here - confirm
+    EXPORT_API void run_prefill_with_logits(void* void_model, void* embd_inp_ptr, int32_t embd_inp_size, float* logits, int32_t vocab_size, bool skip_embd=false); // skip_embd defaults to false; logits is a caller-supplied pointer
+    EXPORT_API void run_decode_with_logits(void* void_model, int32_t input_token, float* logits, int32_t vocab_size); // logits is a caller-supplied pointer
+    EXPORT_API float* process_logits(float* logits, int32_t vocab_size, int32_t* p_updated_input_ids, int32_t updated_input_id_size, float repetition_penalty);
+    EXPORT_API int32_t llm_sample_token(float* logits, bool greedy_search, int32_t vocab_size);
+    EXPORT_API void reset(void* void_model);
+    EXPORT_API void llm_perf_print(void * void_model);
+    EXPORT_API void prepare_ir(const char* model_path);
+#ifdef __cplusplus
+}
+#endif

bigdl-core-npu/llama-cli-npu.exe ADDED Viewed

Binary file

bigdl-core-npu/llama.dll ADDED Viewed

Binary file

bigdl-core-npu/llama.lib ADDED Viewed

Binary file

bigdl-core-npu/llm-cli.exe ADDED Viewed

Binary file

bigdl-core-npu/npu_llm.dll ADDED Viewed

Binary file

bigdl-core-npu/npu_llm.lib ADDED Viewed

Binary file

bigdl-core-npu/zlib1.dll ADDED Viewed

Binary file

bigdl_core_npu-2.6.0.data/scripts/init-llama-cpp.bat ADDED Viewed

@@ -0,0 +1,59 @@
+@echo off
+rem Link the runtime files shipped with bigdl-core-npu and intel_npu_acceleration_library
+rem into the current directory, then copy llama-cli-npu.exe next to them.
+rem Each location is validated before anything is deleted: if a pushd failed, the
+rem loops below would run in the destination directory and delete the caller's own files.
+rem Clear any stale value so a failed lookup cannot reuse a path from an earlier run.
+set "cpp_file="
+for /f "delims=" %%i in ('python -c "import importlib; print(importlib.import_module('bigdl-core-npu').__file__)"') do set "cpp_file=%%i"
+if not defined cpp_file (
+    >&2 echo Could not locate the bigdl-core-npu package through python.
+    exit /b 1
+)
+if not exist "%cpp_file%" (
+    >&2 echo Reported package file does not exist: "%cpp_file%"
+    exit /b 1
+)
+for %%a in ("%cpp_file%") do set "cpp_dir=%%~dpa"
+rem Drop the trailing backslash from the package directory.
+set "cpp_dir=%cpp_dir:~0,-1%"
+set "lib_dir=%cpp_dir:bigdl-core-npu=intel_npu_acceleration_library%\lib\Release"
+set "destination_folder=%cd%"
+pushd "%lib_dir%" || (
+    >&2 echo Library directory not found: "%lib_dir%"
+    exit /b 1
+)
+rem Running from inside the source directory would delete its files and link them to themselves.
+if /i "%cd%"=="%destination_folder%" (
+    popd
+    >&2 echo Refusing to run from inside "%lib_dir%".
+    exit /b 1
+)
+rem mklink needs the privilege to create symbolic links and reports its own error without it.
+for %%f in (*) do (
+    if exist "%destination_folder%\%%~nxf" (
+        del /f "%destination_folder%\%%~nxf"
+    )
+    mklink "%destination_folder%\%%~nxf" "%%~ff"
+)
+popd
+pushd "%cpp_dir%" || (
+    >&2 echo Package directory not found: "%cpp_dir%"
+    exit /b 1
+)
+if /i "%cd%"=="%destination_folder%" (
+    popd
+    >&2 echo Refusing to run from inside "%cpp_dir%".
+    exit /b 1
+)
+rem llama-cli-npu.exe is copied at the end instead of being linked, so it is skipped here.
+for %%f in (*) do (
+    if not "%%f"=="llama-cli-npu.exe" (
+        if exist "%destination_folder%\%%~nxf" (
+            del /f "%destination_folder%\%%~nxf"
+        )
+        mklink "%destination_folder%\%%~nxf" "%%~ff"
+    )
+)
+popd
+copy "%cpp_dir%\llama-cli-npu.exe" .

{bigdl_core_npu-2.5.0.dist-info → bigdl_core_npu-2.6.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: bigdl-core-npu
-Version: 2.5.0
+Version: 2.6.0
 Summary: Intel® NPU Acceleration Library
 Home-page: https://github.com/intel/intel-npu-acceleration-library
 Author: Alessandro Palla
@@ -32,4 +32,13 @@ Requires-Dist: numpy
 Requires-Dist: torch
 Requires-Dist: transformers>=4.39.3
 Requires-Dist: neural-compressor
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary