torchpipe 0.1.23__py3-none-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchpipe/__init__.py +99 -0
- torchpipe/csrc/core_cuda/SyncTensor.cpp +148 -0
- torchpipe/csrc/core_cuda/SyncTensor.hpp +134 -0
- torchpipe/csrc/core_cuda/kvcacheTensor.cpp +160 -0
- torchpipe/csrc/helper/dlpack_helper.hpp +26 -0
- torchpipe/csrc/helper/mat.cpp +18 -0
- torchpipe/csrc/helper/mat.hpp +22 -0
- torchpipe/csrc/helper/net_info.cpp +57 -0
- torchpipe/csrc/helper/net_info.hpp +60 -0
- torchpipe/csrc/helper/task_keys.hpp +9 -0
- torchpipe/csrc/helper/torch.cpp +794 -0
- torchpipe/csrc/helper/torch.hpp +149 -0
- torchpipe/csrc/helper_cuda/torch.cpp +180 -0
- torchpipe/csrc/helper_cuda/torch.hpp +101 -0
- torchpipe/csrc/mat_torch/CvtColorMat.cpp +43 -0
- torchpipe/csrc/mat_torch/CvtColorMat.hpp +18 -0
- torchpipe/csrc/mat_torch/DecodeMat.cpp +61 -0
- torchpipe/csrc/mat_torch/DecodeMat.hpp +19 -0
- torchpipe/csrc/mat_torch/ResizeMat.cpp +179 -0
- torchpipe/csrc/mat_torch/ResizeMat.hpp +60 -0
- torchpipe/csrc/mat_torch/converts.cpp +119 -0
- torchpipe/csrc/mat_torch/converts.hpp +33 -0
- torchpipe/csrc/mat_torch/mat2tensor.hpp +2 -0
- torchpipe/csrc/nvjpeg_torch/DecodeTensor.cpp +186 -0
- torchpipe/csrc/nvjpeg_torch/DecodeTensor.hpp +24 -0
- torchpipe/csrc/nvjpeg_torch/README.md +3 -0
- torchpipe/csrc/tensorrt_torch/TensorrtInferTensor.cpp +234 -0
- torchpipe/csrc/tensorrt_torch/TensorrtInferTensor.hpp +47 -0
- torchpipe/csrc/tensorrt_torch/TensorrtTensor.cpp +20 -0
- torchpipe/csrc/tensorrt_torch/TensorrtTensor.hpp +3 -0
- torchpipe/csrc/tensorrt_torch/aes.cpp +574 -0
- torchpipe/csrc/tensorrt_torch/aes.h +856 -0
- torchpipe/csrc/tensorrt_torch/allocator.cpp +91 -0
- torchpipe/csrc/tensorrt_torch/allocator.hpp +27 -0
- torchpipe/csrc/tensorrt_torch/encrypt.cpp +164 -0
- torchpipe/csrc/tensorrt_torch/encrypt.hpp +33 -0
- torchpipe/csrc/tensorrt_torch/model.cpp +315 -0
- torchpipe/csrc/tensorrt_torch/model.hpp +68 -0
- torchpipe/csrc/tensorrt_torch/tensorrt_helper.cpp +1102 -0
- torchpipe/csrc/tensorrt_torch/tensorrt_helper.hpp +109 -0
- torchpipe/csrc/tensorrt_torch/tensorrt_plugins.cpp +532 -0
- torchpipe/csrc/tensorrt_torch/tensorrt_plugins.hpp +190 -0
- torchpipe/csrc/tensorrt_torch/tensorrt_plugins_anchor.cpp_ +525 -0
- torchpipe/csrc/tensorrt_torch/tensorrt_plugins_anchor.hpp +195 -0
- torchpipe/csrc/torchplugins/CropTensor.cpp +143 -0
- torchpipe/csrc/torchplugins/CropTensor.hpp +40 -0
- torchpipe/csrc/torchplugins/CvtColorTensor.cpp +61 -0
- torchpipe/csrc/torchplugins/CvtColorTensor.hpp +21 -0
- torchpipe/csrc/torchplugins/GpuTensor.cpp +293 -0
- torchpipe/csrc/torchplugins/GpuTensor.hpp +37 -0
- torchpipe/csrc/torchplugins/ResizeTensor.cpp +107 -0
- torchpipe/csrc/torchplugins/ResizeTensor.hpp +9 -0
- torchpipe/csrc/torchplugins/cat_split_tensor.cpp +217 -0
- torchpipe/csrc/torchplugins/cat_split_tensor.hpp +61 -0
- torchpipe/csrc/torchplugins/continuous_batching_tensor.cpp +43 -0
- torchpipe/csrc/torchplugins/continuous_batching_tensor.hpp +18 -0
- torchpipe/csrc/torchplugins/torch_helper.cpp +16 -0
- torchpipe/csrc/torchplugins/torch_helper.hpp +18 -0
- torchpipe/extension.py +61 -0
- torchpipe/group-torchpipe.toml +33 -0
- torchpipe/jit/_build_SyncTensor.py +51 -0
- torchpipe/lib/.gitignore +2 -0
- torchpipe/lib/torchpipe_core-torch113-cpu-abiflag0.so +0 -0
- torchpipe/lib/torchpipe_core-torch210-cpu-abiflag1.so +0 -0
- torchpipe/lib/torchpipe_core-torch23-cpu-abiflag0.so +0 -0
- torchpipe/lib/torchpipe_core-torch24-cpu-abiflag0.so +0 -0
- torchpipe/lib/torchpipe_core-torch25-cpu-abiflag0.so +0 -0
- torchpipe/lib/torchpipe_core-torch26-cpu-abiflag0.so +0 -0
- torchpipe/lib/torchpipe_core-torch27-cpu-abiflag1.so +0 -0
- torchpipe/lib/torchpipe_core-torch28-cpu-abiflag1.so +0 -0
- torchpipe/lib/torchpipe_core-torch29-cpu-abiflag1.so +0 -0
- torchpipe/lib/torchpipe_opencv-abiflag0.so +0 -0
- torchpipe/lib/torchpipe_opencv-abiflag1.so +0 -0
- torchpipe/load_libs.py +372 -0
- torchpipe/load_omniback.py_ +74 -0
- torchpipe/serve/__init__.py +0 -0
- torchpipe/serve/api_protocol.py +267 -0
- torchpipe/serve/errors.py +11 -0
- torchpipe/serve/openai/__init__.py +20 -0
- torchpipe/serve/openai/async_backend_engine.py +178 -0
- torchpipe/serve/openai/client.sh +9 -0
- torchpipe/serve/openai/openai_server_api.py +445 -0
- torchpipe/serve/output.py +93 -0
- torchpipe/serve/register.py +36 -0
- torchpipe/serve/server_args.py +127 -0
- torchpipe/serve/streaming_response.py +90 -0
- torchpipe/utils/.gitignore +2 -0
- torchpipe/utils/__init__.py +15 -0
- torchpipe/utils/_build_cv.py +372 -0
- torchpipe/utils/_build_trt.py +264 -0
- torchpipe/utils/_cache_setting.py +8 -0
- torchpipe/utils/_group_setting.py_ +14 -0
- torchpipe/utils/benchmark.py +953 -0
- torchpipe/utils/model_helper.py +1065 -0
- torchpipe-0.1.23.dist-info/METADATA +33 -0
- torchpipe-0.1.23.dist-info/RECORD +99 -0
- torchpipe-0.1.23.dist-info/WHEEL +7 -0
- torchpipe-0.1.23.dist-info/licenses/LICENSE +210 -0
- torchpipe-0.1.23.dist-info/top_level.txt +1 -0
torchpipe/__init__.py
ADDED
@@ -0,0 +1,99 @@
# isort: skip_file

from packaging import version
import logging
logger = logging.getLogger(__name__) # type: ignore

import ctypes, os

ORI_TVM_FFI_DISABLE_TORCH_C_DLPACK = os.environ.get(
    "TVM_FFI_DISABLE_TORCH_C_DLPACK", "0")
if ORI_TVM_FFI_DISABLE_TORCH_C_DLPACK == "0":
    os.environ["TVM_FFI_DISABLE_TORCH_C_DLPACK"] = "1"

import omniback

import torch


try:
    from importlib.metadata import version as _get_version
    __version__ = _get_version("torchpipe")
except Exception:
    __version__ = "0.0.0-dev"

# -----------------------
# assert omniback.compiled_with_cxx11_abi() == torch.compiled_with_cxx11_abi()

logger.info(f'torch.cuda.is_available() = {torch.cuda.is_available()}')

torch.set_num_threads(torch.get_num_threads())

# -----------------------
from .load_libs import _load_or_build_lib, _load_or_build_lib_skip_if_error # nosort
from .load_libs import _setting_group_handle # nosort

SKIP_ALL = os.environ.get("TORCHPIPE_SKIP_ALL", "0")

if SKIP_ALL != "1":
    try:
        _load_or_build_lib("torchpipe_core")
        if torch.cuda.is_available():
            _load_or_build_lib("torchpipe_core_cuda")
    except Exception as e:
        logger.warning(f'Failed to load or JIT compile builtin extensions: \n{e}')
        SKIP_ALL = "1"
    else:
        SKIP_TENSORRT = os.environ.get("TORCHPIPE_SKIP_TENSORRT", "0")

        if torch.cuda.is_available():
            # _load_or_build_lib_skip_if_error("torchpipe_core_cuda")
            if SKIP_TENSORRT != "1":
                _load_or_build_lib_skip_if_error("torchpipe_tensorrt")

            _load_or_build_lib_skip_if_error("torchpipe_nvjpeg")
        else:
            logger.warning("[JIT] CUDA is not available, skip loading CUDA extensions.")

        SKIP_OPENCV = os.environ.get("TORCHPIPE_SKIP_OPENCV", "0")
        if SKIP_OPENCV != "1":
            _load_or_build_lib_skip_if_error("torchpipe_opencv")

        grp_config = os.path.join(os.path.dirname(__file__), "group-torchpipe.toml")
        assert os.path.exists(grp_config), grp_config
        _setting_group_handle(grp_config)
        logger.info(f"Loaded group config from {grp_config}")


# -----------------------
pipe = omniback.pipe
Dict = omniback.Dict
register = omniback.register


# -----------------------
def set_fast_dlpack():
    import tvm_ffi
    tvm_ffi._optional_torch_c_dlpack.load_torch_c_dlpack_extension()
    tvm_ffi._optional_torch_c_dlpack.patch_torch_cuda_stream_protocol()
    if hasattr(torch.Tensor, "__dlpack_c_exchange_api__"):
        # type: ignore[attr-defined]
        api_attr = torch.Tensor.__dlpack_c_exchange_api__
        if api_attr:
            # PyCapsule - extract the pointer as integer
            pythonapi = ctypes.pythonapi
            # Set restype to c_size_t to get integer directly (avoids c_void_p quirks)
            pythonapi.PyCapsule_GetPointer.restype = ctypes.c_size_t
            pythonapi.PyCapsule_GetPointer.argtypes = [
                ctypes.py_object, ctypes.c_char_p]
            capsule_name = b"dlpack_exchange_api"
            api_ptr = pythonapi.PyCapsule_GetPointer(api_attr, capsule_name)
            assert api_ptr != 0, "API pointer from PyCapsule should not be NULL"
            omniback.ffi.set_dlpack_exchange_api(api_ptr)


if torch.__version__ >= torch.torch_version.TorchVersion("2.3.0"):
    if ORI_TVM_FFI_DISABLE_TORCH_C_DLPACK == "0":
        os.environ["TVM_FFI_DISABLE_TORCH_C_DLPACK"] = "0"

    set_fast_dlpack()
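Note: the module above gates native-extension loading on environment variables (TORCHPIPE_SKIP_ALL, TORCHPIPE_SKIP_TENSORRT, TORCHPIPE_SKIP_OPENCV), all checked at import time. A minimal usage sketch based only on those flags; it is illustrative, not part of the package:

# Hypothetical usage sketch: flag names come from torchpipe/__init__.py above.
# They must be set before `import torchpipe` because the checks run on import.
import os

os.environ["TORCHPIPE_SKIP_TENSORRT"] = "1"  # skip JIT-loading the TensorRT extension
os.environ["TORCHPIPE_SKIP_OPENCV"] = "1"    # skip JIT-loading the OpenCV extension
# os.environ["TORCHPIPE_SKIP_ALL"] = "1"     # skip every native extension

import torchpipe  # core (and CUDA core, if available) extensions still load
print(torchpipe.__version__)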
torchpipe/csrc/core_cuda/SyncTensor.cpp
ADDED
@@ -0,0 +1,148 @@
#include "c10/cuda/CUDAFunctions.h"
#include "c10/cuda/CUDAStream.h"

#include "ATen/cuda/CUDAEvent.h"
#include "core_cuda/SyncTensor.hpp"
#include "helper/torch.hpp"
#include "helper_cuda/torch.hpp"

#include "omniback/core/helper.hpp"
#include "omniback/helper/resource_pool.hpp"

#include <tvm/ffi/extra/c_env_api.h>
#include "helper/dlpack_helper.hpp"

namespace torchpipe {
void SyncTensor::impl_init(
    const std::unordered_map<std::string, std::string>& config,
    const dict& kwargs) {
  auto dep = om::parser_v2::get_opt_dependency_name(this, config);

  const auto cls_name = om::get_cls_name(this, "SyncTensor");

  auto iter = config.find(TASK_INDEX_KEY);
  OMNI_ASSERT(
      iter != config.end(),
      "You are not in independent thread mode (TASK_INDEX_KEY was not detected). Maybe use `With[StreamPool, *]` instead");

  independent_thread_index_ = std::stoi(iter->second);
  OMNI_ASSERT(independent_thread_index_ >= 0);

  if (config.find("device_id") != config.end()) {
    throw std::runtime_error(
        "SyncTensor: device_id is not supported by SyncTensor yet.");
  }

  const auto device_id_int = c10::cuda::current_device(); //- 1;

  c10::cuda::getCurrentCUDAStream().synchronize();

  bNeedSync_ = torch_not_use_default_stream(device_id_int, true);
  // The scheduler guarantees that init and forward run in the same thread.
  OMNI_ASSERT(
      bNeedSync_,
      "This backend can only be used on the default current stream. Maybe use `With[StreamPool,*]` instead.");

  TVMFFIStreamHandle out_original_stream{nullptr};
  TVMFFIStreamHandle in_stream = c10::cuda::getCurrentCUDAStream().stream();
  TVM_FFI_ICHECK(
      0 ==
      TVMFFIEnvSetStream(
          kDLCUDA, device_id_int, in_stream, &out_original_stream));
  DLPackManagedTensorAllocator opt_out_original_allocator{nullptr};
  TVM_FFI_ICHECK(nullptr == TVMFFIEnvGetDLPackManagedTensorAllocator());
  // https://github.com/apache/tvm-ffi/blob/6e7cafab78cb007d066bc860c600e2ba80b4d1a7/python/tvm_ffi/utils/_build_optional_torch_c_dlpack.py#L535
  TVM_FFI_ICHECK(
      0 ==
      TVMFFIEnvSetDLPackManagedTensorAllocator(
          torch_allocator(), 0, &opt_out_original_allocator));
  TVM_FFI_ICHECK(nullptr == out_original_stream);
  // TVM_FFI_ICHECK(nullptr == opt_out_original_allocator);

  if (dep && !owned_backend_) {
    owned_backend_ = om::init_backend(*dep, config, kwargs);
  }
  // ManagedTensorAllocator

  c10::cuda::getCurrentCUDAStream().synchronize();

  OMNI_ASSERT(c10::cuda::device_count() >= 1);

  return;
}

void SyncTensor::impl_forward(const std::vector<dict>& ios) {
  // std::string sync_stream = dict_get<std::string>(ios[0],
  // "sync_stream",true);

  if (owned_backend_) {
    static auto curr_stream = c10::cuda::getCurrentCUDAStream(-1);
    static auto default_stream = c10::cuda::getDefaultCUDAStream(-1);

    event_.record(default_stream);
    event_.block(curr_stream);
  }
  impl_dep_forward(ios);

  c10::cuda::getCurrentCUDAStream().synchronize();
}

OMNI_REGISTER(om::Backend, SyncTensor, "SyncTensor,StreamGuard");

class TorchStreamPool : public om::Backend {
  void impl_init(
      const std::unordered_map<std::string, std::string>& params,
      const dict& options) override {
    auto [args, kwargs] =
        om::parser_v2::get_args_kwargs(this, "TorchStreamPool", params);
    om::str::try_update<size_t>(kwargs, "max_stream", max_stream_count_);
    OMNI_ASSERT(max_stream_count_ > 0 && max_stream_count_ < 32);
    stream_pool_ =
        std::make_unique<om::pool::ResourcePool<size_t>>(max_stream_count_);
    for (size_t i = 0; i < max_stream_count_; ++i) {
      auto stream = c10::cuda::getStreamFromPool(true, -1);
      stream_event_.emplace_back(
          StreamWithEvent{std::move(stream), at::cuda::CUDAEvent()});
    }
  }

  void impl_forward_with_dep(const std::vector<om::dict>& ios, om::Backend& dep)
      override {
    auto index = stream_pool_->acquire();
    om::pool::ResourcePool<size_t>::lease_guard guard(
        stream_pool_.get(), index);
    auto& se = stream_event_.at(index);
    auto original_stream = c10::cuda::getCurrentCUDAStream(-1);
    if (se.stream != original_stream) {
      c10::cuda::CUDAStreamGuard s(se.stream);
      // https://stackoverflow.com/questions/15501699/cudastreamwaitevent-does-not-seem-to-wait
      se.event.record(original_stream);
      se.event.block(se.stream);

      dep.safe_forward(ios);

      se.event.record(se.stream);
      se.event.block(original_stream);
    } else {
      dep.safe_forward(ios);
    }
  }

  [[nodiscard]] uint32_t impl_max() const override {
    return std::numeric_limits<uint32_t>::max();
  }

 private:
  struct StreamWithEvent {
    c10::cuda::CUDAStream stream;
    at::cuda::CUDAEvent event;
    // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT_1gf4fcb74343aa689f4159791967868446
  };
  size_t max_stream_count_{1};
  std::vector<StreamWithEvent> stream_event_;
  std::unique_ptr<om::pool::ResourcePool<size_t>> stream_pool_;
};

OMNI_REGISTER(om::Backend, TorchStreamPool, "TorchStreamPool, StreamPool");
} // namespace torchpipe
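Note: the event record/block pairing in TorchStreamPool::impl_forward_with_dep is the standard way to order work between the caller's stream and a pooled stream. A minimal sketch of the same pattern using only the public torch.cuda API (illustrative; this is not torchpipe code):

import torch

def run_on_pooled_stream(fn, pooled_stream: torch.cuda.Stream):
    """Run fn() on pooled_stream, ordered after and before work on the current stream."""
    original = torch.cuda.current_stream()
    start, done = torch.cuda.Event(), torch.cuda.Event()

    start.record(original)            # mark the point the pooled stream must wait for
    pooled_stream.wait_event(start)   # pooled stream waits for prior work on `original`
    with torch.cuda.stream(pooled_stream):
        out = fn()
    done.record(pooled_stream)        # mark completion of the pooled-stream work
    original.wait_event(done)         # the original stream resumes only after `done`
    return out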
torchpipe/csrc/core_cuda/SyncTensor.hpp
ADDED
@@ -0,0 +1,134 @@
#pragma once

#include <omniback/extension.hpp>
#include <string>
#include <unordered_set>

#include <ATen/cuda/CUDAEvent.h>
#include "c10/cuda/CUDAGuard.h"

using om::dict;

namespace torchpipe {
class SyncTensor : public om::Backend {
 public:
  /**
   * @brief
   * Initialization: determines whether the default stream is being used; if
   * so, and in independent thread mode, it binds a new CUDA stream to the
   * current thread for asynchronous GPU execution. (Renamed from Torch)
   *
   * @param TASK_INDEX_KEY When this parameter is not null, it indicates
   * independent thread mode; in that case it can be assumed that init and
   * forward run in the same independent thread. Check whether the CUDA
   * stream is the default stream; if not, bind the thread to a new stream
   * and set bNeedSync_ to true, otherwise do nothing.
   * @param SyncTensor::backend Default is Identity. The backend it forwards
   * execution to.
   * @note Usage: SyncTensor[A], SequentialV0[A,B,C,SyncTensor] or
   * SequentialV0[A,B,SyncTensor[C]]. For serial units, such as
   * SequentialV0[SyncTensor[A],SyncTensor[B]], initialization runs in reverse
   * order and forward in order: SyncTensor[B].init -> SyncTensor[A].init ->
   * SyncTensor[A].forward -> SyncTensor[B].forward. SyncTensor[A] is not on
   * the default stream at initialization, so it neither sets a new stream nor
   * takes responsibility for stream synchronization during forward; in that
   * case, if SyncTensor[B] has set a new stream, SyncTensor[B] is responsible
   * for stream synchronization.
   *
   * @ref SequentialV0 and other containers guarantee that the
   * initialization order of their child backends is the reverse of the
   * forward order; therefore, even in complex situations, the correct stream
   * synchronization timing can be obtained.
   */
  virtual void impl_init(
      const std::unordered_map<std::string, std::string>&,
      const dict&) override;

  // virtual void post_init(
  //     const std::unordered_map<std::string, std::string>&,
  //     const dict&) override;
  /**
   * @brief
   * If a new stream was bound during init (i.e. bNeedSync_ == true), then
   * during forward, synchronize on the current stream once the child backend
   * has finished.
   */
  virtual void impl_forward(const std::vector<dict>& ios) override;

  [[nodiscard]] uint32_t impl_max() const override {
    if (owned_backend_) {
      return owned_backend_->max();
    } else {
      return std::numeric_limits<uint32_t>::max(); // default
    }
  }

  [[nodiscard]] uint32_t impl_min() const override {
    if (owned_backend_) {
      return owned_backend_->min();
    } else {
      return 1; // default
    }
  }

 private:
  void impl_dep_forward(const std::vector<dict>& ios) {
    if (owned_backend_)
      owned_backend_->forward(ios);
    else {
      for (const auto& io : ios) {
        (*io)[TASK_RESULT_KEY] = io->at(TASK_DATA_KEY);
      }
    }
  }

  bool bNeedSync_ = false;
  std::unique_ptr<Backend> owned_backend_;
  int independent_thread_index_{-1};
  std::optional<c10::cuda::CUDAStream> stream_;
  at::cuda::CUDAEvent event_;
};

// class StreamGuard : public om::DependencyV0 {
//  public:
//   /**
//    * @brief
//    * Initialization: determines whether the default stream is being used; if
//    * so, and in independent thread mode, it binds a new CUDA stream to the
//    * current thread for asynchronous GPU execution. (Renamed from Torch)
//    *
//    * @param TASK_INDEX_KEY When this parameter is not null, it indicates
//    * independent thread mode; in that case it can be assumed that init and
//    * forward run in the same independent thread. Check whether the CUDA
//    * stream is the default stream; if not, bind the thread to a new stream
//    * and set bNeedSync_ to true, otherwise do nothing.
//    * @param StreamGuard::backend Default is Identity. The backend it forwards
//    * execution to.
//    *
//    * @ref SequentialV0 and other containers guarantee that the
//    * initialization order of their child backends is the reverse of the
//    * forward order; therefore, even in complex situations, the correct stream
//    * synchronization timing can be obtained.
//    */
//   virtual void pre_init(
//       const std::unordered_map<std::string, std::string>&,
//       const dict&) override;

//   virtual void post_init(
//       const std::unordered_map<std::string, std::string>&,
//       const dict&) override;
//   /**
//    * @brief
//    * If a new stream was bound during init (i.e. bNeedSync_ == true), then
//    * during forward, synchronize on the current stream once the child backend
//    * has finished.
//    */
//   virtual void custom_forward_with_dep(const std::vector<dict>&, Backend*)
//       override;

//  private:
//   // bool bNeedSync_ = false;
//   std::optional<c10::cuda::CUDAStream> stream_;
//   std::unique_ptr<c10::cuda::CUDAStreamGuard> stream_guard_;
// };
} // namespace torchpipe
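Note: the doc comment above depends on containers such as SequentialV0 initializing children in reverse order while forwarding them in order. A toy sketch of that documented contract (an illustration only, not the SequentialV0 implementation):

class ToySequential:
    """Illustrates the ordering contract documented for SequentialV0:
    init runs back-to-front, forward runs front-to-back."""

    def __init__(self, children):
        self.children = children
        for child in reversed(children):   # e.g. SyncTensor[B].init before SyncTensor[A].init
            child.init()

    def forward(self, ios):
        for child in self.children:        # e.g. SyncTensor[A].forward before SyncTensor[B].forward
            child.forward(ios)
        return ios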
torchpipe/csrc/core_cuda/kvcacheTensor.cpp
ADDED
@@ -0,0 +1,160 @@
#include <numeric>
#include <unordered_map>

#include <cuda_runtime_api.h>
#include "c10/cuda/CUDAStream.h"
#include "omniback/builtin/page_table.hpp"
#include "omniback/core/backend.hpp"

#include <torch/torch.h>
#include "omniback/helper/base_logging.hpp"

namespace torchpipe {
class LocationManager {
 public:
};

using namespace om;
class FIAppendTensor : public om::BackendOne {
 private:
  size_t max_num_req_{16};
  size_t max_num_page_{0};
  size_t max_context_len_{4096};
  size_t num_layer_{32};
  size_t head_num_{32};
  size_t head_dim_{128};
  size_t page_size_{16};

 private:
  size_t max_num_page_per_seq_{0};
  bool inited_{false};
  // std::vector<torch::Tensor> k_;
  // std::vector<torch::Tensor> v_;
  std::unique_ptr<PageTable> pool_;

 private:
  void impl_init(
      const std::unordered_map<string, string>& params,
      const dict& options) override {
    str::try_update(params, "max_num_req", max_num_req_);
    max_num_page_ = str::get<size_t>(params, "max_num_page");
    // str::try_update(params, "max_context_len", max_context_len_);
    // str::try_update(params, "num_layer", num_layer_);
    // str::try_update(params, "head_num", head_num_);
    // str::try_update(params, "head_dim", head_dim_);
    OMNI_ASSERT(max_context_len_ % page_size_ == 0);
    max_num_page_per_seq_ = max_context_len_ / page_size_;
  }

  // void get(torch::Tensor kv_append_length) {
  //   for (size_t i = 0; i < kv_append_length.size(0); ++i) {
  //   }

  //   torch::Tensor batch_indices;
  //   torch::Tensor positions;

  //   torch::Tensor kv_page_indptr;
  //   torch::Tensor kv_last_page_len;
  //   torch::Tensor kv_append_indptr;
  // }
  torch::Tensor vec2tensor(const std::vector<int>& data) {
    thread_local auto options = torch::TensorOptions()
                                    .device(torch::kCUDA, -1)
                                    .dtype(torch::kInt) // torch::kByte
                                    .layout(torch::kStrided)
                                    .requires_grad(false);
    torch::Tensor re =
        torch::empty({static_cast<int64_t>(data.size())}, options);
    cudaError_t cuda_status = cudaMemcpyAsync(
        re.data_ptr(), // destination device pointer
        data.data(), // host source pointer
        data.size() * sizeof(int), // size in bytes
        cudaMemcpyHostToDevice, // transfer direction
        c10::cuda::getCurrentCUDAStream());
    if (cuda_status != cudaSuccess) {
      throw std::runtime_error(
          "CUDA copy failed: " + std::string(cudaGetErrorString(cuda_status)));
    }
    return re;
  }

  void forward(const dict& io) override {
    if (!inited_)
      lazy_init();
    // in
    // id, type(prefill, decode), seq_len,
    // out
    bool success = true;
    std::vector<id_type> id = dict_gets<id_type>(io, "request_ids");
    // https://docs.flashinfer.ai/generated/flashinfer.page.append_paged_kv_cache.html#flashinfer.page.append_paged_kv_cache
    torch::Tensor seq_lens = dict_get<torch::Tensor>(io, "kv_append_length");
    OMNI_ASSERT(id.size() == seq_lens.size(0) && seq_lens.is_cpu());
    size_t total{0};

    for (size_t i = 0; i < id.size(); ++i) {
      SPDLOG_INFO("id = {}", id[i]);
      auto seq_len = seq_lens[i].item<int>();
      success = success && pool_->alloc(id[i], seq_len);
      OMNI_ASSERT(success);
      const auto& infor = pool_->page_info(id[i]);
      total += infor.kv_page_indices.size();
    }
    std::vector<int> kv_page_indices;
    kv_page_indices.reserve(total);

    std::vector<int> kv_page_indptr(1 + id.size(), 0);
    std::vector<int> kv_last_page_len(id.size());
    for (size_t i = 0; i < id.size(); ++i) {
      const auto& infor = pool_->page_info(id[i]);
      kv_page_indices.insert(
          kv_page_indices.end(),
          infor.kv_page_indices.begin(),
          infor.kv_page_indices.end());
      kv_page_indptr[i + 1] = kv_page_indptr[i] + infor.kv_page_indices.size();
      kv_last_page_len[i] = infor.kv_last_page_len;
    }

    auto kv_page_indices_CUDA = vec2tensor(kv_page_indices);
    auto kv_page_indptr_CUDA = vec2tensor(kv_page_indptr);
    auto kv_last_page_len_CUDA = vec2tensor(kv_last_page_len);

    (*io)["kv_page_indices"] = kv_page_indices_CUDA;
    (*io)["kv_page_indptr"] = kv_page_indptr_CUDA;
    (*io)["kv_last_page_len"] = kv_last_page_len_CUDA;
  }

  void lazy_init() {
    // if (max_num_page_ == 0) {
    //   auto stats = torch::cuda::memory_stats(-1);
    //   int64_t free_memory = stats.free_bytes; // remaining GPU memory in bytes

    //   max_num_page_ = static_cast<size_t>(
    //       (free_memory * 0.9) /
    //       (page_size_ * head_num_ * head_dim_ * 2 /*kv*/ * num_layer_ *
    //        2 /*fp16 */));
    // }

    // k_.resize(num_layer_);
    // v_.resize(num_layer_);
    // auto options = torch::TensorOptions()
    //                    .device(torch::kCUDA, -1)
    //                    .dtype(torch::kFloat16)
    //                    .layout(torch::kStrided)
    //                    .requires_grad(false);
    // for (size_t layer_index = 0; layer_index < num_layer_; ++layer_index) {
    //   k_[layer_index] = torch::empty(
    //       {max_num_page_, page_size_, head_num_, head_dim_},
    //       options,
    //       torch::MemoryFormat::Contiguous);
    //   v_[layer_index] = torch::empty(
    //       {max_num_page_, page_size_, head_num_, head_dim_},
    //       options,
    //       torch::MemoryFormat::Contiguous);
    // }
    pool_ =
        std::make_unique<PageTable>(max_num_req_, max_num_page_, page_size_);
    inited_ = true;
  }
};
OMNI_REGISTER_BACKEND(FIAppendTensor);
} // namespace torchpipe
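Note: FIAppendTensor::forward packs per-request page tables into the CSR-style arrays that flashinfer's append_paged_kv_cache expects (kv_page_indices, kv_page_indptr, kv_last_page_len). A plain-Python sketch of that bookkeeping, assuming a hypothetical page_info(request_id) lookup that mirrors the PageTable fields used above:

def build_kv_metadata(request_ids, page_info):
    """Flatten per-request page lists into CSR-style arrays (see FIAppendTensor above).

    `page_info(rid)` is a hypothetical lookup returning an object with
    `kv_page_indices` (list[int]) and `kv_last_page_len` (int).
    """
    kv_page_indices = []
    kv_page_indptr = [0]
    kv_last_page_len = []
    for rid in request_ids:
        info = page_info(rid)
        kv_page_indices.extend(info.kv_page_indices)    # all pages, concatenated
        kv_page_indptr.append(len(kv_page_indices))     # prefix sum over page counts
        kv_last_page_len.append(info.kv_last_page_len)  # valid slots in the last page
    return kv_page_indices, kv_page_indptr, kv_last_page_len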
torchpipe/csrc/helper/dlpack_helper.hpp
ADDED
@@ -0,0 +1,26 @@
#pragma once
#include <string>
#include <unordered_set>

#include <tvm/ffi/extra/c_env_api.h>
#include <tvm/ffi/c_api.h>
#include <tvm/ffi/error.h>
#include "omniback/ffi/types.hpp"


namespace torchpipe {

inline DLPackManagedTensorAllocator torch_allocator() {
  DLPackExchangeAPI* api =
      reinterpret_cast<DLPackExchangeAPI*>(om::ffi::dlpack_exchange_api());
  if (api) {
    static DLPackManagedTensorAllocator& alloc = api->managed_tensor_allocator;
    TVM_FFI_ICHECK(alloc);
    return alloc;
  } else {
    return nullptr;
  }
}

} // namespace torchpipe
torchpipe/csrc/helper/mat.cpp
ADDED
@@ -0,0 +1,18 @@
#include "helper/mat.hpp"
// #include <torch/torch.h>
#include "helper/torch.hpp"

namespace torchpipe::convert {
ImageData TorchAny2ImageData(om::any tensor) {
  torch::Tensor data = tensor.cast<torch::Tensor>();
  return torchpipe::torch2ImageData(data);
}

om::any imageDataToAnyTorchCPU(const convert::ImageData& img) {
  return torchpipe::imageDataToTorchCPU(img);
}

om::any imageDataToAnyTorchGPU(const convert::ImageData& img) {
  return torchpipe::imageDataToTorchCPU(img).cuda();
}
} // namespace torchpipe::convert
torchpipe/csrc/helper/mat.hpp
ADDED
@@ -0,0 +1,22 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <functional>
#include "omniback/core/any.hpp"

namespace torchpipe::convert {

struct ImageData {
  void* data = nullptr;
  size_t rows = 0;
  size_t cols = 0;
  size_t channels = 0;
  bool is_float = false; // true: float32, false: uint8
  std::function<void(void*)> deleter;
};
ImageData TorchAny2ImageData(om::any tensor);

om::any imageDataToAnyTorchCPU(const convert::ImageData& img);

om::any imageDataToAnyTorchGPU(const convert::ImageData& img);
} // namespace torchpipe::convert
torchpipe/csrc/helper/net_info.cpp
ADDED
@@ -0,0 +1,57 @@
#include <optional>
#include "helper/net_info.hpp"

namespace torchpipe {
inline bool is_all_positive(const NetIOInfo::Dims64& dims) {
  if (dims.nbDims <= 0) return false;
  for (size_t index = 0; index < dims.nbDims; ++index) {
    if (dims.d[index] <= 0) return false;
  }
  return true;
}

bool is_all_positive(NetIOInfos& info) {
  for (const auto& item : info.first) {
    if (!is_all_positive(item.min)) return false;
  }
  for (const auto& item : info.second) {
    if (!is_all_positive(item.min)) return false;
  }
  for (const auto& item : info.first) {
    if (!is_all_positive(item.max)) return false;
  }
  for (const auto& item : info.second) {
    if (!is_all_positive(item.max)) return false;
  }

  return true;
}

size_t elementSize(NetIOInfo::DataType info) {
  switch (info) {
    case NetIOInfo::DataType::INT4:
    case NetIOInfo::DataType::FP4:
      return 1; // 4 bits = 0.5 bytes; rounded up to 1 byte here
    case NetIOInfo::DataType::INT8:
    case NetIOInfo::DataType::UINT8:
    case NetIOInfo::DataType::BOOL:
    case NetIOInfo::DataType::FP8:
      return 1; // 8 bits = 1 byte
    case NetIOInfo::DataType::INT32:
    case NetIOInfo::DataType::FP32:
    case NetIOInfo::DataType::BF32:
      return 4; // 32 bits = 4 bytes
    case NetIOInfo::DataType::INT64:
      return 8; // 64 bits = 8 bytes
    case NetIOInfo::DataType::FP16:
    case NetIOInfo::DataType::BF16:
      return 2; // 16 bits = 2 bytes
    case NetIOInfo::DataType::RESERVED_INT:
    case NetIOInfo::DataType::RESERVED_FP:
    case NetIOInfo::DataType::RESERVED_BF:
    case NetIOInfo::DataType::UNKNOWN:
    default:
      return 0; // Unknown or reserved types
  }
}
} // namespace torchpipe
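Note: elementSize() together with fully positive min/max dims is enough to bound an I/O binding's buffer size. A small illustrative sketch of that arithmetic (the byte mapping copies the switch above; the helper name and example shape are hypothetical):

from math import prod

# Byte sizes copied from the elementSize() switch above (sub-byte INT4/FP4 rounded up to 1).
ELEMENT_SIZE = {"INT4": 1, "FP4": 1, "INT8": 1, "UINT8": 1, "BOOL": 1, "FP8": 1,
                "FP16": 2, "BF16": 2, "INT32": 4, "FP32": 4, "BF32": 4, "INT64": 8}

def max_binding_bytes(max_dims, dtype):
    """Upper bound on one binding's buffer size, valid only if all dims are positive."""
    if any(d <= 0 for d in max_dims):
        raise ValueError("dynamic (non-positive) dims: size cannot be bounded")
    return prod(max_dims) * ELEMENT_SIZE.get(dtype, 0)

# e.g. a [8, 3, 224, 224] FP16 input needs 8*3*224*224*2 = 2,408,448 bytes at most.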