torchpipe 0.1.23__py3-none-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. torchpipe/__init__.py +99 -0
  2. torchpipe/csrc/core_cuda/SyncTensor.cpp +148 -0
  3. torchpipe/csrc/core_cuda/SyncTensor.hpp +134 -0
  4. torchpipe/csrc/core_cuda/kvcacheTensor.cpp +160 -0
  5. torchpipe/csrc/helper/dlpack_helper.hpp +26 -0
  6. torchpipe/csrc/helper/mat.cpp +18 -0
  7. torchpipe/csrc/helper/mat.hpp +22 -0
  8. torchpipe/csrc/helper/net_info.cpp +57 -0
  9. torchpipe/csrc/helper/net_info.hpp +60 -0
  10. torchpipe/csrc/helper/task_keys.hpp +9 -0
  11. torchpipe/csrc/helper/torch.cpp +794 -0
  12. torchpipe/csrc/helper/torch.hpp +149 -0
  13. torchpipe/csrc/helper_cuda/torch.cpp +180 -0
  14. torchpipe/csrc/helper_cuda/torch.hpp +101 -0
  15. torchpipe/csrc/mat_torch/CvtColorMat.cpp +43 -0
  16. torchpipe/csrc/mat_torch/CvtColorMat.hpp +18 -0
  17. torchpipe/csrc/mat_torch/DecodeMat.cpp +61 -0
  18. torchpipe/csrc/mat_torch/DecodeMat.hpp +19 -0
  19. torchpipe/csrc/mat_torch/ResizeMat.cpp +179 -0
  20. torchpipe/csrc/mat_torch/ResizeMat.hpp +60 -0
  21. torchpipe/csrc/mat_torch/converts.cpp +119 -0
  22. torchpipe/csrc/mat_torch/converts.hpp +33 -0
  23. torchpipe/csrc/mat_torch/mat2tensor.hpp +2 -0
  24. torchpipe/csrc/nvjpeg_torch/DecodeTensor.cpp +186 -0
  25. torchpipe/csrc/nvjpeg_torch/DecodeTensor.hpp +24 -0
  26. torchpipe/csrc/nvjpeg_torch/README.md +3 -0
  27. torchpipe/csrc/tensorrt_torch/TensorrtInferTensor.cpp +234 -0
  28. torchpipe/csrc/tensorrt_torch/TensorrtInferTensor.hpp +47 -0
  29. torchpipe/csrc/tensorrt_torch/TensorrtTensor.cpp +20 -0
  30. torchpipe/csrc/tensorrt_torch/TensorrtTensor.hpp +3 -0
  31. torchpipe/csrc/tensorrt_torch/aes.cpp +574 -0
  32. torchpipe/csrc/tensorrt_torch/aes.h +856 -0
  33. torchpipe/csrc/tensorrt_torch/allocator.cpp +91 -0
  34. torchpipe/csrc/tensorrt_torch/allocator.hpp +27 -0
  35. torchpipe/csrc/tensorrt_torch/encrypt.cpp +164 -0
  36. torchpipe/csrc/tensorrt_torch/encrypt.hpp +33 -0
  37. torchpipe/csrc/tensorrt_torch/model.cpp +315 -0
  38. torchpipe/csrc/tensorrt_torch/model.hpp +68 -0
  39. torchpipe/csrc/tensorrt_torch/tensorrt_helper.cpp +1102 -0
  40. torchpipe/csrc/tensorrt_torch/tensorrt_helper.hpp +109 -0
  41. torchpipe/csrc/tensorrt_torch/tensorrt_plugins.cpp +532 -0
  42. torchpipe/csrc/tensorrt_torch/tensorrt_plugins.hpp +190 -0
  43. torchpipe/csrc/tensorrt_torch/tensorrt_plugins_anchor.cpp_ +525 -0
  44. torchpipe/csrc/tensorrt_torch/tensorrt_plugins_anchor.hpp +195 -0
  45. torchpipe/csrc/torchplugins/CropTensor.cpp +143 -0
  46. torchpipe/csrc/torchplugins/CropTensor.hpp +40 -0
  47. torchpipe/csrc/torchplugins/CvtColorTensor.cpp +61 -0
  48. torchpipe/csrc/torchplugins/CvtColorTensor.hpp +21 -0
  49. torchpipe/csrc/torchplugins/GpuTensor.cpp +293 -0
  50. torchpipe/csrc/torchplugins/GpuTensor.hpp +37 -0
  51. torchpipe/csrc/torchplugins/ResizeTensor.cpp +107 -0
  52. torchpipe/csrc/torchplugins/ResizeTensor.hpp +9 -0
  53. torchpipe/csrc/torchplugins/cat_split_tensor.cpp +217 -0
  54. torchpipe/csrc/torchplugins/cat_split_tensor.hpp +61 -0
  55. torchpipe/csrc/torchplugins/continuous_batching_tensor.cpp +43 -0
  56. torchpipe/csrc/torchplugins/continuous_batching_tensor.hpp +18 -0
  57. torchpipe/csrc/torchplugins/torch_helper.cpp +16 -0
  58. torchpipe/csrc/torchplugins/torch_helper.hpp +18 -0
  59. torchpipe/extension.py +61 -0
  60. torchpipe/group-torchpipe.toml +33 -0
  61. torchpipe/jit/_build_SyncTensor.py +51 -0
  62. torchpipe/lib/.gitignore +2 -0
  63. torchpipe/lib/torchpipe_core-torch113-cpu-abiflag0.so +0 -0
  64. torchpipe/lib/torchpipe_core-torch210-cpu-abiflag1.so +0 -0
  65. torchpipe/lib/torchpipe_core-torch23-cpu-abiflag0.so +0 -0
  66. torchpipe/lib/torchpipe_core-torch24-cpu-abiflag0.so +0 -0
  67. torchpipe/lib/torchpipe_core-torch25-cpu-abiflag0.so +0 -0
  68. torchpipe/lib/torchpipe_core-torch26-cpu-abiflag0.so +0 -0
  69. torchpipe/lib/torchpipe_core-torch27-cpu-abiflag1.so +0 -0
  70. torchpipe/lib/torchpipe_core-torch28-cpu-abiflag1.so +0 -0
  71. torchpipe/lib/torchpipe_core-torch29-cpu-abiflag1.so +0 -0
  72. torchpipe/lib/torchpipe_opencv-abiflag0.so +0 -0
  73. torchpipe/lib/torchpipe_opencv-abiflag1.so +0 -0
  74. torchpipe/load_libs.py +372 -0
  75. torchpipe/load_omniback.py_ +74 -0
  76. torchpipe/serve/__init__.py +0 -0
  77. torchpipe/serve/api_protocol.py +267 -0
  78. torchpipe/serve/errors.py +11 -0
  79. torchpipe/serve/openai/__init__.py +20 -0
  80. torchpipe/serve/openai/async_backend_engine.py +178 -0
  81. torchpipe/serve/openai/client.sh +9 -0
  82. torchpipe/serve/openai/openai_server_api.py +445 -0
  83. torchpipe/serve/output.py +93 -0
  84. torchpipe/serve/register.py +36 -0
  85. torchpipe/serve/server_args.py +127 -0
  86. torchpipe/serve/streaming_response.py +90 -0
  87. torchpipe/utils/.gitignore +2 -0
  88. torchpipe/utils/__init__.py +15 -0
  89. torchpipe/utils/_build_cv.py +372 -0
  90. torchpipe/utils/_build_trt.py +264 -0
  91. torchpipe/utils/_cache_setting.py +8 -0
  92. torchpipe/utils/_group_setting.py_ +14 -0
  93. torchpipe/utils/benchmark.py +953 -0
  94. torchpipe/utils/model_helper.py +1065 -0
  95. torchpipe-0.1.23.dist-info/METADATA +33 -0
  96. torchpipe-0.1.23.dist-info/RECORD +99 -0
  97. torchpipe-0.1.23.dist-info/WHEEL +7 -0
  98. torchpipe-0.1.23.dist-info/licenses/LICENSE +210 -0
  99. torchpipe-0.1.23.dist-info/top_level.txt +1 -0
torchpipe/__init__.py ADDED
@@ -0,0 +1,99 @@
+ # isort: skip_file
+
+ from packaging import version
+ import logging
+ logger = logging.getLogger(__name__)  # type: ignore
+
+ import ctypes, os
+
+ ORI_TVM_FFI_DISABLE_TORCH_C_DLPACK = os.environ.get(
+     "TVM_FFI_DISABLE_TORCH_C_DLPACK", "0")
+ if ORI_TVM_FFI_DISABLE_TORCH_C_DLPACK == "0":
+     os.environ["TVM_FFI_DISABLE_TORCH_C_DLPACK"] = "1"
+
+ import omniback
+
+ import torch
+
+
+ try:
+     from importlib.metadata import version as _get_version
+     __version__ = _get_version("torchpipe")
+ except Exception:
+     __version__ = "0.0.0-dev"
+
+ # -----------------------
+ # assert omniback.compiled_with_cxx11_abi() == torch.compiled_with_cxx11_abi()
+
+ logger.info(f'torch.cuda.is_available() = {torch.cuda.is_available()}')
+
+ torch.set_num_threads(torch.get_num_threads())
+
+ # -----------------------
+ from .load_libs import _load_or_build_lib, _load_or_build_lib_skip_if_error  # nosort
+ from .load_libs import _setting_group_handle  # nosort
+
+ SKIP_ALL = os.environ.get("TORCHPIPE_SKIP_ALL", "0")
+
+ if SKIP_ALL != "1":
+     try:
+         _load_or_build_lib("torchpipe_core")
+         if torch.cuda.is_available():
+             _load_or_build_lib("torchpipe_core_cuda")
+     except Exception as e:
+         logger.warning(f'Failed to load or JIT compile builtin extensions: \n{e}')
+         SKIP_ALL = "1"
+     else:
+         SKIP_TENSORRT = os.environ.get("TORCHPIPE_SKIP_TENSORRT", "0")
+
+         if torch.cuda.is_available():
+             # _load_or_build_lib_skip_if_error("torchpipe_core_cuda")
+             if SKIP_TENSORRT != "1":
+                 _load_or_build_lib_skip_if_error("torchpipe_tensorrt")
+
+             _load_or_build_lib_skip_if_error("torchpipe_nvjpeg")
+         else:
+             logger.warning("[JIT] CUDA is not available, skip loading CUDA extensions.")
+
+         SKIP_OPENCV = os.environ.get("TORCHPIPE_SKIP_OPENCV", "0")
+         if SKIP_OPENCV != "1":
+             _load_or_build_lib_skip_if_error("torchpipe_opencv")
+
+         grp_config = os.path.join(os.path.dirname(__file__), "group-torchpipe.toml")
+         assert os.path.exists(grp_config), grp_config
+         _setting_group_handle(grp_config)
+         logger.info(f"Loaded group config from {grp_config}")
+
+
+ # -----------------------
+ pipe = omniback.pipe
+ Dict = omniback.Dict
+ register = omniback.register
+
+
+ # -----------------------
+ def set_fast_dlpack():
+     import tvm_ffi
+     tvm_ffi._optional_torch_c_dlpack.load_torch_c_dlpack_extension()
+     tvm_ffi._optional_torch_c_dlpack.patch_torch_cuda_stream_protocol()
+     if hasattr(torch.Tensor, "__dlpack_c_exchange_api__"):
+         api_attr = torch.Tensor.__dlpack_c_exchange_api__  # type: ignore[attr-defined]
+         if api_attr:
+             # PyCapsule - extract the pointer as an integer
+             pythonapi = ctypes.pythonapi
+             # Set restype to c_size_t to get an integer directly (avoids c_void_p quirks)
+             pythonapi.PyCapsule_GetPointer.restype = ctypes.c_size_t
+             pythonapi.PyCapsule_GetPointer.argtypes = [
+                 ctypes.py_object, ctypes.c_char_p]
+             capsule_name = b"dlpack_exchange_api"
+             api_ptr = pythonapi.PyCapsule_GetPointer(api_attr, capsule_name)
+             assert api_ptr != 0, "API pointer from PyCapsule should not be NULL"
+             omniback.ffi.set_dlpack_exchange_api(api_ptr)
+
+
+ if torch.__version__ >= torch.torch_version.TorchVersion("2.3.0"):
+     if ORI_TVM_FFI_DISABLE_TORCH_C_DLPACK == "0":
+         os.environ["TVM_FFI_DISABLE_TORCH_C_DLPACK"] = "0"
+
+     set_fast_dlpack()
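In set_fast_dlpack() above, the DLPack exchange API is handed from torch to omniback as a raw pointer extracted from the __dlpack_c_exchange_api__ PyCapsule. A minimal self-contained sketch of that capsule round trip (the capsule below wraps a plain ctypes buffer purely for illustration; only PyCapsule_New and PyCapsule_GetPointer are standard CPython C-API calls):

import ctypes

pythonapi = ctypes.pythonapi
# Declare the CPython capsule API signatures before calling them.
pythonapi.PyCapsule_New.restype = ctypes.py_object
pythonapi.PyCapsule_New.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
pythonapi.PyCapsule_GetPointer.restype = ctypes.c_size_t  # integer, avoids c_void_p quirks
pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]

buf = ctypes.create_string_buffer(b"fake api struct")  # stand-in for a DLPackExchangeAPI*
name = b"dlpack_exchange_api"                          # must match the capsule's name
capsule = pythonapi.PyCapsule_New(ctypes.cast(buf, ctypes.c_void_p), name, None)

api_ptr = pythonapi.PyCapsule_GetPointer(capsule, name)
assert api_ptr != 0 and api_ptr == ctypes.addressof(buf)

Setting restype to c_size_t sidesteps ctypes' habit of returning None for NULL c_void_p results and yields a plain integer, which is the form the pointer is passed onward in.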
torchpipe/csrc/core_cuda/SyncTensor.cpp ADDED
@@ -0,0 +1,148 @@
+ #include "c10/cuda/CUDAFunctions.h"
+ #include "c10/cuda/CUDAStream.h"
+
+ #include "ATen/cuda/CUDAEvent.h"
+ #include "core_cuda/SyncTensor.hpp"
+ #include "helper/torch.hpp"
+ #include "helper_cuda/torch.hpp"
+
+ #include "omniback/core/helper.hpp"
+ #include "omniback/helper/resource_pool.hpp"
+
+ #include <tvm/ffi/extra/c_env_api.h>
+ #include "helper/dlpack_helper.hpp"
+
+ namespace torchpipe {
+ void SyncTensor::impl_init(
+     const std::unordered_map<std::string, std::string>& config,
+     const dict& kwargs) {
+   auto dep = om::parser_v2::get_opt_dependency_name(this, config);
+
+   const auto cls_name = om::get_cls_name(this, "SyncTensor");
+
+   auto iter = config.find(TASK_INDEX_KEY);
+   OMNI_ASSERT(
+       iter != config.end(),
+       "You are not in independent thread mode (TASK_INDEX_KEY was not detected). Maybe use `With[StreamPool, *]` instead.");
+
+   independent_thread_index_ = std::stoi(iter->second);
+   OMNI_ASSERT(independent_thread_index_ >= 0);
+
+   if (config.find("device_id") != config.end()) {
+     throw std::runtime_error(
+         "SyncTensor: device_id is not supported by SyncTensor yet.");
+   }
+
+   const auto device_id_int = c10::cuda::current_device(); //- 1;
+
+   c10::cuda::getCurrentCUDAStream().synchronize();
+
+   bNeedSync_ = torch_not_use_default_stream(device_id_int, true);
+   // The scheduler guarantees that init and forward run on the same thread.
+   OMNI_ASSERT(
+       bNeedSync_,
+       "This backend can only be used on the default current stream. Maybe use `With[StreamPool,*]` instead.");
+
+   TVMFFIStreamHandle out_original_stream{nullptr};
+   TVMFFIStreamHandle in_stream = c10::cuda::getCurrentCUDAStream().stream();
+   TVM_FFI_ICHECK(
+       0 ==
+       TVMFFIEnvSetStream(
+           kDLCUDA, device_id_int, in_stream, &out_original_stream));
+   DLPackManagedTensorAllocator opt_out_original_allocator{nullptr};
+   TVM_FFI_ICHECK(nullptr == TVMFFIEnvGetDLPackManagedTensorAllocator());
+   // https://github.com/apache/tvm-ffi/blob/6e7cafab78cb007d066bc860c600e2ba80b4d1a7/python/tvm_ffi/utils/_build_optional_torch_c_dlpack.py#L535
+   TVM_FFI_ICHECK(
+       0 ==
+       TVMFFIEnvSetDLPackManagedTensorAllocator(
+           torch_allocator(), 0, &opt_out_original_allocator));
+   TVM_FFI_ICHECK(nullptr == out_original_stream);
+   // TVM_FFI_ICHECK(nullptr == opt_out_original_allocator);
+
+   if (dep && !owned_backend_) {
+     owned_backend_ = om::init_backend(*dep, config, kwargs);
+   }
+   // ManagedTensorAllocator
+
+   c10::cuda::getCurrentCUDAStream().synchronize();
+
+   OMNI_ASSERT(c10::cuda::device_count() >= 1);
+
+   return;
+ }
+
+ void SyncTensor::impl_forward(const std::vector<dict>& ios) {
+   // std::string sync_stream = dict_get<std::string>(ios[0],
+   // "sync_stream", true);
+
+   if (owned_backend_) {
+     static auto curr_stream = c10::cuda::getCurrentCUDAStream(-1);
+     static auto default_stream = c10::cuda::getDefaultCUDAStream(-1);
+
+     event_.record(default_stream);
+     event_.block(curr_stream);
+   }
+   impl_dep_forward(ios);
+
+   c10::cuda::getCurrentCUDAStream().synchronize();
+ }
+
+ OMNI_REGISTER(om::Backend, SyncTensor, "SyncTensor,StreamGuard");
+
+ class TorchStreamPool : public om::Backend {
+   void impl_init(
+       const std::unordered_map<std::string, std::string>& params,
+       const dict& options) override {
+     auto [args, kwargs] =
+         om::parser_v2::get_args_kwargs(this, "TorchStreamPool", params);
+     om::str::try_update<size_t>(kwargs, "max_stream", max_stream_count_);
+     OMNI_ASSERT(max_stream_count_ > 0 && max_stream_count_ < 32);
+     stream_pool_ =
+         std::make_unique<om::pool::ResourcePool<size_t>>(max_stream_count_);
+     for (size_t i = 0; i < max_stream_count_; ++i) {
+       auto stream = c10::cuda::getStreamFromPool(true, -1);
+       stream_event_.emplace_back(
+           StreamWithEvent{std::move(stream), at::cuda::CUDAEvent()});
+     }
+   }
+
+   void impl_forward_with_dep(const std::vector<om::dict>& ios, om::Backend& dep)
+       override {
+     auto index = stream_pool_->acquire();
+     om::pool::ResourcePool<size_t>::lease_guard guard(
+         stream_pool_.get(), index);
+     auto& se = stream_event_.at(index);
+     auto original_stream = c10::cuda::getCurrentCUDAStream(-1);
+     if (se.stream != original_stream) {
+       c10::cuda::CUDAStreamGuard s(se.stream);
+       // https://stackoverflow.com/questions/15501699/cudastreamwaitevent-does-not-seem-to-wait
+       se.event.record(original_stream);
+       se.event.block(se.stream);
+
+       dep.safe_forward(ios);
+
+       se.event.record(se.stream);
+       se.event.block(original_stream);
+     } else {
+       dep.safe_forward(ios);
+     }
+   }
+
+   [[nodiscard]] uint32_t impl_max() const override {
+     return std::numeric_limits<uint32_t>::max();
+   }
+
+  private:
+   struct StreamWithEvent {
+     c10::cuda::CUDAStream stream;
+     at::cuda::CUDAEvent event;
+     // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT_1gf4fcb74343aa689f4159791967868446
+   };
+   size_t max_stream_count_{1};
+   std::vector<StreamWithEvent> stream_event_;
+   std::unique_ptr<om::pool::ResourcePool<size_t>> stream_pool_;
+ };
+
+ OMNI_REGISTER(om::Backend, TorchStreamPool, "TorchStreamPool, StreamPool");
+ } // namespace torchpipe
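TorchStreamPool above leases a side stream per request and orders work with paired event record/block calls, so the side stream sees everything already queued on the caller's stream and the caller sees the side stream's results. A hedged PyTorch-level sketch of the same fencing pattern (assumes a CUDA device; the tensor and stream names are illustrative):

import torch

assert torch.cuda.is_available()  # sketch requires a CUDA device

side = torch.cuda.Stream()
fence_in, fence_out = torch.cuda.Event(), torch.cuda.Event()

x = torch.randn(1024, 1024, device="cuda")
y = x @ x                    # queued on the current (default) stream
fence_in.record()            # fence after the work queued above

with torch.cuda.stream(side):
    fence_in.wait()          # side stream blocks until the fence passes
    z = y.relu()             # safe: the matmul is ordered before this
    fence_out.record()       # fence after the side-stream work

fence_out.wait()             # default stream blocks until z is ready
z.record_stream(torch.cuda.current_stream())  # z was allocated on `side`
torch.cuda.synchronize()
print(z.norm().item())

The record_stream call mirrors the reason the C++ keeps one event per pooled stream: the caching allocator must know a tensor allocated on one stream is consumed on another before its memory can be reused.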
torchpipe/csrc/core_cuda/SyncTensor.hpp ADDED
@@ -0,0 +1,134 @@
+ #pragma once
+
+ #include <omniback/extension.hpp>
+ #include <string>
+ #include <unordered_set>
+
+ #include <ATen/cuda/CUDAEvent.h>
+ #include "c10/cuda/CUDAGuard.h"
+
+ using om::dict;
+
+ namespace torchpipe {
+ class SyncTensor : public om::Backend {
+  public:
+   /**
+    * @brief
+    * Initialization; determines whether the default stream is in use. If
+    * so, and in independent thread mode, binds a new CUDA stream to the
+    * current thread for asynchronous GPU execution. (Renamed from Torch)
+    *
+    * @param TASK_INDEX_KEY When this parameter is present, it indicates
+    * independent thread mode; it can then be assumed that init and forward
+    * run in the same independent thread. Check whether the CUDA stream is
+    * the default stream; if not, bind the thread to a new stream and set
+    * bNeedSync_ to true, otherwise do nothing.
+    * @param SyncTensor::backend The backend execution is forwarded to.
+    * Defaults to Identity.
+    * @note Usage: SyncTensor[A], SequentialV0[A,B,C,SyncTensor] or
+    * SequentialV0[A,B,SyncTensor[C]]. For serial units such as
+    * SequentialV0[SyncTensor[A],SyncTensor[B]], initialization runs in
+    * reverse order and forward in declared order: SyncTensor[B].init ->
+    * SyncTensor[A].init -> SyncTensor[A].forward -> SyncTensor[B].forward.
+    * SyncTensor[A] is not on the default stream at initialization, so it
+    * neither sets a new stream nor needs to handle stream synchronization
+    * during forward; if SyncTensor[B] has set a new stream, then
+    * SyncTensor[B] is responsible for stream synchronization.
+    *
+    * @ref SequentialV0 and other containers guarantee that the
+    * initialization order of their child backends is the reverse of the
+    * forward order; therefore, even in complex situations, the correct
+    * stream synchronization timing is obtained.
+    */
+   virtual void impl_init(
+       const std::unordered_map<std::string, std::string>&,
+       const dict&) override;
+
+   // virtual void post_init(
+   //     const std::unordered_map<std::string, std::string>&,
+   //     const dict&) override;
+   /**
+    * @brief
+    * If a new stream was bound during init (i.e. bNeedSync_ == true),
+    * forward synchronizes on the current stream once the child backend has
+    * finished.
+    */
+   virtual void impl_forward(const std::vector<dict>& ios) override;
+
+   [[nodiscard]] uint32_t impl_max() const override {
+     if (owned_backend_) {
+       return owned_backend_->max();
+     } else {
+       return std::numeric_limits<uint32_t>::max(); // default
+     }
+   }
+
+   [[nodiscard]] uint32_t impl_min() const override {
+     if (owned_backend_) {
+       return owned_backend_->min();
+     } else {
+       return 1; // default
+     }
+   }
+
+  private:
+   void impl_dep_forward(const std::vector<dict>& ios) {
+     if (owned_backend_)
+       owned_backend_->forward(ios);
+     else {
+       for (const auto& io : ios) {
+         (*io)[TASK_RESULT_KEY] = io->at(TASK_DATA_KEY);
+       }
+     }
+   }
+
+   bool bNeedSync_ = false;
+   std::unique_ptr<Backend> owned_backend_;
+   int independent_thread_index_{-1};
+   std::optional<c10::cuda::CUDAStream> stream_;
+   at::cuda::CUDAEvent event_;
+ };
+
+ // class StreamGuard : public om::DependencyV0 {
+ //  public:
+ //   /**
+ //    * @brief
+ //    * Initialization; determines whether the default stream is in use. If
+ //    * so, and in independent thread mode, binds a new CUDA stream to the
+ //    * current thread for asynchronous GPU execution. (Renamed from Torch)
+ //    *
+ //    * @param TASK_INDEX_KEY When this parameter is present, it indicates
+ //    * independent thread mode; it can then be assumed that init and
+ //    * forward run in the same independent thread. Check whether the CUDA
+ //    * stream is the default stream; if not, bind the thread to a new
+ //    * stream and set bNeedSync_ to true, otherwise do nothing.
+ //    * @param StreamGuard::backend The backend execution is forwarded to.
+ //    * Defaults to Identity.
+ //    *
+ //    * @ref SequentialV0 and other containers guarantee that the
+ //    * initialization order of their child backends is the reverse of the
+ //    * forward order; therefore, even in complex situations, the correct
+ //    * stream synchronization timing is obtained.
+ //    */
+ //   virtual void pre_init(
+ //       const std::unordered_map<std::string, std::string>&,
+ //       const dict&) override;
+
+ //   virtual void post_init(
+ //       const std::unordered_map<std::string, std::string>&,
+ //       const dict&) override;
+ //   /**
+ //    * @brief
+ //    * If a new stream was bound during init (i.e. bNeedSync_ == true),
+ //    * forward synchronizes on the current stream once the child backend
+ //    * has finished.
+ //    */
+ //   virtual void custom_forward_with_dep(const std::vector<dict>&, Backend*)
+ //       override;
+
+ //  private:
+ //   // bool bNeedSync_ = false;
+ //   std::optional<c10::cuda::CUDAStream> stream_;
+ //   std::unique_ptr<c10::cuda::CUDAStreamGuard> stream_guard_;
+ // };
+ } // namespace torchpipe
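A hedged usage sketch of the composition described in the doc comment above. The pipe entry point is re-exported in torchpipe/__init__.py; the config key and calling convention here are assumptions carried over from earlier torchpipe releases, and A, B, C are placeholder backend names:

import torchpipe

# Assumed config shape (not verified against this release): C runs wrapped
# in SyncTensor, which owns stream synchronization for the whole chain.
config = {"backend": "SequentialV0[A,B,SyncTensor[C]]"}
model = torchpipe.pipe(config)  # init order: SyncTensor[C] -> B -> A

io = {"data": b"..."}           # forward order: A -> B -> SyncTensor[C]
model(io)
result = io.get("result")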
torchpipe/csrc/core_cuda/kvcacheTensor.cpp ADDED
@@ -0,0 +1,160 @@
+ #include <numeric>
+ #include <unordered_map>
+
+ #include <cuda_runtime_api.h>
+ #include "c10/cuda/CUDAStream.h"
+ #include "omniback/builtin/page_table.hpp"
+ #include "omniback/core/backend.hpp"
+
+ #include <torch/torch.h>
+ #include "omniback/helper/base_logging.hpp"
+
+ namespace torchpipe {
+ class LocationManager {
+  public:
+ };
+
+ using namespace om;
+ class FIAppendTensor : public om::BackendOne {
+  private:
+   size_t max_num_req_{16};
+   size_t max_num_page_{0};
+   size_t max_context_len_{4096};
+   size_t num_layer_{32};
+   size_t head_num_{32};
+   size_t head_dim_{128};
+   size_t page_size_{16};
+
+  private:
+   size_t max_num_page_per_seq_{0};
+   bool inited_{false};
+   // std::vector<torch::Tensor> k_;
+   // std::vector<torch::Tensor> v_;
+   std::unique_ptr<PageTable> pool_;
+
+  private:
+   void impl_init(
+       const std::unordered_map<string, string>& params,
+       const dict& options) override {
+     str::try_update(params, "max_num_req", max_num_req_);
+     max_num_page_ = str::get<size_t>(params, "max_num_page");
+     // str::try_update(params, "max_context_len", max_context_len_);
+     // str::try_update(params, "num_layer", num_layer_);
+     // str::try_update(params, "head_num", head_num_);
+     // str::try_update(params, "head_dim", head_dim_);
+     OMNI_ASSERT(max_context_len_ % page_size_ == 0);
+     max_num_page_per_seq_ = max_context_len_ / page_size_;
+   }
+
+   // void get(torch::Tensor kv_append_length) {
+   //   for (size_t i = 0; i < kv_append_length.size(0); ++i) {
+   //   }
+
+   //   torch::Tensor batch_indices;
+   //   torch::Tensor positions;
+
+   //   torch::Tensor kv_page_indptr;
+   //   torch::Tensor kv_last_page_len;
+   //   torch::Tensor kv_append_indptr;
+   // }
+   torch::Tensor vec2tensor(const std::vector<int>& data) {
+     thread_local auto options = torch::TensorOptions()
+                                     .device(torch::kCUDA, -1)
+                                     .dtype(torch::kInt) // torch::kByte
+                                     .layout(torch::kStrided)
+                                     .requires_grad(false);
+     torch::Tensor re =
+         torch::empty({static_cast<int64_t>(data.size())}, options);
+     cudaError_t cuda_status = cudaMemcpyAsync(
+         re.data_ptr(), // destination device pointer
+         data.data(), // host source pointer
+         data.size() * sizeof(int), // size in bytes
+         cudaMemcpyHostToDevice, // transfer direction
+         c10::cuda::getCurrentCUDAStream());
+     if (cuda_status != cudaSuccess) {
+       throw std::runtime_error(
+           "CUDA copy failed: " + std::string(cudaGetErrorString(cuda_status)));
+     }
+     return re;
+   }
+
+   void forward(const dict& io) override {
+     if (!inited_)
+       lazy_init();
+     // in
+     // id, type(prefill, decode), seq_len,
+     // out
+     bool success = true;
+     std::vector<id_type> id = dict_gets<id_type>(io, "request_ids");
+     // https://docs.flashinfer.ai/generated/flashinfer.page.append_paged_kv_cache.html#flashinfer.page.append_paged_kv_cache
+     torch::Tensor seq_lens = dict_get<torch::Tensor>(io, "kv_append_length");
+     OMNI_ASSERT(
+         static_cast<int64_t>(id.size()) == seq_lens.size(0) &&
+         seq_lens.is_cpu());
+     size_t total{0};
+
+     for (size_t i = 0; i < id.size(); ++i) {
+       SPDLOG_INFO("id = {}", id[i]);
+       auto seq_len = seq_lens[i].item<int>();
+       success = success && pool_->alloc(id[i], seq_len);
+       OMNI_ASSERT(success);
+       const auto& infor = pool_->page_info(id[i]);
+       total += infor.kv_page_indices.size();
+     }
+     std::vector<int> kv_page_indices;
+     kv_page_indices.reserve(total);
+
+     std::vector<int> kv_page_indptr(1 + id.size(), 0);
+     std::vector<int> kv_last_page_len(id.size());
+     for (size_t i = 0; i < id.size(); ++i) {
+       const auto& infor = pool_->page_info(id[i]);
+       kv_page_indices.insert(
+           kv_page_indices.end(),
+           infor.kv_page_indices.begin(),
+           infor.kv_page_indices.end());
+       kv_page_indptr[i + 1] = kv_page_indptr[i] + infor.kv_page_indices.size();
+       kv_last_page_len[i] = infor.kv_last_page_len;
+     }
+
+     auto kv_page_indices_CUDA = vec2tensor(kv_page_indices);
+     auto kv_page_indptr_CUDA = vec2tensor(kv_page_indptr);
+     auto kv_last_page_len_CUDA = vec2tensor(kv_last_page_len);
+
+     (*io)["kv_page_indices"] = kv_page_indices_CUDA;
+     (*io)["kv_page_indptr"] = kv_page_indptr_CUDA;
+     (*io)["kv_last_page_len"] = kv_last_page_len_CUDA;
+   }
+
+   void lazy_init() {
+     // if (max_num_page_ == 0) {
+     //   auto stats = torch::cuda::memory_stats(-1);
+     //   int64_t free_memory = stats.free_bytes; // free GPU memory in bytes
+
+     //   max_num_page_ = static_cast<size_t>(
+     //       (free_memory * 0.9) /
+     //       (page_size_ * head_num_ * head_dim_ * 2 /*kv*/ * num_layer_ *
+     //        2 /*fp16*/));
+     // }
+
+     // k_.resize(num_layer_);
+     // v_.resize(num_layer_);
+     // auto options = torch::TensorOptions()
+     //                    .device(torch::kCUDA, -1)
+     //                    .dtype(torch::kFloat16)
+     //                    .layout(torch::kStrided)
+     //                    .requires_grad(false);
+     // for (size_t layer_index = 0; layer_index < num_layer_; ++layer_index) {
+     //   k_[layer_index] = torch::empty(
+     //       {max_num_page_, page_size_, head_num_, head_dim_},
+     //       options,
+     //       torch::MemoryFormat::Contiguous);
+     //   v_[layer_index] = torch::empty(
+     //       {max_num_page_, page_size_, head_num_, head_dim_},
+     //       options,
+     //       torch::MemoryFormat::Contiguous);
+     // }
+     pool_ =
+         std::make_unique<PageTable>(max_num_req_, max_num_page_, page_size_);
+     inited_ = true;
+   }
+ };
+ OMNI_REGISTER_BACKEND(FIAppendTensor);
+ } // namespace torchpipe
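A pure-Python sketch of the CSR-style bookkeeping that forward() builds for flashinfer's append_paged_kv_cache: per-request page lists are flattened into kv_page_indices, kv_page_indptr is the running prefix sum over list lengths, and kv_last_page_len is the fill level of each request's last page. The page assignments below are made up for illustration; no GPU is needed:

import itertools

page_size = 16
pages_per_req = [[0, 1, 2], [3], [4, 5]]   # page ids per request (from the page table)
seq_lens = [40, 7, 32]                     # tokens per request

kv_page_indices = list(itertools.chain.from_iterable(pages_per_req))
kv_page_indptr = [0]
for pages in pages_per_req:
    kv_page_indptr.append(kv_page_indptr[-1] + len(pages))
# Last page holds the remainder, or a full page when seq_len divides evenly.
kv_last_page_len = [(n - 1) % page_size + 1 for n in seq_lens]

assert kv_page_indices == [0, 1, 2, 3, 4, 5]
assert kv_page_indptr == [0, 3, 4, 6]      # request i owns indices[indptr[i]:indptr[i+1]]
assert kv_last_page_len == [8, 7, 16]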
torchpipe/csrc/helper/dlpack_helper.hpp ADDED
@@ -0,0 +1,26 @@
+ #pragma once
+ #include <string>
+ #include <unordered_set>
+
+ #include <tvm/ffi/extra/c_env_api.h>
+ #include <tvm/ffi/c_api.h>
+ #include <tvm/ffi/error.h>
+ #include "omniback/ffi/types.hpp"
+
+
+ namespace torchpipe {
+
+ inline DLPackManagedTensorAllocator torch_allocator() {
+   DLPackExchangeAPI* api =
+       reinterpret_cast<DLPackExchangeAPI*>(om::ffi::dlpack_exchange_api());
+   if (api) {
+     static DLPackManagedTensorAllocator& alloc = api->managed_tensor_allocator;
+     TVM_FFI_ICHECK(alloc);
+     return alloc;
+   } else {
+     return nullptr;
+   }
+ }
+
+ } // namespace torchpipe
torchpipe/csrc/helper/mat.cpp ADDED
@@ -0,0 +1,18 @@
+ #include "helper/mat.hpp"
+ // #include <torch/torch.h>
+ #include "helper/torch.hpp"
+
+ namespace torchpipe::convert {
+ ImageData TorchAny2ImageData(om::any tensor) {
+   torch::Tensor data = tensor.cast<torch::Tensor>();
+   return torchpipe::torch2ImageData(data);
+ }
+
+ om::any imageDataToAnyTorchCPU(const convert::ImageData& img) {
+   return torchpipe::imageDataToTorchCPU(img);
+ }
+
+ om::any imageDataToAnyTorchGPU(const convert::ImageData& img) {
+   return torchpipe::imageDataToTorchCPU(img).cuda();
+ }
+ } // namespace torchpipe::convert
torchpipe/csrc/helper/mat.hpp ADDED
@@ -0,0 +1,22 @@
+ #pragma once
+ #include <cstddef>
+ #include <cstdint>
+ #include <functional>
+ #include "omniback/core/any.hpp"
+
+ namespace torchpipe::convert {
+
+ struct ImageData {
+   void* data = nullptr;
+   size_t rows = 0;
+   size_t cols = 0;
+   size_t channels = 0;
+   bool is_float = false; // true: float32, false: uint8
+   std::function<void(void*)> deleter;
+ };
+ ImageData TorchAny2ImageData(om::any tensor);
+
+ om::any imageDataToAnyTorchCPU(const convert::ImageData& img);
+
+ om::any imageDataToAnyTorchGPU(const convert::ImageData& img);
+ } // namespace torchpipe::convert
torchpipe/csrc/helper/net_info.cpp ADDED
@@ -0,0 +1,57 @@
+ #include <optional>
+ #include "helper/net_info.hpp"
+
+ namespace torchpipe {
+ inline bool is_all_positive(const NetIOInfo::Dims64& dims) {
+   if (dims.nbDims <= 0) return false;
+   for (int32_t index = 0; index < dims.nbDims; ++index) {
+     if (dims.d[index] <= 0) return false;
+   }
+   return true;
+ }
+
+ bool is_all_positive(NetIOInfos& info) {
+   for (const auto& item : info.first) {
+     if (!is_all_positive(item.min)) return false;
+   }
+   for (const auto& item : info.second) {
+     if (!is_all_positive(item.min)) return false;
+   }
+   for (const auto& item : info.first) {
+     if (!is_all_positive(item.max)) return false;
+   }
+   for (const auto& item : info.second) {
+     if (!is_all_positive(item.max)) return false;
+   }
+
+   return true;
+ }
+
+ size_t elementSize(NetIOInfo::DataType info) {
+   switch (info) {
+     case NetIOInfo::DataType::INT4:
+     case NetIOInfo::DataType::FP4:
+       return 1; // 4-bit types: rounded up to 1 byte per element
+     case NetIOInfo::DataType::INT8:
+     case NetIOInfo::DataType::UINT8:
+     case NetIOInfo::DataType::BOOL:
+     case NetIOInfo::DataType::FP8:
+       return 1; // 8 bits = 1 byte
+     case NetIOInfo::DataType::INT32:
+     case NetIOInfo::DataType::FP32:
+     case NetIOInfo::DataType::BF32:
+       return 4; // 32 bits = 4 bytes
+     case NetIOInfo::DataType::INT64:
+       return 8; // 64 bits = 8 bytes
+     case NetIOInfo::DataType::FP16:
+     case NetIOInfo::DataType::BF16:
+       return 2; // 16 bits = 2 bytes
+     case NetIOInfo::DataType::RESERVED_INT:
+     case NetIOInfo::DataType::RESERVED_FP:
+     case NetIOInfo::DataType::RESERVED_BF:
+     case NetIOInfo::DataType::UNKNOWN:
+     default:
+       return 0; // unknown or reserved types
+   }
+ }
+ } // namespace torchpipe
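Together with is_all_positive, elementSize is what a buffer-size computation needs; note that it rounds the 4-bit types (INT4, FP4) up to one byte per element. A sketch of the byte-size arithmetic in Python (the bit-width table mirrors the switch above; buffer_bytes is a hypothetical helper, not part of the package):

import math

# Bit widths per element; keeping bits rather than bytes lets 4-bit types
# be counted exactly instead of rounding each element up to a byte.
BITS = {
    "INT4": 4, "FP4": 4,
    "INT8": 8, "UINT8": 8, "BOOL": 8, "FP8": 8,
    "FP16": 16, "BF16": 16,
    "INT32": 32, "FP32": 32, "BF32": 32,
    "INT64": 64,
}

def buffer_bytes(dtype: str, dims: list[int]) -> int:
    bits = BITS.get(dtype)
    if bits is None or any(d <= 0 for d in dims):  # unknown type or dynamic dim
        return 0
    return math.prod(dims) * bits // 8             # exact when byte-aligned

assert buffer_bytes("FP16", [1, 3, 224, 224]) == 301056
assert buffer_bytes("INT4", [2, 16]) == 16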