tensorrt 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +182 -0
- data/Rakefile +22 -0
- data/ext/tensorrt_rb/extconf.rb +28 -0
- data/ext/tensorrt_rb/tensorrt_rb.cpp +175 -0
- data/lib/tensorrt/cuda.rb +88 -0
- data/lib/tensorrt.rb +9 -0
- data/tensorrt.gemspec +24 -0
- metadata +104 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 165592be0806398f6bb72846feb414eae2947e592cdba7876f91a92a1850d352
+  data.tar.gz: 7f585077dec7ee5de44362d6b93a365ec696b112cd63085006ea6c613e392870
+SHA512:
+  metadata.gz: 33fff8454d89035f4785c556e13207b132ae8677839b3eee073cc69bad3ab621390b41e96fb4eb94542ec00ca29f7c67fee04e24597e971d2f4244af3060d895
+  data.tar.gz: 45a0cf35ed380cb77a3d944f8355cd8945184b3584f137d5e6de8f0d1cf32f34778a53f2c782253e7f4e91310fdf0e8134a58db0f8cfc469670212d0890c3acb
data/README.md
ADDED
@@ -0,0 +1,182 @@
+# tensorrt-rb
+
+Minimal TensorRT bindings for Ruby using Rice (C++ bindings).
+
+## Requirements
+
+- Linux (x86_64 or aarch64)
+- Ruby >= 3.0
+- TensorRT (with headers and libraries)
+- CUDA runtime
+- Rice gem (`gem install rice`)
+
+## Installation
+
+```bash
+cd tensorrt-rb
+
+# Set paths if not in standard locations
+export TENSORRT_INCLUDE=/path/to/tensorrt/include
+export TENSORRT_LIB=/path/to/tensorrt/lib
+export CUDA_INCLUDE=/usr/local/cuda/include
+export CUDA_LIB=/usr/local/cuda/lib64
+
+# Build
+rake compile
+
+# Or install as gem
+gem build tensorrt.gemspec
+gem install tensorrt-*.gem
+```
+
+### Default Library Paths
+
+**x86_64:**
+- TensorRT: `/usr/include/x86_64-linux-gnu`, `/usr/lib/x86_64-linux-gnu`
+- CUDA: `/usr/local/cuda/include`, `/usr/local/cuda/lib64`
+
+**aarch64:**
+- TensorRT: `/usr/include/aarch64-linux-gnu`, `/usr/lib/aarch64-linux-gnu`
+- CUDA: `/usr/local/cuda/include`, `/usr/local/cuda/lib64`
+
+## API
+
+### TensorRT::Engine
+
+```ruby
+engine = TensorRT::Engine.new(path, verbose: false)
+
+# Tensor info
+engine.num_io_tensors          # Number of input/output tensors
+engine.get_tensor_name(index)  # Tensor name by index
+engine.is_input?(name)         # Check if tensor is input
+engine.get_tensor_shape(name)  # Shape as array [1, 3, 640, 640]
+engine.get_tensor_bytes(name)  # Size in bytes
+
+# Memory binding
+engine.set_tensor_address(name, device_ptr)
+
+# Inference
+engine.execute  # Synchronous (blocking)
+engine.enqueue  # Asynchronous (non-blocking)
+
+# Stream management
+engine.get_stream          # CUDA stream handle (uint64)
+engine.stream_synchronize  # Wait for stream completion
+```
+
+### TensorRT::CUDA
+
+```ruby
+# Memory allocation
+ptr = TensorRT::CUDA.malloc(bytes)
+TensorRT::CUDA.free(ptr)
+
+# Synchronous copy
+TensorRT::CUDA.memcpy_htod(device_ptr, host_ptr, bytes)  # Host → Device
+TensorRT::CUDA.memcpy_dtoh(host_ptr, device_ptr, bytes)  # Device → Host
+
+# Asynchronous copy
+TensorRT::CUDA.memcpy_htod_async(device_ptr, host_ptr, bytes, stream)
+TensorRT::CUDA.memcpy_dtoh_async(host_ptr, device_ptr, bytes, stream)
+
+# Synchronization
+TensorRT::CUDA.synchronize                 # All operations
+TensorRT::CUDA.stream_synchronize(stream)  # Specific stream
+```
+
+## Examples
+
+### Synchronous Inference
+
+```ruby
+require "tensorrt"
+
+engine = TensorRT::Engine.new("model.engine")
+
+# Allocate GPU memory
+input_bytes = engine.get_tensor_bytes("input")
+output_bytes = engine.get_tensor_bytes("output")
+output_size = engine.get_tensor_shape("output").reduce(1, :*)
+
+input_device = TensorRT::CUDA.malloc(input_bytes)
+output_device = TensorRT::CUDA.malloc(output_bytes)
+engine.set_tensor_address("input", input_device)
+engine.set_tensor_address("output", output_device)
+
+# Prepare input data
+input_data = preprocess_image(image_path) # Returns Numo::SFloat
+input_host = FFI::MemoryPointer.new(:float, input_data.size)
+input_host.write_bytes(input_data.to_binary)
+
+# Copy input to GPU
+TensorRT::CUDA.memcpy_htod(input_device, input_host, input_bytes)
+
+# Run inference
+engine.execute
+
+# Copy output from GPU
+output_host = FFI::MemoryPointer.new(:float, output_size)
+TensorRT::CUDA.memcpy_dtoh(output_host, output_device, output_bytes)
+output_data = output_host.read_array_of_float(output_size)
+
+# Cleanup
+TensorRT::CUDA.free(input_device)
+TensorRT::CUDA.free(output_device)
+```
+
+### Pipelined Async Inference
+
+Overlap CPU preprocessing with GPU inference for maximum throughput:
+
+```ruby
+require "tensorrt"
+
+engine = TensorRT::Engine.new("model.engine")
+stream = engine.get_stream
+
+# Allocate GPU memory
+input_bytes = engine.get_tensor_bytes("input")
+output_bytes = engine.get_tensor_bytes("output")
+output_size = engine.get_tensor_shape("output").reduce(1, :*)
+
+input_device = TensorRT::CUDA.malloc(input_bytes)
+output_device = TensorRT::CUDA.malloc(output_bytes)
+engine.set_tensor_address("input", input_device)
+engine.set_tensor_address("output", output_device)
+
+# Allocate host buffers
+input_host = FFI::MemoryPointer.new(:float, input_bytes / 4)
+output_host = FFI::MemoryPointer.new(:float, output_size)
+
+# Preload first image
+current_image = preprocess_image(images[0])
+
+images.each_with_index do |image_path, i|
+  # Copy current image to GPU (async)
+  input_host.write_bytes(current_image.to_binary)
+  TensorRT::CUDA.memcpy_htod_async(input_device, input_host, input_bytes, stream)
+
+  # Start async inference
+  engine.enqueue
+
+  # Preprocess next image on CPU while GPU is busy
+  next_image = preprocess_image(images[i + 1]) if i < images.size - 1
+
+  # Wait for GPU inference to complete
+  engine.stream_synchronize
+
+  # Copy output from GPU
+  TensorRT::CUDA.memcpy_dtoh(output_host, output_device, output_bytes)
+  output_data = output_host.read_array_of_float(output_size)
+
+  # Process results
+  process_detections(output_data)
+
+  current_image = next_image
+end
+
+# Cleanup
+TensorRT::CUDA.free(input_device)
+TensorRT::CUDA.free(output_device)
+```
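
Note: the README lists the introspection methods but never shows them used together. A minimal sketch (not part of the packaged README; assumes an engine file whose tensor names are unknown) that enumerates every I/O tensor using only the documented API:

```ruby
require "tensorrt"

engine = TensorRT::Engine.new("model.engine")

# Print name, direction, shape, and byte size for each I/O tensor
engine.num_io_tensors.times do |i|
  name = engine.get_tensor_name(i)
  kind = engine.is_input?(name) ? "input" : "output"
  puts "#{name} (#{kind}): shape=#{engine.get_tensor_shape(name).inspect}, " \
       "#{engine.get_tensor_bytes(name)} bytes"
end
```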
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+require "rake/extensiontask"
+
+task default: :compile
+
+Rake::ExtensionTask.new("tensorrt_rb") do |ext|
+  ext.lib_dir = "lib/tensorrt_rb"
+end
+
+desc "Build and install the gem"
+task :install => :compile do
+  sh "gem build tensorrt.gemspec && gem install tensorrt-*.gem"
+end
+
+desc "Clean build artifacts"
+task :clean do
+  rm_rf "tmp"
+  rm_rf "lib/tensorrt_rb/*.bundle"
+  rm_rf "lib/tensorrt_rb/*.so"
+  rm_f Dir.glob("*.gem")
+end
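
Note: `rm_rf` (FileUtils, as exposed by Rake) does not expand glob strings, so the `lib/tensorrt_rb/*.bundle` and `*.so` lines only remove files literally named that way; the `Dir.glob` form used for `*.gem` is the one that actually expands. A sketch of an equivalent clean task with globbing applied throughout:

```ruby
desc "Clean build artifacts"
task :clean do
  rm_rf "tmp"
  # Dir.glob expands the patterns before deletion
  rm_f Dir.glob("lib/tensorrt_rb/*.{bundle,so}")
  rm_f Dir.glob("*.gem")
end
```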
data/ext/tensorrt_rb/extconf.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+require "mkmf-rice"
+
+# TensorRT/CUDA paths for Linux (x86_64 and aarch64)
+case RUBY_PLATFORM
+when /x86_64-linux/
+  tensorrt_include = ENV["TENSORRT_INCLUDE"] || "/usr/include/x86_64-linux-gnu"
+  tensorrt_lib = ENV["TENSORRT_LIB"] || "/usr/lib/x86_64-linux-gnu"
+  cuda_include = ENV["CUDA_INCLUDE"] || "/usr/local/cuda/include"
+  cuda_lib = ENV["CUDA_LIB"] || "/usr/local/cuda/lib64"
+when /aarch64-linux/
+  tensorrt_include = ENV["TENSORRT_INCLUDE"] || "/usr/include/aarch64-linux-gnu"
+  tensorrt_lib = ENV["TENSORRT_LIB"] || "/usr/lib/aarch64-linux-gnu"
+  cuda_include = ENV["CUDA_INCLUDE"] || "/usr/local/cuda/include"
+  cuda_lib = ENV["CUDA_LIB"] || "/usr/local/cuda/lib64"
+else
+  abort "Unsupported platform: #{RUBY_PLATFORM}. Only Linux x86_64 and aarch64 are supported."
+end
+
+$INCFLAGS << " -I#{tensorrt_include} -I#{cuda_include}"
+$LDFLAGS << " -L#{tensorrt_lib} -L#{cuda_lib}"
+$LDFLAGS << " -lnvinfer -lcudart"
+
+# C++17 for Rice
+$CXXFLAGS << " -std=c++17"
+
+create_makefile("tensorrt_rb/tensorrt_rb")
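
Note: the extconf above appends flags unconditionally and never verifies that the headers or libraries exist, so a missing TensorRT install only surfaces as a compiler or linker error. A hedged sketch of stock mkmf checks that could run before `create_makefile` to fail earlier with a clearer message (assumes the path variables computed above):

```ruby
# Optional sanity checks; abort early if TensorRT/CUDA are absent.
abort "NvInfer.h not found in #{tensorrt_include}" unless find_header("NvInfer.h", tensorrt_include)
abort "libnvinfer not found in #{tensorrt_lib}" unless find_library("nvinfer", nil, tensorrt_lib)
abort "libcudart not found in #{cuda_lib}" unless find_library("cudart", nil, cuda_lib)
```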
data/ext/tensorrt_rb/tensorrt_rb.cpp
ADDED
@@ -0,0 +1,175 @@
+#include <rice/rice.hpp>
+#include <rice/stl.hpp>
+#include <NvInfer.h>
+#include <cuda_runtime.h>
+#include <fstream>
+#include <vector>
+#include <memory>
+
+using namespace Rice;
+
+class TRTLogger : public nvinfer1::ILogger {
+public:
+  bool verbose = false;
+
+  void log(Severity severity, const char* msg) noexcept override {
+    if (verbose || severity <= Severity::kWARNING) {
+      const char* level = "";
+      switch (severity) {
+        case Severity::kINTERNAL_ERROR: level = "INTERNAL_ERROR"; break;
+        case Severity::kERROR: level = "ERROR"; break;
+        case Severity::kWARNING: level = "WARNING"; break;
+        case Severity::kINFO: level = "INFO"; break;
+        case Severity::kVERBOSE: level = "VERBOSE"; break;
+      }
+      if (verbose || severity <= Severity::kWARNING) {
+        fprintf(stderr, "[TensorRT %s] %s\n", level, msg);
+      }
+    }
+  }
+};
+
+class TRTEngine {
+private:
+  TRTLogger logger;
+  std::unique_ptr<nvinfer1::IRuntime> runtime;
+  std::unique_ptr<nvinfer1::ICudaEngine> engine;
+  std::unique_ptr<nvinfer1::IExecutionContext> context;
+  std::vector<void*> bindings;
+  cudaStream_t stream;
+
+public:
+  TRTEngine(const std::string& engine_path, bool verbose = false) {
+    logger.verbose = verbose;
+
+    std::ifstream file(engine_path, std::ios::binary);
+
+    if (!file) {
+      throw std::runtime_error("Failed to open engine file: " + engine_path);
+    }
+
+    file.seekg(0, std::ios::end);
+    size_t size = file.tellg();
+    file.seekg(0, std::ios::beg);
+
+    std::vector<char> buffer(size);
+    file.read(buffer.data(), size);
+
+    runtime.reset(nvinfer1::createInferRuntime(logger));
+
+    if (!runtime) {
+      throw std::runtime_error("Failed to create TensorRT runtime");
+    }
+
+    engine.reset(runtime->deserializeCudaEngine(buffer.data(), size));
+    if (!engine) {
+      throw std::runtime_error("Failed to deserialize engine");
+    }
+
+    context.reset(engine->createExecutionContext());
+    if (!context) {
+      throw std::runtime_error("Failed to create execution context");
+    }
+
+    cudaError_t err = cudaStreamCreate(&stream);
+    if (err != cudaSuccess) {
+      throw std::runtime_error("Failed to create CUDA stream");
+    }
+
+    int num_tensors = engine->getNbIOTensors();
+
+    bindings.resize(num_tensors, nullptr);
+  }
+
+  ~TRTEngine() {
+    if (stream) {
+      cudaStreamDestroy(stream);
+    }
+  }
+
+  int num_io_tensors() const {
+    return engine->getNbIOTensors();
+  }
+
+  std::string get_tensor_name(int index) const {
+    return engine->getIOTensorName(index);
+  }
+
+  bool is_input(const std::string& name) const {
+    return engine->getTensorIOMode(name.c_str()) == nvinfer1::TensorIOMode::kINPUT;
+  }
+
+  std::vector<int64_t> get_tensor_shape(const std::string& name) const {
+    auto dims = engine->getTensorShape(name.c_str());
+    std::vector<int64_t> shape;
+    for (int i = 0; i < dims.nbDims; i++) {
+      shape.push_back(dims.d[i]);
+    }
+    return shape;
+  }
+
+  size_t get_tensor_bytes(const std::string& name) const {
+    auto dims = engine->getTensorShape(name.c_str());
+    size_t bytes = sizeof(float);
+    for (int i = 0; i < dims.nbDims; i++) {
+      bytes *= dims.d[i];
+    }
+    return bytes;
+  }
+
+  void set_tensor_address(const std::string& name, uint64_t ptr) {
+    void* addr = reinterpret_cast<void*>(ptr);
+    context->setTensorAddress(name.c_str(), addr);
+
+    for (int i = 0; i < engine->getNbIOTensors(); i++) {
+      if (name == engine->getIOTensorName(i)) {
+        bindings[i] = addr;
+        break;
+      }
+    }
+  }
+
+  bool execute() {
+    return context->executeV2(bindings.data());
+  }
+
+  bool enqueue() {
+    return context->enqueueV3(stream);
+  }
+
+  void memcpy_htod_async(uint64_t dst, const float* src, size_t count) {
+    cudaMemcpyAsync(reinterpret_cast<void*>(dst), src, count * sizeof(float),
+                    cudaMemcpyHostToDevice, stream);
+  }
+
+  void memcpy_dtoh_async(float* dst, uint64_t src, size_t count) {
+    cudaMemcpyAsync(dst, reinterpret_cast<void*>(src), count * sizeof(float),
+                    cudaMemcpyDeviceToHost, stream);
+  }
+
+  void stream_synchronize() {
+    cudaStreamSynchronize(stream);
+  }
+
+  uint64_t get_stream() const {
+    return reinterpret_cast<uint64_t>(stream);
+  }
+};
+
+extern "C" void Init_tensorrt_rb() {
+  Module rb_mTensorRT = define_module("TensorRT");
+
+  define_class_under<TRTEngine>(rb_mTensorRT, "Engine")
+    .define_constructor(Constructor<TRTEngine, const std::string&, bool>(),
+                        Arg("engine_path"), Arg("verbose") = false)
+    .define_method("num_io_tensors", &TRTEngine::num_io_tensors)
+    .define_method("get_tensor_name", &TRTEngine::get_tensor_name)
+    .define_method("is_input?", &TRTEngine::is_input)
+    .define_method("get_tensor_shape", &TRTEngine::get_tensor_shape)
+    .define_method("get_tensor_bytes", &TRTEngine::get_tensor_bytes)
+    .define_method("set_tensor_address", &TRTEngine::set_tensor_address)
+    .define_method("execute", &TRTEngine::execute)
+    .define_method("enqueue", &TRTEngine::enqueue)
+    .define_method("stream_synchronize", &TRTEngine::stream_synchronize)
+    .define_method("get_stream", &TRTEngine::get_stream);
+}
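
Note: `get_tensor_bytes` above hardcodes `sizeof(float)`, so the reported size is only correct for float32 tensors; an engine built with FP16 or INT8 I/O would get the wrong byte counts for copies. A hypothetical Ruby-side helper (not part of the gem) for callers who know their engine's element width:

```ruby
# Byte size for a tensor whose element width differs from the
# float32 assumed by Engine#get_tensor_bytes.
def tensor_bytes(engine, name, element_size)
  engine.get_tensor_shape(name).reduce(1, :*) * element_size
end

# e.g. an FP16 tensor: tensor_bytes(engine, "input", 2)
```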
data/lib/tensorrt/cuda.rb
ADDED
@@ -0,0 +1,88 @@
+# frozen_string_literal: true
+
+require "ffi"
+
+module TensorRT
+  module CUDA
+    extend FFI::Library
+
+    CUDA_LIBS = %w[
+      libcudart.so
+      libcudart.so.12
+      libcudart.so.11
+    ].freeze
+
+    begin
+      ffi_lib CUDA_LIBS
+    rescue LoadError => e
+      raise LoadError, "Could not load CUDA runtime library. Tried: #{CUDA_LIBS.join(', ')}. Error: #{e.message}"
+    end
+
+    MEMCPY_HOST_TO_DEVICE = 1
+    MEMCPY_DEVICE_TO_HOST = 2
+
+    attach_function :cudaMalloc, [:pointer, :size_t], :int
+    attach_function :cudaFree, [:pointer], :int
+    attach_function :cudaMemcpy, [:pointer, :pointer, :size_t, :int], :int
+    attach_function :cudaMemcpyAsync, [:pointer, :pointer, :size_t, :int, :pointer], :int
+    attach_function :cudaDeviceSynchronize, [], :int
+    attach_function :cudaStreamSynchronize, [:pointer], :int
+
+    Error = Class.new(StandardError)
+
+    class << self
+      def malloc(size)
+        ptr = FFI::MemoryPointer.new(:pointer)
+        err = cudaMalloc(ptr, size)
+
+        raise Error, "cudaMalloc failed with error #{err}" unless err.zero?
+
+        ptr.read_pointer.address
+      end
+
+      def free(ptr)
+        err = cudaFree(FFI::Pointer.new(ptr))
+
+        raise Error, "cudaFree failed with error #{err}" unless err.zero?
+      end
+
+      def memcpy_htod(dst, src_ptr, size)
+        err = cudaMemcpy(FFI::Pointer.new(dst), src_ptr, size, MEMCPY_HOST_TO_DEVICE)
+
+        raise Error, "cudaMemcpy H2D failed with error #{err}" unless err.zero?
+      end
+
+      def memcpy_dtoh(dst_ptr, src, size)
+        err = cudaMemcpy(dst_ptr, FFI::Pointer.new(src), size, MEMCPY_DEVICE_TO_HOST)
+
+        raise Error, "cudaMemcpy D2H failed with error #{err}" unless err.zero?
+      end
+
+      def memcpy_htod_async(dst, src_ptr, size, stream)
+        err = cudaMemcpyAsync(FFI::Pointer.new(dst), src_ptr, size, MEMCPY_HOST_TO_DEVICE, FFI::Pointer.new(stream))
+
+        raise Error, "cudaMemcpyAsync H2D failed with error #{err}" unless err.zero?
+      end
+
+      def memcpy_dtoh_async(dst_ptr, src, size, stream)
+        err = cudaMemcpyAsync(dst_ptr, FFI::Pointer.new(src), size, MEMCPY_DEVICE_TO_HOST, FFI::Pointer.new(stream))
+
+        raise Error, "cudaMemcpyAsync D2H failed with error #{err}" unless err.zero?
+      end
+
+      def synchronize
+        err = cudaDeviceSynchronize
+
+        raise Error, "cudaDeviceSynchronize failed with error #{err}" unless err.zero?
+      end
+
+      def stream_synchronize(stream)
+        err = cudaStreamSynchronize(FFI::Pointer.new(stream))
+
+        raise Error, "cudaStreamSynchronize failed with error #{err}" unless err.zero?
+      end
+    end
+
+    freeze
+  end
+end
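
Note: the CUDA module can be exercised without an engine file; a host→device→host round trip is a quick sanity check of the FFI layer. A minimal sketch (assumes a working CUDA runtime and at least one device):

```ruby
require "tensorrt"

count = 16
bytes = count * 4 # float32

src = FFI::MemoryPointer.new(:float, count)
src.write_array_of_float((1..count).map(&:to_f))
dst = FFI::MemoryPointer.new(:float, count)

dev = TensorRT::CUDA.malloc(bytes)
TensorRT::CUDA.memcpy_htod(dev, src, bytes) # host -> device
TensorRT::CUDA.memcpy_dtoh(dst, dev, bytes) # device -> host
TensorRT::CUDA.free(dev)

raise "CUDA round trip failed" unless dst.read_array_of_float(count) == (1..count).map(&:to_f)
```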
data/lib/tensorrt.rb
ADDED
data/tensorrt.gemspec
ADDED
@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+
+Gem::Specification.new do |spec|
+  spec.name = "tensorrt"
+  spec.version = "0.1.0"
+  spec.author = "Pete Mtasyburka"
+  spec.email = ["pete@docuseal.com"]
+
+  spec.summary = "Minimal TensorRT bindings for Ruby using Rice"
+  spec.description = "Minimal Ruby bindings for NVIDIA TensorRT inference using Rice"
+  spec.homepage = "https://github.com/docusealco/tensorrt-rb"
+  spec.license = "Apache-2.0"
+  spec.required_ruby_version = ">= 3.0.0"
+
+  spec.files = Dir["lib/**/*.rb", "ext/**/*.{cpp,hpp,rb}", "*.gemspec", "Rakefile", "README.md"]
+  spec.require_paths = ["lib"]
+  spec.extensions = ["ext/tensorrt_rb/extconf.rb"]
+
+  spec.add_dependency "rice", ">= 4.7"
+  spec.add_dependency "ffi", "~> 1.0"
+
+  spec.add_development_dependency "rake", "~> 13.0"
+  spec.add_development_dependency "rake-compiler", "~> 1.2"
+end
metadata
ADDED
@@ -0,0 +1,104 @@
+--- !ruby/object:Gem::Specification
+name: tensorrt
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Pete Mtasyburka
+bindir: bin
+cert_chain: []
+date: 2025-12-28 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rice
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '4.7'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '4.7'
+- !ruby/object:Gem::Dependency
+  name: ffi
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '13.0'
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+description: Minimal Ruby bindings for NVIDIA TensorRT inference using Rice
+email:
+- pete@docuseal.com
+executables: []
+extensions:
+- ext/tensorrt_rb/extconf.rb
+extra_rdoc_files: []
+files:
+- README.md
+- Rakefile
+- ext/tensorrt_rb/extconf.rb
+- ext/tensorrt_rb/tensorrt_rb.cpp
+- lib/tensorrt.rb
+- lib/tensorrt/cuda.rb
+- tensorrt.gemspec
+homepage: https://github.com/docusealco/tensorrt-rb
+licenses:
+- Apache-2.0
+metadata: {}
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 3.0.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.6.2
+specification_version: 4
+summary: Minimal TensorRT bindings for Ruby using Rice
+test_files: []