compute_unit 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.gitlab-ci.yml +45 -0
- data/.rspec +3 -0
- data/.rubocop.yml +12 -0
- data/.rubocop_todo.yml +139 -0
- data/.ruby_version +1 -0
- data/CHANGELOG.md +3 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +18 -0
- data/LICENSE.txt +21 -0
- data/README.md +74 -0
- data/Rakefile +8 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/compute_unit.gemspec +43 -0
- data/exe/list_computes +13 -0
- data/exe/update_pcidb +11 -0
- data/lib/compute_unit.rb +43 -0
- data/lib/compute_unit/asic.rb +14 -0
- data/lib/compute_unit/cache_store.rb +143 -0
- data/lib/compute_unit/compute_base.rb +65 -0
- data/lib/compute_unit/cpu.rb +36 -0
- data/lib/compute_unit/device.rb +397 -0
- data/lib/compute_unit/exceptions.rb +14 -0
- data/lib/compute_unit/formatters.rb +21 -0
- data/lib/compute_unit/gpu.rb +338 -0
- data/lib/compute_unit/gpus/amd_gpu.rb +525 -0
- data/lib/compute_unit/gpus/nvidia_gpu.rb +223 -0
- data/lib/compute_unit/logger.rb +70 -0
- data/lib/compute_unit/monkey_patches.rb +101 -0
- data/lib/compute_unit/utils.rb +26 -0
- data/lib/compute_unit/version.rb +5 -0
- metadata +142 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ComputeUnit
|
4
|
+
module Exceptions
|
5
|
+
class NoPermission < RuntimeError; end
|
6
|
+
class PermissionDenied < RuntimeError; end
|
7
|
+
class UnsupportedGPU < RuntimeError; end
|
8
|
+
class NotSupported < RuntimeError; end
|
9
|
+
class UnsupportedOSversion < RuntimeError; end
|
10
|
+
class NoWorkerName < RuntimeError; end
|
11
|
+
class NoComputeUnits < RuntimeError; end
|
12
|
+
class InvalidPCIDatabase < RuntimeError; end
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ComputeUnit
  # Display helpers that rescale amounts by 1,000,000 and optionally label
  # them as micro-BTC (presumably the inputs are BTC amounts - confirm with callers).
  module Formatters
    # Keys of an item hash whose values get rescaled by micro_formatter.
    MICRO_KEYS = %i[hourly_cost hourly_earnings kwh_cost].freeze
    # Multiplier applied to every rescaled amount.
    MICRO_FACTOR = 1_000_000

    # Rescales the cost/earning entries of a hash, leaving every other entry untouched.
    #
    # @param item [Hash] - hash that may contain :hourly_cost, :hourly_earnings or :kwh_cost
    # @param add_unit [Boolean] - when true the converted values are returned as "<value> µBTC" strings
    # @return [Hash] - a new hash, the passed in hash is not modified
    def micro_formatter(item, add_unit = false)
      converted = item.each_with_object({}) do |(key, value), data|
        next unless MICRO_KEYS.include?(key)
        # a missing reading stays nil instead of raising NoMethodError on nil * Integer
        next if value.nil?

        v = (value * MICRO_FACTOR).round(4)
        data[key] = add_unit ? "#{v} \u00B5BTC" : v
      end
      item.merge(converted)
    end

    # Rescales a single amount. Note the rounding is 1 decimal here, 4 in micro_formatter.
    #
    # @param value [Numeric] - the amount to rescale
    # @param add_unit [Boolean] - when true the value is returned as a "<value> µBTC" string
    # @return [Numeric, String]
    def value_micro_formatter(value, add_unit = false)
      v = (value * MICRO_FACTOR).round(1)
      add_unit ? "#{v} \u00B5BTC" : v
    end
  end
end
|
@@ -0,0 +1,338 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'compute_unit/compute_base'
|
4
|
+
require 'compute_unit/cache_store'
|
5
|
+
module ComputeUnit
|
6
|
+
class Gpu < ComputeBase
|
7
|
+
attr_reader :pci_loc, :bios, :name
|
8
|
+
DEVICE_CLASS = '030000'
|
9
|
+
DEVICE_CLASS_NAME = 'GPU'
|
10
|
+
attr_accessor :power_limit, :use_opencl
|
11
|
+
|
12
|
+
def compute_type
|
13
|
+
type
|
14
|
+
end
|
15
|
+
|
16
|
+
# @return [OpenCL_Device]
|
17
|
+
def opencl_device
|
18
|
+
@opencl_device ||= self.class.opencl_devices.find_all { |cu| cu[:type] == make }[index] if use_opencl
|
19
|
+
end
|
20
|
+
|
21
|
+
# @return [String] - returns the raw data of the board name from opencl, return nil if no device
|
22
|
+
def opencl_board_name
|
23
|
+
@opencl_board_name ||= opencl_device&.board_name if use_opencl
|
24
|
+
end
|
25
|
+
|
26
|
+
# @return [Integer, nil] - the number of compute units detected by opencl,
#   nil when opencl is disabled or no opencl device was matched
# not to be confused with stream processors. Can be helpful when determining which product vega56 or vega64
def opencl_units
  # opencl_device can be nil (no entry at this index), so navigate safely
  # the same way opencl_board_name does instead of raising NoMethodError
  @opencl_units ||= opencl_device&.max_compute_units&.to_i if use_opencl
end
|
31
|
+
|
32
|
+
# @return [String, nil] - the device name as reported by opencl,
#   nil when opencl is disabled or no opencl device was matched
# ie. GeForce GTX 1070 or RX 580
# @note not really needed for Nvidia types since nvidia-smi returns really complete information
def opencl_name
  # opencl_device can be nil (no entry at this index), so navigate safely
  # the same way opencl_board_name does instead of raising NoMethodError
  @opencl_name ||= opencl_device&.name if use_opencl
end
|
38
|
+
|
39
|
+
# @return [Array] - returns a list of device paths of all devices considered for display
|
40
|
+
# @note the devices are sorted by the device path
|
41
|
+
# @note this can mean AMD, NVIDIA, Intel or other crappy embedded devices
|
42
|
+
def self.devices
|
43
|
+
@devices ||= ComputeUnit::ComputeBase.devices.find_all do |device|
|
44
|
+
ComputeUnit::Device.device_class(device) == DEVICE_CLASS
|
45
|
+
end.sort
|
46
|
+
end
|
47
|
+
|
48
|
+
# @param device_path [String] - that pci bus path to the device
|
49
|
+
# @param opts [Hash]
|
50
|
+
# @option bios [String] the bios id
|
51
|
+
# @option model [String] the model name
|
52
|
+
# @option serial [String] the serial id of the device
|
53
|
+
# @option busid [String] the pci bus path of the device
|
54
|
+
# @option meta [Hash] metadata about the device
|
55
|
+
# @option index [Integer] the index of the device found in the device tree
|
56
|
+
# @option uuid [String] the uuid of the device
|
57
|
+
# @option use_opencl [Boolean] set to true if you want to get info about the device from opencl, defaults to false
|
58
|
+
def initialize(device_path, opts = {})
|
59
|
+
super(device_path, opts)
|
60
|
+
@type = :GPU
|
61
|
+
@bios = opts[:bios].upcase if opts[:bios]
|
62
|
+
@model = opts[:model]
|
63
|
+
@serial = opts[:serial]
|
64
|
+
@pci_loc = opts[:busid]
|
65
|
+
@meta = opts[:meta]
|
66
|
+
@index = opts[:index].to_i
|
67
|
+
@uuid = opts[:uuid] || opts[:serial]
|
68
|
+
@name = model
|
69
|
+
@power_offset = 0
|
70
|
+
@use_opencl = opts[:use_opencl] || false
|
71
|
+
end
|
72
|
+
|
73
|
+
def fan
|
74
|
+
raise NotImplementedError
|
75
|
+
end
|
76
|
+
|
77
|
+
def status
|
78
|
+
return 0 if utilization > 20 && power >= 50
|
79
|
+
return 2 if power < 20
|
80
|
+
|
81
|
+
1
|
82
|
+
end
|
83
|
+
|
84
|
+
def power
|
85
|
+
raise NotImplementedError
|
86
|
+
end
|
87
|
+
|
88
|
+
def pstate
|
89
|
+
raise NotImplementedError
|
90
|
+
end
|
91
|
+
|
92
|
+
# @return [Integer] - a percentage value of the current fan limit
|
93
|
+
def fan_limit
|
94
|
+
fan
|
95
|
+
end
|
96
|
+
|
97
|
+
# @return [Integer] - a percentage value of the min fan limit
|
98
|
+
def fan_min_limit
|
99
|
+
nil
|
100
|
+
end
|
101
|
+
|
102
|
+
# @return [Integer] - a percentage value of the max fan limit
|
103
|
+
def fan_max_limit
|
104
|
+
nil
|
105
|
+
end
|
106
|
+
|
107
|
+
def power_limit
|
108
|
+
raise NotImplementedError
|
109
|
+
end
|
110
|
+
|
111
|
+
def power_max_limit
|
112
|
+
raise NotImplementedError
|
113
|
+
end
|
114
|
+
|
115
|
+
def memory_total
|
116
|
+
raise NotImplementedError
|
117
|
+
end
|
118
|
+
|
119
|
+
def memory_used
|
120
|
+
raise NotImplementedError
|
121
|
+
end
|
122
|
+
|
123
|
+
def memory_free
|
124
|
+
raise NotImplementedError
|
125
|
+
end
|
126
|
+
|
127
|
+
def utilization
|
128
|
+
raise NotImplementedError
|
129
|
+
end
|
130
|
+
|
131
|
+
# @return [Integer] - the memory speed
|
132
|
+
def memory_clock
|
133
|
+
0
|
134
|
+
end
|
135
|
+
|
136
|
+
# @return [Integer] - the memory voltage
|
137
|
+
def memory_volt
|
138
|
+
0
|
139
|
+
end
|
140
|
+
|
141
|
+
# @return [Integer] - the core clock speed
|
142
|
+
def core_clock
|
143
|
+
0
|
144
|
+
end
|
145
|
+
|
146
|
+
# @return [Numeric] - returns voltage of core in mV
|
147
|
+
def core_voltage
|
148
|
+
0
|
149
|
+
end
|
150
|
+
|
151
|
+
# @return [Numeric] - returns the configured voltage of core in mV
|
152
|
+
def configured_core_voltage
|
153
|
+
0
|
154
|
+
end
|
155
|
+
|
156
|
+
def mem_info
|
157
|
+
{
|
158
|
+
index: "#{device_class_name}#{index}",
|
159
|
+
name: name,
|
160
|
+
volt: memory_volt,
|
161
|
+
clock: memory_clock,
|
162
|
+
memory_name: nil,
|
163
|
+
memory_type: nil,
|
164
|
+
memory_used: memory_used,
|
165
|
+
memory_free: memory_free,
|
166
|
+
memory_total: memory_total,
|
167
|
+
mem_temp: mem_temp
|
168
|
+
}
|
169
|
+
end
|
170
|
+
|
171
|
+
# @return [Hash] - hash of hardware status about the gpu
|
172
|
+
def status_info
|
173
|
+
{
|
174
|
+
index: "#{device_class_name}#{index}",
|
175
|
+
name: name,
|
176
|
+
bios: bios,
|
177
|
+
core_clock: core_clock,
|
178
|
+
memory_clock: memory_clock,
|
179
|
+
power: power,
|
180
|
+
fan: fan,
|
181
|
+
core_volt: core_voltage,
|
182
|
+
temp: temp,
|
183
|
+
mem_temp: mem_temp,
|
184
|
+
status: status
|
185
|
+
}
|
186
|
+
end
|
187
|
+
|
188
|
+
# @return [Hash] - hash of information about the gpu data
|
189
|
+
def hardware_info
|
190
|
+
{
|
191
|
+
uuid: uuid,
|
192
|
+
gpuId: "GPU#{index}",
|
193
|
+
syspath: device_path,
|
194
|
+
pciLoc: pci_loc,
|
195
|
+
name: name,
|
196
|
+
bios: bios,
|
197
|
+
subType: subtype,
|
198
|
+
make: make,
|
199
|
+
model: model,
|
200
|
+
vendor: vendor
|
201
|
+
}
|
202
|
+
end
|
203
|
+
|
204
|
+
# @return [Integer] - the temperature of the asic chip
|
205
|
+
def asic_temp
|
206
|
+
0
|
207
|
+
end
|
208
|
+
|
209
|
+
# @return [Integer] - temperature of the memory
|
210
|
+
def mem_temp
|
211
|
+
0
|
212
|
+
end
|
213
|
+
|
214
|
+
# @return [Integer] - the voltage reading of the card, maybe just amd cards (mV)
|
215
|
+
def vddgfx
|
216
|
+
0
|
217
|
+
end
|
218
|
+
|
219
|
+
def temp
|
220
|
+
0
|
221
|
+
end
|
222
|
+
|
223
|
+
def to_h
|
224
|
+
{
|
225
|
+
uuid: uuid,
|
226
|
+
gpuId: "GPU#{index}",
|
227
|
+
syspath: device_path,
|
228
|
+
pciLoc: pci_loc,
|
229
|
+
name: name,
|
230
|
+
bios: bios,
|
231
|
+
subType: subtype,
|
232
|
+
make: make,
|
233
|
+
model: model,
|
234
|
+
vendor: vendor,
|
235
|
+
# memory_name: nil,
|
236
|
+
# memory_type: nil,
|
237
|
+
# gpu_platform: nil,
|
238
|
+
power: power,
|
239
|
+
# power_limit: power_limit,
|
240
|
+
# power_max_limit: power_max_limit,
|
241
|
+
utilization: utilization,
|
242
|
+
# memory_used: memory_used ,
|
243
|
+
# memory_free: memory_free,
|
244
|
+
# memory_total: memory_total,
|
245
|
+
temperature: temp,
|
246
|
+
status: status,
|
247
|
+
pstate: pstate,
|
248
|
+
fanSpeed: fan,
|
249
|
+
type: compute_type,
|
250
|
+
maxTemp: nil,
|
251
|
+
mem: memory_clock,
|
252
|
+
cor: core_clock,
|
253
|
+
vlt: core_voltage,
|
254
|
+
mem_temp: mem_temp,
|
255
|
+
maxFan: nil,
|
256
|
+
dpm: nil,
|
257
|
+
vddci: nil,
|
258
|
+
maxPower: nil,
|
259
|
+
ocProfile: nil,
|
260
|
+
opencl_enabled: use_opencl
|
261
|
+
}
|
262
|
+
end
|
263
|
+
|
264
|
+
# @return [Array] - returns an array of gpu objects, sorted by index
|
265
|
+
def self.find_all(use_opencl = false)
|
266
|
+
require 'compute_unit/gpus/amd_gpu'
|
267
|
+
require 'compute_unit/gpus/nvidia_gpu'
|
268
|
+
g = ComputeUnit::AmdGpu.find_all(use_opencl) + ComputeUnit::NvidiaGpu.find_all(use_opencl)
|
269
|
+
g.sort_by(&:index)
|
270
|
+
end
|
271
|
+
|
272
|
+
# @return [CacheStore] - returns an instance of the cachestore for storing the opencl cache
|
273
|
+
def self.opencl_cache
|
274
|
+
@opencl_cache ||= ComputeUnit::CacheStore.new('opencl_cache')
|
275
|
+
end
|
276
|
+
|
277
|
+
# Looks up the opencl devices that were cached for the current system checksum.
#
# @return [Array] - array of openstruct or nil
def self.opencl_devices_from_cache
  data = opencl_cache.read_cache('opencl_compute_units', {})
  # opencl_devices writes the entry under system_checksum.to_s, so the lookup
  # must use the same stringified key or a non-String checksum never hits the cache
  data[ComputeUnit::Device.system_checksum.to_s]
end
|
282
|
+
|
283
|
+
# @return [Array] - an array of openstruct objects
|
284
|
+
def self.opencl_devices_from_platform
|
285
|
+
require 'ostruct'
|
286
|
+
# opencl takes a second to load so we cache later in the process
|
287
|
+
# which is why we need the openstruct object here
|
288
|
+
# opencl can also freeze the system if it tries to enumerate a dead GPU
|
289
|
+
# opencl should be used sparingly as a result and only read when absolutely
|
290
|
+
# necessary and when there are no dead GPUs.
|
291
|
+
# TODO: warn when dead gpus detected
|
292
|
+
begin
|
293
|
+
require 'opencl_ruby_ffi'
|
294
|
+
ComputeUnit::Logger.logger.debug('Searching for openCL devices')
|
295
|
+
OpenCL.platforms.map(&:devices).flatten.map do |d|
|
296
|
+
type = d.platform.name.include?('AMD') ? 'AMD' : 'Nvidia'
|
297
|
+
board_name = type == 'AMD' ? d.board_name_amd : ''
|
298
|
+
max_computes = d.respond_to?(:max_compute_units) ? d.max_compute_units : 0
|
299
|
+
OpenStruct.new(
|
300
|
+
name: d.name,
|
301
|
+
type: type,
|
302
|
+
board_name: board_name,
|
303
|
+
max_compute_units: max_computes
|
304
|
+
)
|
305
|
+
end
|
306
|
+
rescue OpenCL::Error::DEVICE_NOT_FOUND => e
|
307
|
+
ComputeUnit::Logger.logger.debug("OpenCL error: #{e.message}, are you root?")
|
308
|
+
[]
|
309
|
+
rescue RuntimeError => e # OpenCL::Error::PLATFORM_NOT_FOUND_KHR,
|
310
|
+
ComputeUnit::Logger.logger.debug("OpenCL error: #{e.message}")
|
311
|
+
ComputeUnit::Logger.logger.debug("OpenCL error: #{e.backtrace}")
|
312
|
+
[]
|
313
|
+
end
|
314
|
+
end
|
315
|
+
|
316
|
+
# @return [Array] - the entries of the voltage table, empty if no table is available
|
317
|
+
def voltage_table
|
318
|
+
[]
|
319
|
+
end
|
320
|
+
|
321
|
+
# @return [Array] - array of devices paths either from amd or nvidia
|
322
|
+
def self.found_devices
|
323
|
+
@found_devices ||= ComputeUnit::AmdGpu.devices + ComputeUnit::NvidiaGpu.devices
|
324
|
+
end
|
325
|
+
|
326
|
+
# @return [Array] - returns an array of opencl devices
|
327
|
+
# overwrites cache if new devices are found
|
328
|
+
# OpenCL should only be used when necessary as it can freeze sometimes
|
329
|
+
# OpenCL indexes items differently
|
330
|
+
def self.opencl_devices
|
331
|
+
@opencl_devices ||= opencl_devices_from_cache || begin
|
332
|
+
items = opencl_devices_from_platform
|
333
|
+
opencl_cache.write_cache('opencl_compute_units', ComputeUnit::Device.system_checksum.to_s => items)
|
334
|
+
items
|
335
|
+
end
|
336
|
+
end
|
337
|
+
end
|
338
|
+
end
|
@@ -0,0 +1,525 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'compute_unit/gpu'
|
4
|
+
require 'digest'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
module ComputeUnit
|
8
|
+
class AmdGpu < ComputeUnit::Gpu
|
9
|
+
MAKE = 'AMD'
|
10
|
+
VENDOR_ID = '1002'
|
11
|
+
SUBTYPE = 'amdgpu'
|
12
|
+
SYS_DEBUG_PATH = File.join(ComputeUnit::SYSFS_PATH, 'kernel', 'debug', 'dri')
|
13
|
+
|
14
|
+
def initialize(device_path, opts = {})
|
15
|
+
super(device_path, opts)
|
16
|
+
@pci_loc = File.basename(device_path)
|
17
|
+
@model = opts[:model] if opts[:use_opencl]
|
18
|
+
|
19
|
+
@uuid = "GPU#{index}"
|
20
|
+
end
|
21
|
+
|
22
|
+
def meta
|
23
|
+
{}
|
24
|
+
end
|
25
|
+
|
26
|
+
# @return [String, nil] - the bios according to the vbios rom, nil when the rom
#   holds neither the expected entry nor an alternate one
# sometimes the kernel / driver extracted rom can be incorrect
# this is the bios gathered from the vbios itself.
def rom_bios
  # read the metadata once, every rom_metadata call may re-scan the rom contents
  metadata = rom_metadata
  candidate = metadata[2]
  # a valid bios name contains three digits followed by a dash
  return candidate if /\d{3}-/.match?(candidate)

  alternate = metadata[3]
  logger.warn("Invalid rom bios name for GPU#{index} using alternate name for #{alternate}")
  alternate
end
|
37
|
+
|
38
|
+
# @return [String] - the contents of the rom file, empty string if no rom file exists
|
39
|
+
def read_rom_data
|
40
|
+
if File.exist?(debug_rom_path)
|
41
|
+
IO.read(debug_rom_path, mode: 'rb')
|
42
|
+
elsif File.exist?(rom_path)
|
43
|
+
rom_data
|
44
|
+
else
|
45
|
+
''
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# @return [String] - the path to the readonly rom file
#   located under SYS_DEBUG_PATH/<index>/amdgpu_vbios
def debug_rom_path
  # memoize under its own name: @rom_path is the natural ivar of the separate
  # rom_path reader used by read_rom_data and must not be set to the debug path
  # NOTE(review): rom_path is defined outside this file section - confirm it memoizes in @rom_path
  @debug_rom_path ||= File.join(SYS_DEBUG_PATH, index.to_s, 'amdgpu_vbios')
end
|
53
|
+
|
54
|
+
# Extracts the first readable strings (10 characters or longer) from the rom contents.
# The result is memoized so the rom file is only read and scanned once per instance.
#
# @return [Array] - an array of at most 10 readable strings from the rom file
def rom_metadata
  @rom_metadata ||= begin
    # NOTE(review): `)-_` inside the character class is parsed as a range - confirm that is intended
    printable_chars = %r{[A-Za-z0-9`~!@#%^&*()-_=+|'";:/?.>,< \t\$\{\}\[\]\\]{10,}}
    read_rom_data.scan(printable_chars)[0..9]
  end
end
|
61
|
+
|
62
|
+
# @return [String] - returns the name of compute board
|
63
|
+
# for vegas we have to also get the compute units
|
64
|
+
def board_name
|
65
|
+
@board_name ||= begin
|
66
|
+
return nil unless opencl_board_name
|
67
|
+
|
68
|
+
name = opencl_board_name.sub(/Series|\(TM\)/, '').sub('Graphics', '').sub(/\s{2}/, ' ').strip
|
69
|
+
/vega/i.match?(name) ? "#{name} #{opencl_units}" : name
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# @return [Array] - returns a list of device paths of all devices specific to the vendor id
|
74
|
+
def self.devices
|
75
|
+
ComputeUnit::Gpu.devices.find_all { |f| device_vendor(f) == VENDOR_ID }
|
76
|
+
end
|
77
|
+
|
78
|
+
def name
|
79
|
+
model
|
80
|
+
end
|
81
|
+
|
82
|
+
# @return [String] - the name of the device model (specific name)
|
83
|
+
def model
|
84
|
+
@model ||= begin
|
85
|
+
board_name || sysfs_model_name
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def load
|
90
|
+
utilization
|
91
|
+
end
|
92
|
+
|
93
|
+
# @return [Integer] - returns temp of gpu in celsius
|
94
|
+
def temp
|
95
|
+
read_hwmon_data('temp1_input', 0).to_i / 1000
|
96
|
+
end
|
97
|
+
|
98
|
+
# @return [Integer] - returns fan rpm speed, 0 if cannot be found
|
99
|
+
def fan
|
100
|
+
read_hwmon_data('fan1_input', 0).to_i
|
101
|
+
end
|
102
|
+
|
103
|
+
# Prefers the running core voltage reported by dpm_core_vddc and falls back
# to the vddgfx reading when that value is zero.
#
# @return [Numeric] - returns voltage of core in mV
def core_voltage
  # read once, calling dpm_core_vddc twice would hit the kernel setting twice
  running = dpm_core_vddc
  running.zero? ? vddgfx.to_i : running
end
|
107
|
+
|
108
|
+
def configured_core_voltage
|
109
|
+
vddc
|
110
|
+
end
|
111
|
+
|
112
|
+
# Reads pp_dpm_mclk and returns the clock of the level marked active with '*'.
#
# @return [Integer, nil] - the memory speed, nil when no level is marked active
def memory_clock
  levels = read_kernel_setting('pp_dpm_mclk', '').split("\n")
  active = levels.find { |level| level.include?('*') }
  return if active.nil?

  # Prefer the number in front of the Mhz unit: the first run of 2-6 digits would
  # be the level index itself for two digit levels such as "10: 1750Mhz *".
  # Fall back to the first digit run when the line carries no unit.
  (active[/(\d+)\s*mhz/i, 1] || active[/\d{2,6}/]).to_i
end
|
118
|
+
|
119
|
+
# Reads pp_dpm_sclk and returns the clock of the level marked active with '*'.
#
# @return [Integer, nil] - the core clock speed, nil when no level is marked active
def core_clock
  levels = read_kernel_setting('pp_dpm_sclk', '').split("\n")
  active = levels.find { |level| level.include?('*') }
  return if active.nil?

  # Prefer the number in front of the Mhz unit: the first run of 2-6 digits would
  # be the level index itself for two digit levels such as "10: 1750Mhz *".
  # Fall back to the first digit run when the line carries no unit.
  (active[/(\d+)\s*mhz/i, 1] || active[/\d{2,6}/]).to_i
end
|
125
|
+
|
126
|
+
# @return [Integer] - the core voltage reading of the GPU via HWMON
|
127
|
+
def vddgfx
|
128
|
+
read_hwmon_data('in0_input', 0).to_i
|
129
|
+
end
|
130
|
+
|
131
|
+
# currently configured gpu core voltage
|
132
|
+
# @return [Numeric] - returns voltage of core in mV
|
133
|
+
def vddc
|
134
|
+
read_kernel_setting('pp_voltage', 0).to_i
|
135
|
+
end
|
136
|
+
|
137
|
+
# currently running gpu core voltage
|
138
|
+
def dpm_core_vddc
|
139
|
+
read_kernel_setting('pp_core_vddc', 0).to_i
|
140
|
+
end
|
141
|
+
|
142
|
+
def subtype
|
143
|
+
SUBTYPE
|
144
|
+
end
|
145
|
+
|
146
|
+
def clock_limits
|
147
|
+
read_kernel_setting('pp_od_clk_limits', '')
|
148
|
+
end
|
149
|
+
|
150
|
+
def gpu_defaults
|
151
|
+
read_kernel_setting('gpu_defaults', '')
|
152
|
+
end
|
153
|
+
|
154
|
+
# Parses the section that follows the first OD_SCLK/OD_MCLK header of pp_od_clk_voltage.
#
# @return [Array] - array of hashes of voltages {:pstate=>0, :clk=>300, :volt=>750, :type=>:sclk},
#   empty when the setting is unreadable or has no such section
def voltage_table
  data = read_kernel_setting('pp_od_clk_voltage', nil)
  return [] if data.nil?

  _, sclk, = data.split(/OD_[S,M]CLK:\s?\n/)
  # the file was readable but did not contain a clock section (ie. empty content)
  return [] if sclk.nil?

  sclk.split("\n").map do |line|
    # "0:   300Mhz   750mV" => [0, 300, 750]
    pstate, clk, volt, = line.gsub(/:|Mhz|mV/, '').split(/\s{2,}/).map(&:to_i)
    { pstate: pstate, clk: clk, volt: volt, type: :sclk }
  end
end
|
165
|
+
|
166
|
+
# Parses the text that follows the second OD_SCLK/OD_MCLK header of pp_od_clk_voltage.
#
# @return [Array] - array of hashes of voltages {:pstate=>0, :clk=>300, :volt=>750, :type=>:mclk},
#   empty when the setting is unreadable or has no second section (same contract as voltage_table)
def vddci_voltage_table
  # not sure if this is what mclk is but left it here anyways
  data = read_kernel_setting('pp_od_clk_voltage', nil)
  return [] if data.nil?

  _, _, mclk = data.split(/OD_[S,M]CLK:\s?\n/)
  # the file was readable but did not contain a second clock section
  return [] if mclk.nil?

  # NOTE(review): only the OD_SCLK/OD_MCLK headers split the file, so any lines that
  # follow the mclk rows (ie. an OD_RANGE section) are parsed as rows too - confirm intended
  mclk.split("\n").map do |line|
    pstate, clk, volt, = line.gsub(/:|Mhz|mV/, '').split(/\s{2,}/).map(&:to_i)
    { pstate: pstate, clk: clk, volt: volt, type: :mclk }
  end
end
|
178
|
+
|
179
|
+
# @return [Integer] the average power being used by the gpu (the hwmon reading divided by 1000000)
|
180
|
+
def power_average
|
181
|
+
# TODO: if a gpu crashes the average power can sometimes take 3000 ms to read!
|
182
|
+
read_hwmon_data('power1_average', 0).to_i / 1000000
|
183
|
+
end
|
184
|
+
|
185
|
+
# @return [Float] the power being used by the gpu
|
186
|
+
def power
|
187
|
+
pp_value = read_kernel_setting('pp_power_usage', 0).to_i
|
188
|
+
value = pp_value > 0 ? pp_value : power_average
|
189
|
+
value + power_offset
|
190
|
+
end
|
191
|
+
|
192
|
+
# @return [String] - the name of the bios which is unique for every card
|
193
|
+
def bios
|
194
|
+
@bios ||= begin
|
195
|
+
a = read_kernel_setting('vbios_version', 'unreadable').upcase
|
196
|
+
b = rom_bios
|
197
|
+
/\d{3}-/.match?(b) ? b : a
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
def pstate
|
202
|
+
-1
|
203
|
+
end
|
204
|
+
|
205
|
+
# @return [String] - the serial number of the card
|
206
|
+
def serial
|
207
|
+
'unknown'
|
208
|
+
end
|
209
|
+
|
210
|
+
# @param value [Numeric] - the power limit that should be applied to the gpu
|
211
|
+
# @return [Numeric] - original passed in value after being set
|
212
|
+
def power_limit=(value)
|
213
|
+
max = power_max_limit
|
214
|
+
raise ArgumentError.new("Power Value #{value} cannot exceed #{max}") if value > max
|
215
|
+
raise ArgumentError.new("Value must be between 10 and #{max}") if value < 10
|
216
|
+
|
217
|
+
# hwmon expects the value to have 6 zeros
|
218
|
+
write_hwmon_data('power1_cap', value * 1000000)
|
219
|
+
# logger.info("GPU#{index} power set to #{value} Watts")
|
220
|
+
end
|
221
|
+
|
222
|
+
# Sets the current, min or max fan limit through hwmon.
#
# @param value [Numeric] - the fan limit that should be applied to the gpu as a percentage (20-100)
# @param type [String] - which limit to set, one of 'current', 'min' or 'max'
# @return [Numeric] - original passed in value after being set
# @raise [ArgumentError] - when type is unknown or value is outside of 20-100
def set_fan_limit(value, type = 'current')
  hwmon_file = case type
               when 'min' then 'pwm1_min'
               when 'max' then 'pwm1_max'
               when 'current' then 'pwm1'
               else raise ArgumentError, "Invalid fan setting type, must be one of 'current, min or max'"
               end
  raise ArgumentError, 'Fan limit cannot exceed 100' if value > 100
  raise ArgumentError, 'Fan limit value must be between 20 and 100' if value < 20

  # only touch the hardware once every argument has been validated, otherwise
  # a rejected request would still have changed the fan enable setting
  write_hwmon_data('fan1_enable', '1')

  # Value must be between 0-255
  amount = (255 * (value / 100.0)).round
  logger.debug("Setting #{type} Fan on GPU#{index} to #{amount}")
  write_hwmon_data(hwmon_file, amount)
  logger.info("GPU#{index} #{type} fan set to #{value} percent")
  value
end
|
245
|
+
|
246
|
+
# @return [Numeric] - current fan limit as a percentage
|
247
|
+
# @note the OS values is between 0 - 255
|
248
|
+
def fan_limit
|
249
|
+
cur = read_hwmon_data('pwm1', 0).to_i
|
250
|
+
return cur unless cur > 0
|
251
|
+
|
252
|
+
((cur / 255.0) * 100).round(0)
|
253
|
+
end
|
254
|
+
|
255
|
+
# @return [Numeric] - max fan limit as a percentage
|
256
|
+
# @note the OS values is between 0 - 255
|
257
|
+
def fan_max_limit
|
258
|
+
cur = read_hwmon_data('pwm1_max', 0).to_i
|
259
|
+
return cur unless cur > 0
|
260
|
+
|
261
|
+
((cur / 255.0) * 100).round(0)
|
262
|
+
end
|
263
|
+
|
264
|
+
# @return [Numeric] - min fan limit as a percentage
|
265
|
+
# @note the OS values is between 0 - 255
|
266
|
+
def fan_min_limit
|
267
|
+
cur = read_hwmon_data('pwm1_min', 0).to_i
|
268
|
+
return cur unless cur > 0
|
269
|
+
|
270
|
+
((cur / 255.0) * 100).round(0)
|
271
|
+
end
|
272
|
+
|
273
|
+
# @return [Numeric] - current power limit
|
274
|
+
def power_limit
|
275
|
+
read_hwmon_data('power1_cap', 0).to_i / 1000000
|
276
|
+
end
|
277
|
+
|
278
|
+
# @return [Numeric] - the maximum power that can be set
|
279
|
+
def power_max_limit
|
280
|
+
read_hwmon_data('power1_cap_max').to_i / 1000000
|
281
|
+
end
|
282
|
+
|
283
|
+
def memory_total
|
284
|
+
0
|
285
|
+
end
|
286
|
+
|
287
|
+
def memory_used
|
288
|
+
0
|
289
|
+
end
|
290
|
+
|
291
|
+
def memory_free
|
292
|
+
0
|
293
|
+
end
|
294
|
+
|
295
|
+
# @return [Integer] - the :load value found in amdgpu_pm_info, 0 when it is not present
def utilization
  # to_i turns a missing (nil) reading into 0, no further fallback is required
  amdgpu_pm_info.dig(:load, :value).to_i
end
|
300
|
+
|
301
|
+
def self.create_from_path(device_path, index, use_opencl = false)
|
302
|
+
opts = {
|
303
|
+
device_class_id: device_class(device_path),
|
304
|
+
device_id: device(device_path),
|
305
|
+
device_vendor_id: device_vendor(device_path),
|
306
|
+
subsystem_vendor_id: subsystem_vendor(device_path),
|
307
|
+
subsystem_device_id: subsystem_device(device_path),
|
308
|
+
use_opencl: use_opencl,
|
309
|
+
index: index
|
310
|
+
}
|
311
|
+
new(device_path, opts)
|
312
|
+
end
|
313
|
+
|
314
|
+
# @return [Array] - returns an array of gpu instances of AMD type only
|
315
|
+
def self.find_all(use_opencl = false)
|
316
|
+
devices.map.with_index do |device_path, _index|
|
317
|
+
found_index = ComputeUnit::Gpu.found_devices.index(device_path)
|
318
|
+
create_from_path(device_path, found_index, use_opencl)
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
322
|
+
def read_dri_debug_file(file_name, default = '')
|
323
|
+
File.read(File.join(debug_dri_dir, file_name))
|
324
|
+
rescue Errno::EINVAL
|
325
|
+
default
|
326
|
+
rescue Errno::ENOENT
|
327
|
+
default
|
328
|
+
rescue Errno::EACCES
|
329
|
+
logger.debug('run this command as root or with sudo, using default values')
|
330
|
+
default
|
331
|
+
end
|
332
|
+
|
333
|
+
# @return [String] - returns the path the debug dri directory
|
334
|
+
# ie. "/sys/kernel/debug/dri/0"
|
335
|
+
def debug_dri_dir
|
336
|
+
@debug_dri_dir ||= begin
|
337
|
+
# if the user does not have permission the path will be nil
|
338
|
+
path = Dir.glob(File.join(SYS_DEBUG_PATH, '*', 'name')).find { |file| File.read(file).include?(pci_loc) }
|
339
|
+
raise Errno::EACCES.new("Permission denied #{SYS_DEBUG_PATH}") unless path
|
340
|
+
|
341
|
+
File.dirname(path)
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# @return [Hash] - pm info keyed by the name of the reading
|
346
|
+
# {:mclk=>{:value=>"1950", :unit=>"MHz"},
|
347
|
+
# :sclk=>{:value=>"1125", :unit=>"MHz"},
|
348
|
+
# :vddgfx=>{:value=>"950", :unit=>"mV"},
|
349
|
+
# :vddc=>{:value=>"61.49", :unit=>"W"},
|
350
|
+
# :vddci=>{:value=>"1.0", :unit=>"W"},
|
351
|
+
# :max_gpu=>{:value=>"81.243", :unit=>"W"},
|
352
|
+
# :average_gpu=>{:value=>"82.117", :unit=>"W"},
|
353
|
+
# :temperature=>{:value=>"41", :unit=>"C"},
|
354
|
+
# :load=>{:value=>"100", :unit=>"%"}}
|
355
|
+
def amdgpu_pm_info
|
356
|
+
@amdgpu_pm_info ||= begin
|
357
|
+
content = read_dri_debug_file('amdgpu_pm_info')
|
358
|
+
data = content.scan(/(\d+\.?\d*)\s+(\w*)\s\(([\w\s]*)\)?/) + content.scan(/(\w*):\s(\d+)\s(.*)/).map(&:rotate)
|
359
|
+
data_hash = {}
|
360
|
+
data.each do |value, unit, name|
|
361
|
+
data_hash[name.gsub(/\s/, '_').downcase.to_sym] = { value: value, unit: unit }
|
362
|
+
end
|
363
|
+
data_hash
|
364
|
+
end
|
365
|
+
end
|
366
|
+
|
367
|
+
# @return [String, nil] - the current power_dpm_force_performance_level setting
|
368
|
+
def dpm_force_performance
|
369
|
+
read_kernel_setting('power_dpm_force_performance_level', nil)
|
370
|
+
end
|
371
|
+
|
372
|
+
# Writes the dpm performance level to power_dpm_force_performance_level.
#
# @param setting [String] - the dpm performance setting to adjust the dpm (manual or auto)
# @return [Object] - whatever write_kernel_setting returns
# @raise [ArgumentError] - when setting is neither manual nor auto
def dpm_force_performance_setting(setting = 'manual')
  level = setting.to_s.strip
  # compare the whole value: an unanchored pattern also accepted strings that merely
  # contain one of the words (ie. "automatic") and wrote them to the kernel
  raise ArgumentError, 'setting must be one of manual or auto' unless %w[manual auto].include?(level)

  write_kernel_setting('power_dpm_force_performance_level', "#{level}\n")
end
|
379
|
+
|
380
|
+
def reset_to_defaults
|
381
|
+
dpm_force_performance_setting('auto')
|
382
|
+
write_kernel_setting('pp_od_clk_voltage', 'r')
|
383
|
+
write_kernel_setting('pp_od_clk_voltage', 'c')
|
384
|
+
write_hwmon_data('pwm1_enable', '2')
|
385
|
+
end
|
386
|
+
|
387
|
+
# @return [Array]
|
388
|
+
# reading from file "Sclk Limit: 2000 Mhz", "Mclk Limit: 2250 Mhz"
|
389
|
+
# @example [2000, 2250]
|
390
|
+
def clock_max_defaults
|
391
|
+
read_kernel_setting('pp_od_clk_limits', '0 0').scan(/\d+/).map(&:to_i)
|
392
|
+
end
|
393
|
+
|
394
|
+
# @return [Integer]
|
395
|
+
def max_core_clock
|
396
|
+
clock_max_defaults.first
|
397
|
+
end
|
398
|
+
|
399
|
+
# @return [Integer]
|
400
|
+
def min_core_clock
|
401
|
+
voltage_table[0][:clk]
|
402
|
+
end
|
403
|
+
|
404
|
+
# @return [Integer]
|
405
|
+
def max_mem_clock
|
406
|
+
clock_max_defaults.last # or vddci_voltage_table.last[:clk]
|
407
|
+
end
|
408
|
+
|
409
|
+
# @return [Integer]
|
410
|
+
def min_mem_clock
|
411
|
+
vddci_voltage_table.first[:clk]
|
412
|
+
end
|
413
|
+
|
414
|
+
# @return [Integer]
|
415
|
+
def max_mem_volt
|
416
|
+
vddci_voltage_table.last[:volt]
|
417
|
+
end
|
418
|
+
|
419
|
+
# @return [Integer]
|
420
|
+
def min_mem_volt
|
421
|
+
vddci_voltage_table.first[:volt]
|
422
|
+
end
|
423
|
+
|
424
|
+
# @return [Integer] - the temperature of the asic chip
|
425
|
+
def asic_temp
|
426
|
+
read_hwmon_data('temp2_input', 0).to_i / 1000
|
427
|
+
end
|
428
|
+
|
429
|
+
# @return [Integer] - the temperature of the memory chips
|
430
|
+
def mem_temp
|
431
|
+
read_hwmon_data('temp3_input', 0).to_i / 1000
|
432
|
+
end
|
433
|
+
|
434
|
+
# Applies a memory clock / voltage pair to row 3 of the pp_od_clk_voltage table.
# Does nothing unless experimental_on? is true.
#
# @param mem_clock [Integer, String] - the memory clock, must be within min_mem_clock..max_mem_clock
# @param mem_volt [Integer, String] - the memory voltage, must be within min_mem_volt..max_mem_volt
# @raise [ArgumentError] - when either value is outside of its allowed range
def set_mem_clock_and_vddc(mem_clock, mem_volt)
  return unless experimental_on?

  mem_clock = mem_clock.to_i
  mem_volt = mem_volt.to_i

  # validate before touching the hardware, otherwise a rejected request
  # would still have forced the dpm performance level to manual
  clock_min = min_mem_clock
  clock_max = max_mem_clock
  unless mem_clock.between?(clock_min, clock_max)
    raise ArgumentError, "MemClock value #{mem_clock} must be between #{clock_min}-#{clock_max}"
  end

  volt_min = min_mem_volt
  volt_max = max_mem_volt
  unless mem_volt.between?(volt_min, volt_max)
    raise ArgumentError, "MemVolt value #{mem_volt} must be between #{volt_min}-#{volt_max}"
  end

  dpm_force_performance_setting('manual')
  write_kernel_setting('pp_od_clk_voltage', "r\n") # unlocks in order to write
  # set row in table (m = manual), 3 = row,
  write_kernel_setting('pp_od_clk_voltage', "m 3 #{mem_clock} #{mem_volt}\n")
  write_kernel_setting('pp_od_clk_voltage', "c\n") # locks file
  write_kernel_setting('pp_mclk_od', "3\n")
  logger.info("Successfully applied overclock #{mem_clock} #{mem_volt} to #{name} at #{pci_loc}")
end
|
451
|
+
end
|
452
|
+
end
|
453
|
+
|
454
|
+
# See https://www.kernel.org/doc/html/latest/gpu/amdgpu.html for sysfs kernel
|
455
|
+
# /sys/kernel/debug/dri/2/name
|
456
|
+
# amdgpu dev=0000:08:00.0 unique=0000:08:00.0
|
457
|
+
|
458
|
+
# Add percent overclock to core speed
|
459
|
+
# sudo echo "7" > /sys/class/drm/card0/device/pp_sclk_od
|
460
|
+
|
461
|
+
# Add percent overclock to mem speed
|
462
|
+
# sudo echo "4" > /sys/class/drm/card0/device/pp_mclk_od
|
463
|
+
|
464
|
+
# The way the current AMDGPU overclocking works for the core frequency
|
465
|
+
# is by writing an integer value between 0 and 20 to
|
466
|
+
# /sys/class/drm/card0/device/pp_sclk_od. That value represents an
|
467
|
+
# overclock of 0~20% above the GPU's core frequency. Similarly,
|
468
|
+
# writing a value to /sys/class/drm/card0/device/pp_mclk_od represents
|
469
|
+
# a percentage-based overclock to the memory frequency.
|
470
|
+
#
|
471
|
+
#
|
472
|
+
# You can change the frequencies and voltage by modifying
|
473
|
+
# the file /sys/class/drm/card0/device/pp_od_clk_voltage
|
474
|
+
#
|
475
|
+
# first: This holds the presets for pp_dpm_sclk and pp_dpm_mclk.
|
476
|
+
#
|
477
|
+
# Second check the current settings:
|
478
|
+
#
|
479
|
+
# sudo cat /sys/class/drm/card0/device/pp_od_clk_voltage
|
480
|
+
# You should see something similar to this:
|
481
|
+
#
|
482
|
+
# OD_SCLK:
|
483
|
+
# 0: 300MHz 750mV
|
484
|
+
# 1: 588MHz 765mV
|
485
|
+
# 2: 980MHz 987mV
|
486
|
+
# 3: 1100MHz 950mV
|
487
|
+
# 4: 1100MHz 950mV
|
488
|
+
# 5: 1100MHz 950mV
|
489
|
+
# 6: 1100MHz 950mV
|
490
|
+
# 7: 1100MHz 950mV
|
491
|
+
# OD_MCLK:
|
492
|
+
# 0: 300MHz 750mV
|
493
|
+
# 1: 1000MHz 800mV
|
494
|
+
# 2: 1970MHz 950mV
|
495
|
+
# OD_RANGE:
|
496
|
+
# SCLK: 300MHz 2000MHz
|
497
|
+
# MCLK: 300MHz 2250MHz
|
498
|
+
# VDDC: 750mV 1150mV
|
499
|
+
# Example to set 1280Mhz at 950mV (check your output above for possible ranges!):
|
500
|
+
#
|
501
|
+
# sudo echo "s 7 1280 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
502
|
+
# Then to apply the changes:
|
503
|
+
#
|
504
|
+
# sudo echo 0 > /sys/class/drm/card0/device/pp_sclk_od
|
505
|
+
# sudo echo 1 > /sys/class/drm/card0/device/pp_sclk_od
|
506
|
+
# These are all the settings I use on my crypto mining card which is an AMD Radeon RX570 in case it's useful to anyone.
|
507
|
+
#
|
508
|
+
# echo 1 > /sys/class/drm/card0/device/hwmon/hwmon0/pwm1_enable
|
509
|
+
# echo manual > /sys/class/drm/card0/device/power_dpm_force_performance_level
|
510
|
+
# echo 200 > /sys/class/drm/card0/device/hwmon/hwmon0/pwm1 # Fan speed
|
511
|
+
# echo 4 > /sys/class/drm/card0/device/pp_power_profile_mode # Compute Mode
|
512
|
+
#
|
513
|
+
# echo "s 3 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
514
|
+
# echo "s 4 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
515
|
+
# echo "s 5 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
516
|
+
# echo "s 6 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
517
|
+
# echo "s 7 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
518
|
+
#
|
519
|
+
# echo "m 2 1985 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
520
|
+
#
|
521
|
+
# echo 0 > /sys/class/drm/card0/device/pp_sclk_od
|
522
|
+
# echo 1 > /sys/class/drm/card0/device/pp_sclk_od
|
523
|
+
#
|
524
|
+
# echo 0 > /sys/class/drm/card0/device/pp_mclk_od
|
525
|
+
# echo 1 > /sys/class/drm/card0/device/pp_mclk_od
|