compute_unit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.gitlab-ci.yml +45 -0
- data/.rspec +3 -0
- data/.rubocop.yml +12 -0
- data/.rubocop_todo.yml +139 -0
- data/.ruby_version +1 -0
- data/CHANGELOG.md +3 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +18 -0
- data/LICENSE.txt +21 -0
- data/README.md +74 -0
- data/Rakefile +8 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/compute_unit.gemspec +43 -0
- data/exe/list_computes +13 -0
- data/exe/update_pcidb +11 -0
- data/lib/compute_unit.rb +43 -0
- data/lib/compute_unit/asic.rb +14 -0
- data/lib/compute_unit/cache_store.rb +143 -0
- data/lib/compute_unit/compute_base.rb +65 -0
- data/lib/compute_unit/cpu.rb +36 -0
- data/lib/compute_unit/device.rb +397 -0
- data/lib/compute_unit/exceptions.rb +14 -0
- data/lib/compute_unit/formatters.rb +21 -0
- data/lib/compute_unit/gpu.rb +338 -0
- data/lib/compute_unit/gpus/amd_gpu.rb +525 -0
- data/lib/compute_unit/gpus/nvidia_gpu.rb +223 -0
- data/lib/compute_unit/logger.rb +70 -0
- data/lib/compute_unit/monkey_patches.rb +101 -0
- data/lib/compute_unit/utils.rb +26 -0
- data/lib/compute_unit/version.rb +5 -0
- metadata +142 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ComputeUnit
  # Namespace for the gem's error classes.
  # Every class below derives from RuntimeError and adds no behavior of its own.
  module Exceptions
    class NoPermission < RuntimeError
    end

    class PermissionDenied < RuntimeError
    end

    class UnsupportedGPU < RuntimeError
    end

    class NotSupported < RuntimeError
    end

    class UnsupportedOSversion < RuntimeError
    end

    class NoWorkerName < RuntimeError
    end

    class NoComputeUnits < RuntimeError
    end

    class InvalidPCIDatabase < RuntimeError
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ComputeUnit
  # Helpers that scale amounts by one million and optionally label them as µBTC.
  module Formatters
    # Converts the cost/earning entries of a hash to micro units.
    #
    # Only the keys :hourly_cost, :hourly_earnings and :kwh_cost are converted; every
    # other entry is returned untouched. Entries whose value is not numeric (for
    # example nil when a figure is unknown) are left as they are instead of raising.
    #
    # @param item [Hash] the data to format
    # @param add_unit [Boolean] when true the converted values become strings with a µBTC suffix
    # @return [Hash] a new hash, the passed in hash is not modified
    def micro_formatter(item, add_unit = false)
      converted = item.each_with_object({}) do |(key, value), acc|
        next unless %i[hourly_cost hourly_earnings kwh_cost].include?(key)
        next unless value.is_a?(Numeric)

        micro = (value * 1_000_000).round(4)
        acc[key] = add_unit ? "#{micro} \u00B5BTC" : micro
      end
      item.merge(converted)
    end

    # Converts a single amount to micro units, rounded to one decimal place.
    #
    # @param value [Numeric] the amount to convert
    # @param add_unit [Boolean] when true a string with a µBTC suffix is returned
    # @return [Numeric, String]
    def value_micro_formatter(value, add_unit = false)
      micro = (value * 1_000_000).round(1)
      add_unit ? "#{micro} \u00B5BTC" : micro
    end
  end
end
|
@@ -0,0 +1,338 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'compute_unit/compute_base'
|
4
|
+
require 'compute_unit/cache_store'
|
5
|
+
module ComputeUnit
|
6
|
+
class Gpu < ComputeBase
|
7
|
+
attr_reader :pci_loc, :bios, :name
|
8
|
+
DEVICE_CLASS = '030000'
|
9
|
+
DEVICE_CLASS_NAME = 'GPU'
|
10
|
+
attr_accessor :power_limit, :use_opencl
|
11
|
+
|
12
|
+
def compute_type
|
13
|
+
type
|
14
|
+
end
|
15
|
+
|
16
|
+
# @return [OpenCL_Device]
|
17
|
+
def opencl_device
|
18
|
+
@opencl_device ||= self.class.opencl_devices.find_all { |cu| cu[:type] == make }[index] if use_opencl
|
19
|
+
end
|
20
|
+
|
21
|
+
# @return [String] - returns the raw data of the board name from opencl, return nil if no device
|
22
|
+
def opencl_board_name
|
23
|
+
@opencl_board_name ||= opencl_device&.board_name if use_opencl
|
24
|
+
end
|
25
|
+
|
26
|
+
# @return [Integer, nil] - the number of compute units detected by opencl, nil when opencl
#   is not in use or no opencl device was found for this gpu
# not to be confused with stream processors. Can be helpful when determining which product vega56 or vega64
def opencl_units
  return unless use_opencl

  # opencl_device is nil when nothing matches this gpu, so guard before reading from it
  device = opencl_device
  @opencl_units ||= device.max_compute_units.to_i if device
end
|
31
|
+
|
32
|
+
# @return [String, nil] - the device name reported by opencl, nil when opencl is not in use
#   or no opencl device was found for this gpu
# ie. GeForce GTX 1070 or RX 580
# @note not really needed for Nvidia types since nvidia-smi returns really complete information
def opencl_name
  return unless use_opencl

  # opencl_device is nil when nothing matches this gpu, so guard before reading from it
  device = opencl_device
  @opencl_name ||= device.name if device
end
|
38
|
+
|
39
|
+
# @return [Array] - returns a list of device paths of all devices considered for display
|
40
|
+
# @note the devices are sorted by the device path
|
41
|
+
# @note this can mean AMD, NVIDIA, Intel or other crappy embedded devices
|
42
|
+
def self.devices
|
43
|
+
@devices ||= ComputeUnit::ComputeBase.devices.find_all do |device|
|
44
|
+
ComputeUnit::Device.device_class(device) == DEVICE_CLASS
|
45
|
+
end.sort
|
46
|
+
end
|
47
|
+
|
48
|
+
# @param device_path [String] - the pci bus path to the device
|
49
|
+
# @param opts [Hash]
|
50
|
+
# @option bios [String] the bios id
|
51
|
+
# @option model [String] the model name
|
52
|
+
# @option serial [String] the serial id of the device
|
53
|
+
# @option busid [String] the pci bus path of the device
|
54
|
+
# @option meta [Hash] metadata about the device
|
55
|
+
# @option index [Integer] the index of the device found in the device tree
|
56
|
+
# @option uuid [String] the uuid of the device
|
57
|
+
# @option use_opencl [Boolean] set to true if you want to get info about the device from opencl, defaults to false
|
58
|
+
def initialize(device_path, opts = {})
|
59
|
+
super(device_path, opts)
|
60
|
+
@type = :GPU
|
61
|
+
@bios = opts[:bios].upcase if opts[:bios]
|
62
|
+
@model = opts[:model]
|
63
|
+
@serial = opts[:serial]
|
64
|
+
@pci_loc = opts[:busid]
|
65
|
+
@meta = opts[:meta]
|
66
|
+
@index = opts[:index].to_i
|
67
|
+
@uuid = opts[:uuid] || opts[:serial]
|
68
|
+
@name = model
|
69
|
+
@power_offset = 0
|
70
|
+
@use_opencl = opts[:use_opencl] || false
|
71
|
+
end
|
72
|
+
|
73
|
+
def fan
|
74
|
+
raise NotImplementedError
|
75
|
+
end
|
76
|
+
|
77
|
+
# @return [Integer] - 0 when utilization is above 20 and power is at least 50,
#   2 when power is below 20, 1 in every other case
def status
  if utilization > 20 && power >= 50
    0
  elsif power < 20
    2
  else
    1
  end
end
|
83
|
+
|
84
|
+
def power
|
85
|
+
raise NotImplementedError
|
86
|
+
end
|
87
|
+
|
88
|
+
def pstate
|
89
|
+
raise NotImplementedError
|
90
|
+
end
|
91
|
+
|
92
|
+
# @return [Integer] - a percentage value of the current fan limit
|
93
|
+
def fan_limit
|
94
|
+
fan
|
95
|
+
end
|
96
|
+
|
97
|
+
# @return [Integer] - a percentage value of the min fan limit
|
98
|
+
def fan_min_limit
|
99
|
+
nil
|
100
|
+
end
|
101
|
+
|
102
|
+
# @return [Integer] - a percentage value of the max fan limit
|
103
|
+
def fan_max_limit
|
104
|
+
nil
|
105
|
+
end
|
106
|
+
|
107
|
+
def power_limit
|
108
|
+
raise NotImplementedError
|
109
|
+
end
|
110
|
+
|
111
|
+
def power_max_limit
|
112
|
+
raise NotImplementedError
|
113
|
+
end
|
114
|
+
|
115
|
+
def memory_total
|
116
|
+
raise NotImplementedError
|
117
|
+
end
|
118
|
+
|
119
|
+
def memory_used
|
120
|
+
raise NotImplementedError
|
121
|
+
end
|
122
|
+
|
123
|
+
def memory_free
|
124
|
+
raise NotImplementedError
|
125
|
+
end
|
126
|
+
|
127
|
+
def utilization
|
128
|
+
raise NotImplementedError
|
129
|
+
end
|
130
|
+
|
131
|
+
# @return [Integer] - the memory speed
|
132
|
+
def memory_clock
|
133
|
+
0
|
134
|
+
end
|
135
|
+
|
136
|
+
# @return [Integer] - the memory voltage
|
137
|
+
def memory_volt
|
138
|
+
0
|
139
|
+
end
|
140
|
+
|
141
|
+
# @return [Integer] - the core clock speed
|
142
|
+
def core_clock
|
143
|
+
0
|
144
|
+
end
|
145
|
+
|
146
|
+
# @return [Numeric] - returns voltage of core in mV
|
147
|
+
def core_voltage
|
148
|
+
0
|
149
|
+
end
|
150
|
+
|
151
|
+
# @return [Numeric] - returns the configured voltage of core in mV
|
152
|
+
def configured_core_voltage
|
153
|
+
0
|
154
|
+
end
|
155
|
+
|
156
|
+
def mem_info
|
157
|
+
{
|
158
|
+
index: "#{device_class_name}#{index}",
|
159
|
+
name: name,
|
160
|
+
volt: memory_volt,
|
161
|
+
clock: memory_clock,
|
162
|
+
memory_name: nil,
|
163
|
+
memory_type: nil,
|
164
|
+
memory_used: memory_used,
|
165
|
+
memory_free: memory_free,
|
166
|
+
memory_total: memory_total,
|
167
|
+
mem_temp: mem_temp
|
168
|
+
}
|
169
|
+
end
|
170
|
+
|
171
|
+
# @return [Hash] - hash of hardware status about the gpu
|
172
|
+
def status_info
|
173
|
+
{
|
174
|
+
index: "#{device_class_name}#{index}",
|
175
|
+
name: name,
|
176
|
+
bios: bios,
|
177
|
+
core_clock: core_clock,
|
178
|
+
memory_clock: memory_clock,
|
179
|
+
power: power,
|
180
|
+
fan: fan,
|
181
|
+
core_volt: core_voltage,
|
182
|
+
temp: temp,
|
183
|
+
mem_temp: mem_temp,
|
184
|
+
status: status
|
185
|
+
}
|
186
|
+
end
|
187
|
+
|
188
|
+
# @return [Hash] - hash of information about the gpu data
|
189
|
+
def hardware_info
|
190
|
+
{
|
191
|
+
uuid: uuid,
|
192
|
+
gpuId: "GPU#{index}",
|
193
|
+
syspath: device_path,
|
194
|
+
pciLoc: pci_loc,
|
195
|
+
name: name,
|
196
|
+
bios: bios,
|
197
|
+
subType: subtype,
|
198
|
+
make: make,
|
199
|
+
model: model,
|
200
|
+
vendor: vendor
|
201
|
+
}
|
202
|
+
end
|
203
|
+
|
204
|
+
# @return [Integer] - the temperature of the asic chip
|
205
|
+
def asic_temp
|
206
|
+
0
|
207
|
+
end
|
208
|
+
|
209
|
+
# @return [Integer] - temperature of the memory
|
210
|
+
def mem_temp
|
211
|
+
0
|
212
|
+
end
|
213
|
+
|
214
|
+
# @return [Integer] - the voltage reading of the card, maybe just amd cards (mV)
|
215
|
+
def vddgfx
|
216
|
+
0
|
217
|
+
end
|
218
|
+
|
219
|
+
def temp
|
220
|
+
0
|
221
|
+
end
|
222
|
+
|
223
|
+
def to_h
|
224
|
+
{
|
225
|
+
uuid: uuid,
|
226
|
+
gpuId: "GPU#{index}",
|
227
|
+
syspath: device_path,
|
228
|
+
pciLoc: pci_loc,
|
229
|
+
name: name,
|
230
|
+
bios: bios,
|
231
|
+
subType: subtype,
|
232
|
+
make: make,
|
233
|
+
model: model,
|
234
|
+
vendor: vendor,
|
235
|
+
# memory_name: nil,
|
236
|
+
# memory_type: nil,
|
237
|
+
# gpu_platform: nil,
|
238
|
+
power: power,
|
239
|
+
# power_limit: power_limit,
|
240
|
+
# power_max_limit: power_max_limit,
|
241
|
+
utilization: utilization,
|
242
|
+
# memory_used: memory_used ,
|
243
|
+
# memory_free: memory_free,
|
244
|
+
# memory_total: memory_total,
|
245
|
+
temperature: temp,
|
246
|
+
status: status,
|
247
|
+
pstate: pstate,
|
248
|
+
fanSpeed: fan,
|
249
|
+
type: compute_type,
|
250
|
+
maxTemp: nil,
|
251
|
+
mem: memory_clock,
|
252
|
+
cor: core_clock,
|
253
|
+
vlt: core_voltage,
|
254
|
+
mem_temp: mem_temp,
|
255
|
+
maxFan: nil,
|
256
|
+
dpm: nil,
|
257
|
+
vddci: nil,
|
258
|
+
maxPower: nil,
|
259
|
+
ocProfile: nil,
|
260
|
+
opencl_enabled: use_opencl
|
261
|
+
}
|
262
|
+
end
|
263
|
+
|
264
|
+
# @return [Array] - returns an array of gpu objects, sorted by index
|
265
|
+
def self.find_all(use_opencl = false)
|
266
|
+
require 'compute_unit/gpus/amd_gpu'
|
267
|
+
require 'compute_unit/gpus/nvidia_gpu'
|
268
|
+
g = ComputeUnit::AmdGpu.find_all(use_opencl) + ComputeUnit::NvidiaGpu.find_all(use_opencl)
|
269
|
+
g.sort_by(&:index)
|
270
|
+
end
|
271
|
+
|
272
|
+
# @return [CacheStore] - returns an instance of the cachestore for storing opencl cache
|
273
|
+
def self.opencl_cache
|
274
|
+
@opencl_cache ||= ComputeUnit::CacheStore.new('opencl_cache')
|
275
|
+
end
|
276
|
+
|
277
|
+
# @return [Array, nil] - the opencl devices cached for this system, nil when nothing is cached
def self.opencl_devices_from_cache
  cached = opencl_cache.read_cache('opencl_compute_units', {})
  # the cache entry is written under the checksum converted to a string,
  # so it has to be looked up the same way
  cached[ComputeUnit::Device.system_checksum.to_s]
end
|
282
|
+
|
283
|
+
# @return [Array] - an array of openstruct objects, empty when opencl cannot be loaded or queried
def self.opencl_devices_from_platform
  require 'ostruct'
  # opencl takes a second to load so we cache later in the process
  # which is why we need the openstruct object here
  # opencl can also freeze the system if it tries to enumerate a dead GPU
  # opencl should be used sparingly as a result and only read when absolutely
  # necessary and no dead GPUs.
  # TODO: warn when dead gpus detected
  begin
    require 'opencl_ruby_ffi'
    ComputeUnit::Logger.logger.debug('Searching for openCL devices')
    OpenCL.platforms.map(&:devices).flatten.map do |d|
      # any platform that is not AMD is labelled Nvidia
      type = d.platform.name.include?('AMD') ? 'AMD' : 'Nvidia'
      board_name = type == 'AMD' ? d.board_name_amd : ''
      max_computes = d.respond_to?(:max_compute_units) ? d.max_compute_units : 0
      OpenStruct.new(
        name: d.name,
        type: type,
        board_name: board_name,
        max_compute_units: max_computes
      )
    end
  rescue LoadError => e
    # Must stay the first clause: when the require fails the OpenCL constant does not
    # exist, and evaluating the clause below would raise NameError instead.
    ComputeUnit::Logger.logger.debug("OpenCL error: #{e.message}")
    []
  rescue OpenCL::Error::DEVICE_NOT_FOUND => e
    ComputeUnit::Logger.logger.debug("OpenCL error: #{e.message}, are you root?")
    []
  rescue RuntimeError => e # OpenCL::Error::PLATFORM_NOT_FOUND_KHR,
    ComputeUnit::Logger.logger.debug("OpenCL error: #{e.message}")
    ComputeUnit::Logger.logger.debug("OpenCL error: #{e.backtrace}")
    []
  end
end
|
315
|
+
|
316
|
+
# @return [Array] - list of voltage table entries, always empty in this base implementation
|
317
|
+
def voltage_table
|
318
|
+
[]
|
319
|
+
end
|
320
|
+
|
321
|
+
# @return [Array] - array of devices paths either from amd or nvidia
|
322
|
+
def self.found_devices
|
323
|
+
@found_devices ||= ComputeUnit::AmdGpu.devices + ComputeUnit::NvidiaGpu.devices
|
324
|
+
end
|
325
|
+
|
326
|
+
# @return [Array] - returns an array of opencl devices
# Reads the devices from the cache first and only asks the opencl platform when nothing
# is cached; a non-empty result from the platform is then written to the cache.
# OpenCL should only be used when necessary as it can freeze sometimes
# OpenCL indexes items differently
def self.opencl_devices
  @opencl_devices ||= opencl_devices_from_cache || begin
    items = opencl_devices_from_platform
    # An empty list is also what a failed enumeration returns. Persisting it would make
    # every later lookup return the cached empty list, so only real results are stored.
    unless items.empty?
      opencl_cache.write_cache('opencl_compute_units', ComputeUnit::Device.system_checksum.to_s => items)
    end
    items
  end
end
|
337
|
+
end
|
338
|
+
end
|
@@ -0,0 +1,525 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'compute_unit/gpu'
|
4
|
+
require 'digest'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
module ComputeUnit
|
8
|
+
class AmdGpu < ComputeUnit::Gpu
|
9
|
+
MAKE = 'AMD'
|
10
|
+
VENDOR_ID = '1002'
|
11
|
+
SUBTYPE = 'amdgpu'
|
12
|
+
SYS_DEBUG_PATH = File.join(ComputeUnit::SYSFS_PATH, 'kernel', 'debug', 'dri')
|
13
|
+
|
14
|
+
def initialize(device_path, opts = {})
|
15
|
+
super(device_path, opts)
|
16
|
+
@pci_loc = File.basename(device_path)
|
17
|
+
@model = opts[:model] if opts[:use_opencl]
|
18
|
+
|
19
|
+
@uuid = "GPU#{index}"
|
20
|
+
end
|
21
|
+
|
22
|
+
def meta
|
23
|
+
{}
|
24
|
+
end
|
25
|
+
|
26
|
+
# @return [String, nil] - the bios according to the vbios rom
# sometimes the kernel / driver extracted rom can be incorrect
# this is the bios gathered from the vbios itself.
# The third rom string is used when it contains three digits followed by a dash,
# otherwise a warning is logged and the fourth rom string is returned.
def rom_bios
  # fetch the metadata once instead of once per access
  metadata = rom_metadata
  primary = metadata[2]
  return primary if /\d{3}-/.match?(primary)

  alternate = metadata[3]
  logger.warn("Invalid rom bios name for GPU#{index} using alternate name for #{alternate}")
  alternate
end
|
37
|
+
|
38
|
+
# @return [String::IO] - the contents of the rom file
|
39
|
+
def read_rom_data
|
40
|
+
if File.exist?(debug_rom_path)
|
41
|
+
IO.read(debug_rom_path, mode: 'rb')
|
42
|
+
elsif File.exist?(rom_path)
|
43
|
+
rom_data
|
44
|
+
else
|
45
|
+
''
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# @return [String] - the path to the readonly rom file
def debug_rom_path
  # Memoized under its own name. The value used to be stored in @rom_path, which is not
  # the name of this method (read_rom_data uses both debug_rom_path and rom_path).
  @debug_rom_path ||= File.join(SYS_DEBUG_PATH, index.to_s, 'amdgpu_vbios')
end
|
53
|
+
|
54
|
+
# @return [Array] - up to ten readable strings (ten or more characters each) from the rom file
def rom_metadata
  # ||= so the rom is only read and scanned once per instance
  @rom_metadata ||= begin
    printable_chars = %r{[A-Za-z0-9`~!@#%^&*()-_=+|'";:/?.>,< \t\$\{\}\[\]\\]{10,}}
    read_rom_data.scan(printable_chars)[0..9]
  end
end
|
61
|
+
|
62
|
+
# @return [String, nil] - returns the name of compute board, nil when opencl has no board name
# for vegas we have to also get the compute units
def board_name
  @board_name ||= begin
    raw = opencl_board_name
    return nil unless raw

    # strip every marketing token and collapse the gaps they leave behind
    cleaned = raw.gsub(/Series|\(TM\)|Graphics/, '').gsub(/\s{2,}/, ' ').strip
    /vega/i.match?(cleaned) ? "#{cleaned} #{opencl_units}" : cleaned
  end
end
|
72
|
+
|
73
|
+
# @return [Array] - returns a list of device paths of all devices specific to the vendor id
|
74
|
+
def self.devices
|
75
|
+
ComputeUnit::Gpu.devices.find_all { |f| device_vendor(f) == VENDOR_ID }
|
76
|
+
end
|
77
|
+
|
78
|
+
def name
|
79
|
+
model
|
80
|
+
end
|
81
|
+
|
82
|
+
# @return [String] - the name of the device model (specific name)
|
83
|
+
def model
|
84
|
+
@model ||= begin
|
85
|
+
board_name || sysfs_model_name
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def load
|
90
|
+
utilization
|
91
|
+
end
|
92
|
+
|
93
|
+
# @return [Integer] - returns temp of gpu in celsius
|
94
|
+
def temp
|
95
|
+
read_hwmon_data('temp1_input', 0).to_i / 1000
|
96
|
+
end
|
97
|
+
|
98
|
+
# @return [Integer] - returns fan rpm speed, 0 if cannot be found
|
99
|
+
def fan
|
100
|
+
read_hwmon_data('fan1_input', 0).to_i
|
101
|
+
end
|
102
|
+
|
103
|
+
# @return [Numeric] - returns voltage of core in mV
|
104
|
+
def core_voltage
|
105
|
+
dpm_core_vddc.zero? ? vddgfx.to_i : dpm_core_vddc
|
106
|
+
end
|
107
|
+
|
108
|
+
def configured_core_voltage
|
109
|
+
vddc
|
110
|
+
end
|
111
|
+
|
112
|
+
# @return [Integer, nil] - the memory speed taken from the pp_dpm_mclk row marked with '*',
#   nil when no row is marked
def memory_clock
  rows = read_kernel_setting('pp_dpm_mclk', '').split("\n")
  active = rows.find { |row| row.include?('*') }
  return active if active.nil?

  active[/\d{2,6}/].to_i
end
|
118
|
+
|
119
|
+
# @return [Integer, nil] - the core clock speed taken from the pp_dpm_sclk row marked with '*',
#   nil when no row is marked
def core_clock
  rows = read_kernel_setting('pp_dpm_sclk', '').split("\n")
  active = rows.find { |row| row.include?('*') }
  return active if active.nil?

  active[/\d{2,6}/].to_i
end
|
125
|
+
|
126
|
+
# @return [Integer] - the core voltage reading of the GPU via HWMON
|
127
|
+
def vddgfx
|
128
|
+
read_hwmon_data('in0_input', 0).to_i
|
129
|
+
end
|
130
|
+
|
131
|
+
# currently configured gpu core voltage
|
132
|
+
# @return [Numeric] - returns voltage of core in mV
|
133
|
+
def vddc
|
134
|
+
read_kernel_setting('pp_voltage', 0).to_i
|
135
|
+
end
|
136
|
+
|
137
|
+
# currently running gpu core voltage
|
138
|
+
def dpm_core_vddc
|
139
|
+
read_kernel_setting('pp_core_vddc', 0).to_i
|
140
|
+
end
|
141
|
+
|
142
|
+
def subtype
|
143
|
+
SUBTYPE
|
144
|
+
end
|
145
|
+
|
146
|
+
def clock_limits
|
147
|
+
read_kernel_setting('pp_od_clk_limits', '')
|
148
|
+
end
|
149
|
+
|
150
|
+
def gpu_defaults
|
151
|
+
read_kernel_setting('gpu_defaults', '')
|
152
|
+
end
|
153
|
+
|
154
|
+
# @return [Array] - array of hashes of voltages {:pstate=>0, :clk=>300, :volt=>750, :type=>:sclk},
#   empty when pp_od_clk_voltage cannot be read or has no OD_SCLK section
def voltage_table
  data = read_kernel_setting('pp_od_clk_voltage', nil)
  return [] if data.nil?

  # the text between the OD_SCLK: and OD_MCLK: headers is the core clock table
  _, sclk, = data.split(/OD_[S,M]CLK:\s?\n/)
  # empty or unexpected file content has no such section, so there is no table to report
  return [] if sclk.nil?

  sclk.split("\n").map do |line|
    pstate, clk, volt, = line.gsub(/:|Mhz|mV/, '').split(/\s{2,}/).map(&:to_i)
    { pstate: pstate, clk: clk, volt: volt, type: :sclk }
  end
end
|
165
|
+
|
166
|
+
# @return [Array, nil] - array of hashes of voltages {:pstate=>0, :clk=>300, :volt=>750, :type=>:mclk}, nil when the setting has no data
|
167
|
+
def vddci_voltage_table
|
168
|
+
# not sure if this is what mclk is but left it here anyways
|
169
|
+
data = read_kernel_setting('pp_od_clk_voltage', nil)
|
170
|
+
return data if data.nil?
|
171
|
+
|
172
|
+
_, _, mclk = data.split(/OD_[S,M]CLK:\s?\n/)
|
173
|
+
mclk.split("\n").map do |line|
|
174
|
+
pstate, clk, volt, = line.gsub(/:|Mhz|mV/, '').split(/\s{2,}/).map(&:to_i)
|
175
|
+
{ pstate: pstate, clk: clk, volt: volt, type: :mclk }
|
176
|
+
end
|
177
|
+
end
|
178
|
+
|
179
|
+
# @return [Integer] the average power being used by the gpu (integer division drops any fraction)
|
180
|
+
def power_average
|
181
|
+
# TODO: if a gpu crashes the average power can sometimes take 3000 ms to read!
|
182
|
+
read_hwmon_data('power1_average', 0).to_i / 1000000
|
183
|
+
end
|
184
|
+
|
185
|
+
# @return [Float] the power being used by the gpu
|
186
|
+
def power
|
187
|
+
pp_value = read_kernel_setting('pp_power_usage', 0).to_i
|
188
|
+
value = pp_value > 0 ? pp_value : power_average
|
189
|
+
value + power_offset
|
190
|
+
end
|
191
|
+
|
192
|
+
# @return [String] - the name of the bios which is unique for every card
|
193
|
+
def bios
|
194
|
+
@bios ||= begin
|
195
|
+
a = read_kernel_setting('vbios_version', 'unreadable').upcase
|
196
|
+
b = rom_bios
|
197
|
+
/\d{3}-/.match?(b) ? b : a
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
def pstate
|
202
|
+
-1
|
203
|
+
end
|
204
|
+
|
205
|
+
# @return [String] - the serial number of the card
|
206
|
+
def serial
|
207
|
+
'unknown'
|
208
|
+
end
|
209
|
+
|
210
|
+
# @param value [Numeric] - the power limit that should be applied to the gpu
|
211
|
+
# @return [Numeric] - original passed in value after being set
|
212
|
+
def power_limit=(value)
|
213
|
+
max = power_max_limit
|
214
|
+
raise ArgumentError.new("Power Value #{value} cannot exceed #{max}") if value > max
|
215
|
+
raise ArgumentError.new("Value must be between 10 and #{max}") if value < 10
|
216
|
+
|
217
|
+
# hwmon expects the value to have 6 zeros
|
218
|
+
write_hwmon_data('power1_cap', value * 1000000)
|
219
|
+
# logger.info("GPU#{index} power set to #{value} Watts")
|
220
|
+
end
|
221
|
+
|
222
|
+
# @param value [Numeric] - the fan limit that should be applied to the gpu as a percentage (20-100)
# @param type [String] - which limit to set, one of 'current', 'min' or 'max'
# @return [Numeric] - original passed in value after being set
# @raise [ArgumentError] when type is unknown or value is outside 20-100
def set_fan_limit(value, type = 'current')
  hwmon_file = { 'min' => 'pwm1_min', 'max' => 'pwm1_max', 'current' => 'pwm1' }.fetch(type) do
    raise ArgumentError, "Invalid fan setting type, must be one of 'current, min or max'"
  end
  raise ArgumentError, 'Fan limit cannot exceed 100' if value > 100
  raise ArgumentError, 'Fan limit value must be between 20 and 100' if value < 20

  # nothing is written until every argument has been validated, so a rejected
  # request leaves the fan settings untouched
  write_hwmon_data('fan1_enable', '1')
  # Value must be between 0-255
  amount = (255 * (value / 100.0)).round
  logger.debug("Setting #{type} Fan on GPU#{index} to #{amount}")
  write_hwmon_data(hwmon_file, amount)
  logger.info("GPU#{index} #{type} fan set to #{value} percent")
  value
end
|
245
|
+
|
246
|
+
# @return [Numeric] - current fan limit as a percentage
|
247
|
+
# @note the OS values is between 0 - 255
|
248
|
+
def fan_limit
|
249
|
+
cur = read_hwmon_data('pwm1', 0).to_i
|
250
|
+
return cur unless cur > 0
|
251
|
+
|
252
|
+
((cur / 255.0) * 100).round(0)
|
253
|
+
end
|
254
|
+
|
255
|
+
# @return [Numeric] - max fan limit as a percentage
|
256
|
+
# @note the OS values is between 0 - 255
|
257
|
+
def fan_max_limit
|
258
|
+
cur = read_hwmon_data('pwm1_max', 0).to_i
|
259
|
+
return cur unless cur > 0
|
260
|
+
|
261
|
+
((cur / 255.0) * 100).round(0)
|
262
|
+
end
|
263
|
+
|
264
|
+
# @return [Numeric] - min fan limit as a percentage
|
265
|
+
# @note the OS values is between 0 - 255
|
266
|
+
def fan_min_limit
|
267
|
+
cur = read_hwmon_data('pwm1_min', 0).to_i
|
268
|
+
return cur unless cur > 0
|
269
|
+
|
270
|
+
((cur / 255.0) * 100).round(0)
|
271
|
+
end
|
272
|
+
|
273
|
+
# @return [Numeric] - current power limit
|
274
|
+
def power_limit
|
275
|
+
read_hwmon_data('power1_cap', 0).to_i / 1000000
|
276
|
+
end
|
277
|
+
|
278
|
+
# @return [Numeric] - the maximum power that can be set
|
279
|
+
def power_max_limit
|
280
|
+
read_hwmon_data('power1_cap_max').to_i / 1000000
|
281
|
+
end
|
282
|
+
|
283
|
+
def memory_total
|
284
|
+
0
|
285
|
+
end
|
286
|
+
|
287
|
+
def memory_used
|
288
|
+
0
|
289
|
+
end
|
290
|
+
|
291
|
+
def memory_free
|
292
|
+
0
|
293
|
+
end
|
294
|
+
|
295
|
+
# @return [Integer] - the :load value from amdgpu_pm_info as an integer, 0 when there is no :load entry
def utilization
  load_entry = amdgpu_pm_info[:load]
  load_entry ? load_entry[:value].to_i : 0
end
|
300
|
+
|
301
|
+
def self.create_from_path(device_path, index, use_opencl = false)
|
302
|
+
opts = {
|
303
|
+
device_class_id: device_class(device_path),
|
304
|
+
device_id: device(device_path),
|
305
|
+
device_vendor_id: device_vendor(device_path),
|
306
|
+
subsystem_vendor_id: subsystem_vendor(device_path),
|
307
|
+
subsystem_device_id: subsystem_device(device_path),
|
308
|
+
use_opencl: use_opencl,
|
309
|
+
index: index
|
310
|
+
}
|
311
|
+
new(device_path, opts)
|
312
|
+
end
|
313
|
+
|
314
|
+
# @param use_opencl [Boolean] - passed through to create_from_path
# @return [Array] - one entry per AMD device path, each built by create_from_path with the
#   position of that path in ComputeUnit::Gpu.found_devices as its index
def self.find_all(use_opencl = false)
  devices.map do |device_path|
    position = ComputeUnit::Gpu.found_devices.index(device_path)
    create_from_path(device_path, position, use_opencl)
  end
end
|
321
|
+
|
322
|
+
# Reads a file from the dri debug directory of this gpu.
# @param file_name [String] - name of the file inside debug_dri_dir
# @param default [Object] - value returned when the file cannot be read
# @return [String, Object] - the file content, or default on EINVAL, ENOENT or EACCES
def read_dri_debug_file(file_name, default = '')
  File.read(File.join(debug_dri_dir, file_name))
rescue Errno::EINVAL, Errno::ENOENT
  default
rescue Errno::EACCES
  logger.debug('run this command as root or with sudo, using default values')
  default
end
|
332
|
+
|
333
|
+
# @return [String] - returns the path the debug dri directory
|
334
|
+
# ie. "/sys/kernel/debug/dri/0"
|
335
|
+
def debug_dri_dir
|
336
|
+
@debug_dri_dir ||= begin
|
337
|
+
# if the user does not have permission the path will be nil
|
338
|
+
path = Dir.glob(File.join(SYS_DEBUG_PATH, '*', 'name')).find { |file| File.read(file).include?(pci_loc) }
|
339
|
+
raise Errno::EACCES.new("Permission denied #{SYS_DEBUG_PATH}") unless path
|
340
|
+
|
341
|
+
File.dirname(path)
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# @return [Hash] - pm info keyed by reading name, for example:
|
346
|
+
# {:mclk=>{:value=>"1950", :unit=>"MHz"},
|
347
|
+
# :sclk=>{:value=>"1125", :unit=>"MHz"},
|
348
|
+
# :vddgfx=>{:value=>"950", :unit=>"mV"},
|
349
|
+
# :vddc=>{:value=>"61.49", :unit=>"W"},
|
350
|
+
# :vddci=>{:value=>"1.0", :unit=>"W"},
|
351
|
+
# :max_gpu=>{:value=>"81.243", :unit=>"W"},
|
352
|
+
# :average_gpu=>{:value=>"82.117", :unit=>"W"},
|
353
|
+
# :temperature=>{:value=>"41", :unit=>"C"},
|
354
|
+
# :load=>{:value=>"100", :unit=>"%"}}
|
355
|
+
def amdgpu_pm_info
|
356
|
+
@amdgpu_pm_info ||= begin
|
357
|
+
content = read_dri_debug_file('amdgpu_pm_info')
|
358
|
+
data = content.scan(/(\d+\.?\d*)\s+(\w*)\s\(([\w\s]*)\)?/) + content.scan(/(\w*):\s(\d+)\s(.*)/).map(&:rotate)
|
359
|
+
data_hash = {}
|
360
|
+
data.each do |value, unit, name|
|
361
|
+
data_hash[name.gsub(/\s/, '_').downcase.to_sym] = { value: value, unit: unit }
|
362
|
+
end
|
363
|
+
data_hash
|
364
|
+
end
|
365
|
+
end
|
366
|
+
|
367
|
+
# @return [String, nil] - the current power_dpm_force_performance_level setting (nil is passed as the read default)
|
368
|
+
def dpm_force_performance
|
369
|
+
read_kernel_setting('power_dpm_force_performance_level', nil)
|
370
|
+
end
|
371
|
+
|
372
|
+
# @param setting [String] - the dpm performance setting to adjust the dpm (manual or auto)
# @return [Object] - whatever write_kernel_setting returns for the write
# @raise [ArgumentError] when setting is anything other than exactly manual or auto
def dpm_force_performance_setting(setting = 'manual')
  # anchored so that values which merely contain "manual" or "auto" are rejected
  unless /\A(?:manual|auto)\z/.match?(setting.to_s)
    raise ArgumentError, 'setting must be one of manual or auto'
  end

  write_kernel_setting('power_dpm_force_performance_level', "#{setting}\n")
end
|
379
|
+
|
380
|
+
def reset_to_defaults
|
381
|
+
dpm_force_performance_setting('auto')
|
382
|
+
write_kernel_setting('pp_od_clk_voltage', 'r')
|
383
|
+
write_kernel_setting('pp_od_clk_voltage', 'c')
|
384
|
+
write_hwmon_data('pwm1_enable', '2')
|
385
|
+
end
|
386
|
+
|
387
|
+
# @return [Array]
|
388
|
+
# reading from file "Sclk Limit: 2000 Mhz", "Mclk Limit: 2250 Mhz"
|
389
|
+
# @example [2000, 2250]
|
390
|
+
def clock_max_defaults
|
391
|
+
read_kernel_setting('pp_od_clk_limits', '0 0').scan(/\d+/).map(&:to_i)
|
392
|
+
end
|
393
|
+
|
394
|
+
# @return [Integer]
|
395
|
+
def max_core_clock
|
396
|
+
clock_max_defaults.first
|
397
|
+
end
|
398
|
+
|
399
|
+
# @return [Integer]
|
400
|
+
def min_core_clock
|
401
|
+
voltage_table[0][:clk]
|
402
|
+
end
|
403
|
+
|
404
|
+
# @return [Integer]
|
405
|
+
def max_mem_clock
|
406
|
+
clock_max_defaults.last # or vddci_voltage_table.last[:clk]
|
407
|
+
end
|
408
|
+
|
409
|
+
# @return [Integer]
|
410
|
+
def min_mem_clock
|
411
|
+
vddci_voltage_table.first[:clk]
|
412
|
+
end
|
413
|
+
|
414
|
+
# @return [Integer]
|
415
|
+
def max_mem_volt
|
416
|
+
vddci_voltage_table.last[:volt]
|
417
|
+
end
|
418
|
+
|
419
|
+
# @return [Integer]
|
420
|
+
def min_mem_volt
|
421
|
+
vddci_voltage_table.first[:volt]
|
422
|
+
end
|
423
|
+
|
424
|
+
# @return [Integer] - the temperature of the asic chip
|
425
|
+
def asic_temp
|
426
|
+
read_hwmon_data('temp2_input', 0).to_i / 1000
|
427
|
+
end
|
428
|
+
|
429
|
+
# @return [Integer] - the temperature of the memory chips
|
430
|
+
def mem_temp
|
431
|
+
read_hwmon_data('temp3_input', 0).to_i / 1000
|
432
|
+
end
|
433
|
+
|
434
|
+
# Writes a memory clock / voltage pair to pp_od_clk_voltage (row 3 of the memory table)
# and then writes 3 to pp_mclk_od. Does nothing unless experimental_on? is true.
# @param mem_clock [#to_i] - memory clock, must be within min_mem_clock..max_mem_clock
# @param mem_volt [#to_i] - memory voltage, must be within min_mem_volt..max_mem_volt
# @return [Object, nil] - nil when experimental mode is off, otherwise the result of the final log call
# @raise [ArgumentError] when either value is outside its allowed range
def set_mem_clock_and_vddc(mem_clock, mem_volt)
  return unless experimental_on?

  mem_clock = mem_clock.to_i
  mem_volt = mem_volt.to_i
  # TODO: find max and min values and limit input
  # validate before changing any kernel setting so that a rejected request
  # does not leave the gpu switched to manual dpm
  unless mem_clock.between?(min_mem_clock, max_mem_clock)
    raise ArgumentError, "MemClock value #{mem_clock} must be between #{min_mem_clock}-#{max_mem_clock}"
  end
  unless mem_volt.between?(min_mem_volt, max_mem_volt)
    raise ArgumentError, "MemVolt value #{mem_volt} must be between #{min_mem_volt}-#{max_mem_volt}"
  end

  dpm_force_performance_setting('manual')
  write_kernel_setting('pp_od_clk_voltage', "r\n") # unlocks in order to write
  # set row in table (m = manual), 3 = row,
  write_kernel_setting('pp_od_clk_voltage', "m 3 #{mem_clock} #{mem_volt}\n")
  write_kernel_setting('pp_od_clk_voltage', "c\n") # locks file
  write_kernel_setting('pp_mclk_od', "3\n")
  logger.info("Successfully applied overclock #{mem_clock} #{mem_volt} to #{name} at #{pci_loc}")
end
|
451
|
+
end
|
452
|
+
end
|
453
|
+
|
454
|
+
# See https://www.kernel.org/doc/html/latest/gpu/amdgpu.html for sysfs kernel
|
455
|
+
# /sys/kernel/debug/dri/2/name
|
456
|
+
# amdgpu dev=0000:08:00.0 unique=0000:08:00.0
|
457
|
+
|
458
|
+
# Add percent overclock to core speed
|
459
|
+
# sudo echo "7" > /sys/class/drm/card0/device/pp_sclk_od
|
460
|
+
|
461
|
+
# Add percent overclock to mem speed
|
462
|
+
# sudo echo "4" > /sys/class/drm/card0/device/pp_mclk_od
|
463
|
+
|
464
|
+
# The way the current AMDGPU overclocking works for the core frequency
|
465
|
+
# is by writing an integer value between 0 and 20 to
|
466
|
+
# /sys/class/drm/card0/device/pp_sclk_od. That value represents an
|
467
|
+
# overclock of 0~20% above the GPU's core frequency. Similarly,
|
468
|
+
# writing a value to /sys/class/drm/card0/device/pp_mclk_od represents
|
469
|
+
# a percentage-based overclock to the memory frequency.
|
470
|
+
#
|
471
|
+
#
|
472
|
+
# You can change the frequencies and voltage by modifying
|
473
|
+
# the file /sys/class/drm/card0/device/pp_od_clk_voltage
|
474
|
+
#
|
475
|
+
# first: This holds the presets for pp_dpm_sclk and pp_dpm_mclk.
|
476
|
+
#
|
477
|
+
# Second check the current settings:
|
478
|
+
#
|
479
|
+
# sudo cat /sys/class/drm/card0/device/pp_od_clk_voltage
|
480
|
+
# You should see something similar to this:
|
481
|
+
#
|
482
|
+
# OD_SCLK:
|
483
|
+
# 0: 300MHz 750mV
|
484
|
+
# 1: 588MHz 765mV
|
485
|
+
# 2: 980MHz 987mV
|
486
|
+
# 3: 1100MHz 950mV
|
487
|
+
# 4: 1100MHz 950mV
|
488
|
+
# 5: 1100MHz 950mV
|
489
|
+
# 6: 1100MHz 950mV
|
490
|
+
# 7: 1100MHz 950mV
|
491
|
+
# OD_MCLK:
|
492
|
+
# 0: 300MHz 750mV
|
493
|
+
# 1: 1000MHz 800mV
|
494
|
+
# 2: 1970MHz 950mV
|
495
|
+
# OD_RANGE:
|
496
|
+
# SCLK: 300MHz 2000MHz
|
497
|
+
# MCLK: 300MHz 2250MHz
|
498
|
+
# VDDC: 750mV 1150mV
|
499
|
+
# Example to set 1280Mhz at 950mV (check your output above for possible ranges!):
|
500
|
+
#
|
501
|
+
# sudo echo "s 7 1280 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
502
|
+
# Then to apply the changes:
|
503
|
+
#
|
504
|
+
# sudo echo 0 > /sys/class/drm/card0/device/pp_sclk_od
|
505
|
+
# sudo echo 1 > /sys/class/drm/card0/device/pp_sclk_od
|
506
|
+
# These are all the settings I use on my crypto mining card which is an AMD Radeon RX570 in case it's useful to anyone.
|
507
|
+
#
|
508
|
+
# echo 1 > /sys/class/drm/card0/device/hwmon/hwmon0/pwm1_enable
|
509
|
+
# echo manual > /sys/class/drm/card0/device/power_dpm_force_performance_level
|
510
|
+
# echo 200 > /sys/class/drm/card0/device/hwmon/hwmon0/pwm1 # Fan speed
|
511
|
+
# echo 4 > /sys/class/drm/card0/device/pp_power_profile_mode # Compute Mode
|
512
|
+
#
|
513
|
+
# echo "s 3 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
514
|
+
# echo "s 4 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
515
|
+
# echo "s 5 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
516
|
+
# echo "s 6 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
517
|
+
# echo "s 7 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
518
|
+
#
|
519
|
+
# echo "m 2 1985 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
|
520
|
+
#
|
521
|
+
# echo 0 > /sys/class/drm/card0/device/pp_sclk_od
|
522
|
+
# echo 1 > /sys/class/drm/card0/device/pp_sclk_od
|
523
|
+
#
|
524
|
+
# echo 0 > /sys/class/drm/card0/device/pp_mclk_od
|
525
|
+
# echo 1 > /sys/class/drm/card0/device/pp_mclk_od
|