compute_unit 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module ComputeUnit
  # Error classes used across the ComputeUnit namespace.
  #
  # Every class derives from RuntimeError, so a bare `rescue` (which rescues
  # StandardError) catches all of them.
  module Exceptions
    class NoPermission < RuntimeError
    end

    class PermissionDenied < RuntimeError
    end

    class UnsupportedGPU < RuntimeError
    end

    class NotSupported < RuntimeError
    end

    class UnsupportedOSversion < RuntimeError
    end

    class NoWorkerName < RuntimeError
    end

    class NoComputeUnits < RuntimeError
    end

    class InvalidPCIDatabase < RuntimeError
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module ComputeUnit
  # Mixin with helpers that scale values by one million and optionally
  # append the micro-BTC unit label.
  module Formatters
    # Scales the :hourly_cost, :hourly_earnings and :kwh_cost entries of a hash.
    #
    # @param item [Hash] source hash; it is not modified
    # @param add_unit [Boolean] when true the scaled value is rendered as a
    #   String followed by " \u00B5BTC", otherwise the number is kept
    # @return [Hash] copy of item with the recognised keys replaced by the
    #   scaled (value * 1_000_000, rounded to 4 places) representation
    def micro_formatter(item, add_unit = false)
      scaled_entries = item.each_with_object({}) do |(key, value), acc|
        next unless %i[hourly_cost hourly_earnings kwh_cost].include?(key)

        micro = (value * 1_000_000).round(4)
        acc[key] = if add_unit
                     "#{micro} \u00B5BTC"
                   else
                     micro
                   end
      end
      item.merge(scaled_entries)
    end

    # Scales a single value by one million, rounded to 1 place.
    #
    # @param value [Numeric] the value to scale
    # @param add_unit [Boolean] when true a String with the " \u00B5BTC"
    #   suffix is returned
    # @return [Numeric, String] the scaled value, with or without the unit
    def value_micro_formatter(value, add_unit = false)
      micro = (value * 1_000_000).round(1)
      return micro unless add_unit

      "#{micro} \u00B5BTC"
    end
  end
end
@@ -0,0 +1,338 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'compute_unit/compute_base'
4
+ require 'compute_unit/cache_store'
5
module ComputeUnit
  # Generic GPU compute unit built on top of ComputeBase.
  #
  # Most hardware readings (fan, power, utilization, memory_*) raise
  # NotImplementedError here or return a neutral 0/nil; vendor subclasses
  # are expected to override them.
  class Gpu < ComputeBase
    # Device class id compared against ComputeUnit::Device.device_class.
    DEVICE_CLASS = '030000'
    DEVICE_CLASS_NAME = 'GPU'

    attr_reader :pci_loc, :bios, :name
    # Only the writer is generated for power_limit: the reader is defined
    # explicitly below, so generating it here would only trigger a
    # "method redefined" warning.
    attr_writer :power_limit
    attr_accessor :use_opencl

    # @return [Symbol] the compute type, i.e. the value of `type`
    def compute_type
      type
    end

    # @return [OpenStruct, nil] the OpenCL device entry matching this GPU's make
    #   and index, nil when OpenCL is disabled or no such entry exists
    def opencl_device
      return unless use_opencl

      @opencl_device ||= self.class.opencl_devices.select { |cu| cu[:type] == make }[index]
    end

    # @return [String, nil] the raw board name reported by OpenCL, nil if no device
    def opencl_board_name
      return unless use_opencl

      @opencl_board_name ||= opencl_device&.board_name
    end

    # @return [Integer, nil] the number of compute units detected by OpenCL,
    #   nil when OpenCL is disabled or no device was found
    # @note not to be confused with stream processors. Can be helpful when
    #   determining which product (vega56 or vega64) is installed
    def opencl_units
      return unless use_opencl

      # nil-safe like opencl_board_name: a missing device must not raise
      @opencl_units ||= opencl_device&.max_compute_units.to_i
    end

    # @return [String, nil] the device name, ie. GeForce GTX 1070 or RX 580,
    #   nil when OpenCL is disabled or no device was found
    # @note not really needed for Nvidia types since nvidia-smi returns really complete information
    def opencl_name
      return unless use_opencl

      # nil-safe like opencl_board_name: a missing device must not raise
      @opencl_name ||= opencl_device&.name
    end

    # @return [Array] a sorted list of device paths of all devices whose
    #   device class equals DEVICE_CLASS
    # @note this can mean AMD, NVIDIA, Intel or other embedded devices
    def self.devices
      @devices ||= ComputeUnit::ComputeBase.devices.select do |device|
        ComputeUnit::Device.device_class(device) == DEVICE_CLASS
      end.sort
    end

    # @param device_path [String] - the pci bus path to the device
    # @param opts [Hash]
    # @option bios [String] the bios id
    # @option model [String] the model name
    # @option serial [String] the serial id of the device
    # @option busid [String] the pci bus path of the device
    # @option meta [Hash] metadata about the device
    # @option index [Integer] the index of the device found in the device tree
    # @option uuid [String] the uuid of the device
    # @option use_opencl [Boolean] set to true if you want to get info about the device from opencl, defaults to false
    def initialize(device_path, opts = {})
      super(device_path, opts)
      @type = :GPU
      @bios = opts[:bios].upcase if opts[:bios]
      @model = opts[:model]
      @serial = opts[:serial]
      @pci_loc = opts[:busid]
      @meta = opts[:meta]
      @index = opts[:index].to_i
      @uuid = opts[:uuid] || opts[:serial]
      # NOTE(review): `model` is evaluated before @use_opencl is assigned, so a
      # subclass whose `model` consults OpenCL sees it as disabled at this point.
      @name = model
      @power_offset = 0
      @use_opencl = opts[:use_opencl] || false
    end

    # @raise [NotImplementedError] subclasses must provide the fan reading
    def fan
      raise NotImplementedError
    end

    # Derives a status code from utilization and power.
    #
    # @return [Integer] 0 when utilization > 20 and power >= 50,
    #   2 when power < 20, otherwise 1
    def status
      return 0 if utilization > 20 && power >= 50
      return 2 if power < 20

      1
    end

    # @raise [NotImplementedError] subclasses must provide the power reading
    def power
      raise NotImplementedError
    end

    # @raise [NotImplementedError] subclasses must provide the pstate
    def pstate
      raise NotImplementedError
    end

    # @return [Integer] - a percentage value of the current fan limit
    def fan_limit
      fan
    end

    # @return [Integer, nil] - a percentage value of the min fan limit, nil here
    def fan_min_limit
      nil
    end

    # @return [Integer, nil] - a percentage value of the max fan limit, nil here
    def fan_max_limit
      nil
    end

    # @raise [NotImplementedError] subclasses must provide the power limit
    def power_limit
      raise NotImplementedError
    end

    # @raise [NotImplementedError] subclasses must provide the max power limit
    def power_max_limit
      raise NotImplementedError
    end

    # @raise [NotImplementedError] subclasses must provide the total memory
    def memory_total
      raise NotImplementedError
    end

    # @raise [NotImplementedError] subclasses must provide the used memory
    def memory_used
      raise NotImplementedError
    end

    # @raise [NotImplementedError] subclasses must provide the free memory
    def memory_free
      raise NotImplementedError
    end

    # @raise [NotImplementedError] subclasses must provide the utilization
    def utilization
      raise NotImplementedError
    end

    # @return [Integer] - the memory speed, 0 in the base class
    def memory_clock
      0
    end

    # @return [Integer] - the memory voltage, 0 in the base class
    def memory_volt
      0
    end

    # @return [Integer] - the core clock speed, 0 in the base class
    def core_clock
      0
    end

    # @return [Numeric] - returns voltage of core in mV, 0 in the base class
    def core_voltage
      0
    end

    # @return [Numeric] - returns configured voltage of core in mV, 0 in the base class
    def configured_core_voltage
      0
    end

    # @return [Hash] - hash of memory related readings for the gpu
    def mem_info
      {
        index: "#{device_class_name}#{index}",
        name: name,
        volt: memory_volt,
        clock: memory_clock,
        memory_name: nil,
        memory_type: nil,
        memory_used: memory_used,
        memory_free: memory_free,
        memory_total: memory_total,
        mem_temp: mem_temp
      }
    end

    # @return [Hash] - hash of hardware status about the gpu
    def status_info
      {
        index: "#{device_class_name}#{index}",
        name: name,
        bios: bios,
        core_clock: core_clock,
        memory_clock: memory_clock,
        power: power,
        fan: fan,
        core_volt: core_voltage,
        temp: temp,
        mem_temp: mem_temp,
        status: status
      }
    end

    # @return [Hash] - hash of information about the gpu data
    def hardware_info
      {
        uuid: uuid,
        gpuId: "GPU#{index}",
        syspath: device_path,
        pciLoc: pci_loc,
        name: name,
        bios: bios,
        subType: subtype,
        make: make,
        model: model,
        vendor: vendor
      }
    end

    # @return [Integer] - the temperature of the asic chip, 0 in the base class
    def asic_temp
      0
    end

    # @return [Integer] - temperature of the memory, 0 in the base class
    def mem_temp
      0
    end

    # @return [Integer] - the voltage reading of the card, maybe just amd cards (mV)
    def vddgfx
      0
    end

    # @return [Integer] - the gpu temperature, 0 in the base class
    def temp
      0
    end

    # @return [Hash] - identification data combined with the current readings
    def to_h
      {
        uuid: uuid,
        gpuId: "GPU#{index}",
        syspath: device_path,
        pciLoc: pci_loc,
        name: name,
        bios: bios,
        subType: subtype,
        make: make,
        model: model,
        vendor: vendor,
        # memory_name: nil,
        # memory_type: nil,
        # gpu_platform: nil,
        power: power,
        # power_limit: power_limit,
        # power_max_limit: power_max_limit,
        utilization: utilization,
        # memory_used: memory_used ,
        # memory_free: memory_free,
        # memory_total: memory_total,
        temperature: temp,
        status: status,
        pstate: pstate,
        fanSpeed: fan,
        type: compute_type,
        maxTemp: nil,
        mem: memory_clock,
        cor: core_clock,
        vlt: core_voltage,
        mem_temp: mem_temp,
        maxFan: nil,
        dpm: nil,
        vddci: nil,
        maxPower: nil,
        ocProfile: nil,
        opencl_enabled: use_opencl
      }
    end

    # @param use_opencl [Boolean] forwarded to the vendor specific finders
    # @return [Array] - returns an array of gpu objects, sorted by index
    def self.find_all(use_opencl = false)
      # required lazily: the vendor files themselves require this file
      require 'compute_unit/gpus/amd_gpu'
      require 'compute_unit/gpus/nvidia_gpu'
      gpus = ComputeUnit::AmdGpu.find_all(use_opencl) + ComputeUnit::NvidiaGpu.find_all(use_opencl)
      gpus.sort_by(&:index)
    end

    # @return [CacheStore] - returns an instance of the cachestore for storing opencl cache
    def self.opencl_cache
      @opencl_cache ||= ComputeUnit::CacheStore.new('opencl_cache')
    end

    # @return [Array, nil] - array of openstruct or nil when nothing is cached
    #   for the current system checksum
    def self.opencl_devices_from_cache
      data = opencl_cache.read_cache('opencl_compute_units', {})
      # opencl_devices writes the entry under the stringified checksum, so the
      # lookup must use the same key form or the cache can never hit
      data[ComputeUnit::Device.system_checksum.to_s]
    end

    # @return [Array] - an array of openstruct objects, empty when OpenCL
    #   reports an error
    def self.opencl_devices_from_platform
      require 'ostruct'
      # opencl takes a second to load so we cache later in the process
      # which is why we need the openstruct object here
      # opencl can also freeze the system if it tries to enumerate a dead GPU
      # opencl should be used sparingly as a result and only read when absolutely
      # necessary and no dead GPUs.
      # TODO: warn when dead gpus detected
      begin
        require 'opencl_ruby_ffi'
        ComputeUnit::Logger.logger.debug('Searching for openCL devices')
        OpenCL.platforms.map(&:devices).flatten.map do |d|
          type = d.platform.name.include?('AMD') ? 'AMD' : 'Nvidia'
          board_name = type == 'AMD' ? d.board_name_amd : ''
          max_computes = d.respond_to?(:max_compute_units) ? d.max_compute_units : 0
          OpenStruct.new(
            name: d.name,
            type: type,
            board_name: board_name,
            max_compute_units: max_computes
          )
        end
      rescue OpenCL::Error::DEVICE_NOT_FOUND => e
        ComputeUnit::Logger.logger.debug("OpenCL error: #{e.message}, are you root?")
        []
      rescue RuntimeError => e # OpenCL::Error::PLATFORM_NOT_FOUND_KHR,
        ComputeUnit::Logger.logger.debug("OpenCL error: #{e.message}")
        ComputeUnit::Logger.logger.debug("OpenCL error: #{e.backtrace}")
        []
      end
    end

    # @return [Array] - the voltage table entries, empty in the base class
    def voltage_table
      []
    end

    # @return [Array] - array of devices paths either from amd or nvidia
    def self.found_devices
      @found_devices ||= begin
        # make sure both vendor classes are loaded even when the caller did not
        # go through Gpu.find_all first (e.g. AmdGpu.find_all called directly)
        require 'compute_unit/gpus/amd_gpu'
        require 'compute_unit/gpus/nvidia_gpu'
        ComputeUnit::AmdGpu.devices + ComputeUnit::NvidiaGpu.devices
      end
    end

    # @return [Array] - returns an array of opencl devices
    # overwrites cache if new devices are found
    # OpenCL should only be used when necessary as it can freeze sometimes
    # OpenCL indexes items differently
    def self.opencl_devices
      @opencl_devices ||= opencl_devices_from_cache || begin
        items = opencl_devices_from_platform
        opencl_cache.write_cache('opencl_compute_units', ComputeUnit::Device.system_checksum.to_s => items)
        items
      end
    end
  end
end
@@ -0,0 +1,525 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'compute_unit/gpu'
4
+ require 'digest'
5
+ require 'json'
6
+
7
module ComputeUnit
  # AMD (amdgpu driver) flavour of Gpu. Readings come from the kernel
  # settings, hwmon files and the dri debug directory through helpers
  # (read_kernel_setting, read_hwmon_data, ...) that are defined outside
  # of this file.
  class AmdGpu < ComputeUnit::Gpu
    MAKE = 'AMD'
    VENDOR_ID = '1002'
    SUBTYPE = 'amdgpu'
    SYS_DEBUG_PATH = File.join(ComputeUnit::SYSFS_PATH, 'kernel', 'debug', 'dri')

    # @param device_path [String] - the pci bus path to the device
    # @param opts [Hash] - see ComputeUnit::Gpu#initialize
    def initialize(device_path, opts = {})
      super(device_path, opts)
      @pci_loc = File.basename(device_path)
      # reset the model captured by the parent constructor so that #model is
      # resolved lazily once OpenCL usage has been switched on
      @model = opts[:model] if opts[:use_opencl]

      @uuid = "GPU#{index}"
    end

    # @return [Hash] - always an empty hash for AMD devices
    def meta
      {}
    end

    # @return [String, nil] - the bios according to the vbios rom
    # sometimes the kernel / driver extracted rom can be incorrect
    # this is the bios gathered from the vbios itself.
    # Falls back to the fourth readable rom string (with a warning) when the
    # third one does not look like "NNN-...".
    def rom_bios
      primary = rom_metadata[2]
      return primary if /\d{3}-/.match?(primary)

      logger.warn("Invalid rom bios name for GPU#{index} using alternate name for #{rom_metadata[3]}")
      rom_metadata[3]
    end

    # @return [String] - the contents of the rom file, '' when no rom file exists
    def read_rom_data
      if File.exist?(debug_rom_path)
        IO.read(debug_rom_path, mode: 'rb')
      elsif File.exist?(rom_path)
        rom_data
      else
        ''
      end
    end

    # @return [String] - the path to the readonly rom file
    # NOTE(review): the path is built from `index`, whereas debug_dri_dir
    # locates the dri directory via pci_loc - confirm both always agree.
    def debug_rom_path
      # memoized under its own ivar; the previous @rom_path name could clobber
      # whatever the separate `rom_path` helper stores
      @debug_rom_path ||= File.join(SYS_DEBUG_PATH, index.to_s, 'amdgpu_vbios')
    end

    # @return [Array] - up to ten readable strings (10+ chars) from the rom file
    def rom_metadata
      # `||=` so the rom is read and scanned only once per instance
      @rom_metadata ||= begin
        printable_chars = %r{[A-Za-z0-9`~!@#%^&*()-_=+|'";:/?.>,< \t\$\{\}\[\]\\]{10,}}
        read_rom_data.scan(printable_chars)[0..9]
      end
    end

    # @return [String, nil] - returns the name of compute board, nil when
    #   opencl provides no board name
    # for vegas the opencl compute unit count is appended to the name
    def board_name
      return @board_name if @board_name

      raw_name = opencl_board_name
      return nil unless raw_name

      cleaned = raw_name.sub(/Series|\(TM\)/, '').sub('Graphics', '').sub(/\s{2}/, ' ').strip
      @board_name = /vega/i.match?(cleaned) ? "#{cleaned} #{opencl_units}" : cleaned
    end

    # @return [Array] - returns a list of device paths of all devices specific to the vendor id
    def self.devices
      ComputeUnit::Gpu.devices.select { |f| device_vendor(f) == VENDOR_ID }
    end

    # @return [String] - same as #model
    def name
      model
    end

    # @return [String] - the name of the device model (specific name)
    def model
      @model ||= board_name || sysfs_model_name
    end

    # @return [Integer] - same as #utilization
    def load
      utilization
    end

    # @return [Integer] - returns temp of gpu in celsius
    def temp
      read_hwmon_data('temp1_input', 0).to_i / 1000
    end

    # @return [Integer] - returns fan rpm speed, 0 if cannot be found
    def fan
      read_hwmon_data('fan1_input', 0).to_i
    end

    # @return [Numeric] - returns voltage of core in mV
    def core_voltage
      dpm_core_vddc.zero? ? vddgfx.to_i : dpm_core_vddc
    end

    # @return [Numeric] - the configured voltage of core in mV
    def configured_core_voltage
      vddc
    end

    # @return [Integer, nil] - the memory speed, nil when no active level is marked
    def memory_clock
      active_dpm_clock('pp_dpm_mclk')
    end

    # @return [Integer, nil] - the core clock speed, nil when no active level is marked
    def core_clock
      active_dpm_clock('pp_dpm_sclk')
    end

    # @return [Integer] - the core voltage reading of the GPU via HWMON
    def vddgfx
      read_hwmon_data('in0_input', 0).to_i
    end

    # currently configured gpu core voltage
    # @return [Numeric] - returns voltage of core in mV
    def vddc
      read_kernel_setting('pp_voltage', 0).to_i
    end

    # currently running gpu core voltage
    # @return [Integer]
    def dpm_core_vddc
      read_kernel_setting('pp_core_vddc', 0).to_i
    end

    # @return [String] - the SUBTYPE constant
    def subtype
      SUBTYPE
    end

    # @return [String] - raw content of the pp_od_clk_limits setting, '' by default
    def clock_limits
      read_kernel_setting('pp_od_clk_limits', '')
    end

    # @return [String] - raw content of the gpu_defaults setting, '' by default
    def gpu_defaults
      read_kernel_setting('gpu_defaults', '')
    end

    # @return [Array] - array of hashes of voltages
    #   {:pstate=>0, :clk=>300, :volt=>750, :type=>:sclk}, empty when the
    #   setting is unreadable or has no sclk section
    def voltage_table
      data = read_kernel_setting('pp_od_clk_voltage', nil)
      return [] if data.nil?

      _, sclk, = data.split(/OD_[S,M]CLK:\s?\n/)
      parse_od_clock_section(sclk, :sclk)
    end

    # @return [Array, nil] - array of hashes of voltages
    #   {:pstate=>0, :clk=>300, :volt=>750, :type=>:mclk}, nil when the
    #   setting is unreadable, empty when it has no mclk section
    # NOTE(review): everything after the OD_MCLK header is parsed, which looks
    # like it includes any trailing OD_RANGE lines - confirm this is intended.
    def vddci_voltage_table
      # not sure if this is what mclk is but left it here anyways
      data = read_kernel_setting('pp_od_clk_voltage', nil)
      return nil if data.nil?

      _, _, mclk = data.split(/OD_[S,M]CLK:\s?\n/)
      parse_od_clock_section(mclk, :mclk)
    end

    # @return [Integer] the average power being used by the gpu
    #   (power1_average divided by 1_000_000, integer division)
    def power_average
      # TODO: if a gpu crashes the average power can sometimes take 3000 ms to read!
      read_hwmon_data('power1_average', 0).to_i / 1_000_000
    end

    # @return [Numeric] the power being used by the gpu plus the power offset;
    #   pp_power_usage is preferred, power_average is the fallback
    def power
      pp_value = read_kernel_setting('pp_power_usage', 0).to_i
      value = pp_value.positive? ? pp_value : power_average
      value + power_offset
    end

    # @return [String] - the name of the bios which is unique for every card
    #   the rom value wins when it looks like "NNN-...", otherwise the
    #   upcased vbios_version setting is used
    def bios
      @bios ||= begin
        kernel_bios = read_kernel_setting('vbios_version', 'unreadable').upcase
        from_rom = rom_bios
        /\d{3}-/.match?(from_rom) ? from_rom : kernel_bios
      end
    end

    # @return [Integer] - always -1 for AMD devices
    def pstate
      -1
    end

    # @return [String] - the serial number of the card, always 'unknown'
    def serial
      'unknown'
    end

    # @param value [Numeric] - the power limit that should be applied to the gpu
    # @raise [ArgumentError] when value is above power_max_limit or below 10
    def power_limit=(value)
      max = power_max_limit
      raise ArgumentError, "Power Value #{value} cannot exceed #{max}" if value > max
      raise ArgumentError, "Value must be between 10 and #{max}" if value < 10

      # hwmon expects the value to have 6 zeros
      write_hwmon_data('power1_cap', value * 1_000_000)
      # logger.info("GPU#{index} power set to #{value} Watts")
    end

    # @param value [Numeric] - the fan limit that should be applied to the gpu as a percentage
    # @param type [String] - which limit to set, one of 'current', 'min' or 'max'
    # @return [Numeric] - original passed in value after being set
    # @raise [ArgumentError] when type is unknown or value is outside 20..100
    def set_fan_limit(value, type = 'current')
      # validate everything before any hwmon file is touched
      hwmon_file = case type
                   when 'min' then 'pwm1_min'
                   when 'max' then 'pwm1_max'
                   when 'current' then 'pwm1'
                   else
                     raise ArgumentError, "Invalid fan setting type, must be one of 'current, min or max'"
                   end
      raise ArgumentError, 'Fan limit cannot exceed 100' if value > 100
      raise ArgumentError, 'Fan limit value must be between 20 and 100' if value < 20

      write_hwmon_data('fan1_enable', '1')
      # Value must be between 0-255
      amount = (255 * (value / 100.0)).round
      logger.debug("Setting #{type} Fan on GPU#{index} to #{amount}")
      write_hwmon_data(hwmon_file, amount)
      logger.info("GPU#{index} #{type} fan set to #{value} percent")
      value
    end

    # @return [Numeric] - current fan limit as a percentage
    # @note the OS values is between 0 - 255
    def fan_limit
      pwm_percentage('pwm1')
    end

    # @return [Numeric] - max fan limit as a percentage
    # @note the OS values is between 0 - 255
    def fan_max_limit
      pwm_percentage('pwm1_max')
    end

    # @return [Numeric] - min fan limit as a percentage
    # @note the OS values is between 0 - 255
    def fan_min_limit
      pwm_percentage('pwm1_min')
    end

    # @return [Numeric] - current power limit
    def power_limit
      read_hwmon_data('power1_cap', 0).to_i / 1_000_000
    end

    # @return [Numeric] - the maximum power that can be set
    def power_max_limit
      read_hwmon_data('power1_cap_max').to_i / 1_000_000
    end

    # @return [Integer] - always 0, not read for AMD devices
    def memory_total
      0
    end

    # @return [Integer] - always 0, not read for AMD devices
    def memory_used
      0
    end

    # @return [Integer] - always 0, not read for AMD devices
    def memory_free
      0
    end

    # @return [Integer] - the :load value of amdgpu_pm_info, 0 when absent
    def utilization
      load_info = amdgpu_pm_info[:load]
      return 0 unless load_info

      load_info[:value].to_i
    end

    # @param device_path [String] - the pci bus path to the device
    # @param index [Integer] - the index of the device
    # @param use_opencl [Boolean] - whether opencl should be consulted
    # @return [AmdGpu]
    def self.create_from_path(device_path, index, use_opencl = false)
      opts = {
        device_class_id: device_class(device_path),
        device_id: device(device_path),
        device_vendor_id: device_vendor(device_path),
        subsystem_vendor_id: subsystem_vendor(device_path),
        subsystem_device_id: subsystem_device(device_path),
        use_opencl: use_opencl,
        index: index
      }
      new(device_path, opts)
    end

    # @return [Array] - returns an array of gpu instances of AMD type only
    #   each instance is indexed by its position in Gpu.found_devices
    def self.find_all(use_opencl = false)
      devices.map do |device_path|
        found_index = ComputeUnit::Gpu.found_devices.index(device_path)
        create_from_path(device_path, found_index, use_opencl)
      end
    end

    # @param file_name [String] - file inside the debug dri directory
    # @param default [String] - returned when the file cannot be read
    # @return [String] - the file content or the default
    def read_dri_debug_file(file_name, default = '')
      File.read(File.join(debug_dri_dir, file_name))
    rescue Errno::EINVAL, Errno::ENOENT
      default
    rescue Errno::EACCES
      logger.debug('run this command as root or with sudo, using default values')
      default
    end

    # @return [String] - returns the path the debug dri directory
    # ie. "/sys/kernel/debug/dri/0"
    # @raise [Errno::EACCES] when no dri entry mentioning pci_loc can be found
    def debug_dri_dir
      @debug_dri_dir ||= begin
        # if the user does not have permission the path will be nil
        path = Dir.glob(File.join(SYS_DEBUG_PATH, '*', 'name')).find { |file| File.read(file).include?(pci_loc) }
        raise Errno::EACCES, "Permission denied #{SYS_DEBUG_PATH}" unless path

        File.dirname(path)
      end
    end

    # @return [Hash] - pm info keyed by reading name
    # {:mclk=>{:value=>"1950", :unit=>"MHz"},
    #  :sclk=>{:value=>"1125", :unit=>"MHz"},
    #  :vddgfx=>{:value=>"950", :unit=>"mV"},
    #  :vddc=>{:value=>"61.49", :unit=>"W"},
    #  :vddci=>{:value=>"1.0", :unit=>"W"},
    #  :max_gpu=>{:value=>"81.243", :unit=>"W"},
    #  :average_gpu=>{:value=>"82.117", :unit=>"W"},
    #  :temperature=>{:value=>"41", :unit=>"C"},
    #  :load=>{:value=>"100", :unit=>"%"}}
    # @note the parsed result is memoized for the lifetime of the instance
    def amdgpu_pm_info
      @amdgpu_pm_info ||= begin
        content = read_dri_debug_file('amdgpu_pm_info')
        # "<value> <unit> (<name>)" readings
        readings = content.scan(/(\d+\.?\d*)\s+(\w*)\s\(([\w\s]*)\)?/)
        # "<name>: <value> <unit>" readings, rotated into value/unit/name order
        labelled = content.scan(/(\w*):\s(\d+)\s(.*)/).map(&:rotate)
        (readings + labelled).each_with_object({}) do |(value, unit, name), info|
          info[name.gsub(/\s/, '_').downcase.to_sym] = { value: value, unit: unit }
        end
      end
    end

    # @return [String, nil] - the current power_dpm_force_performance_level setting
    def dpm_force_performance
      read_kernel_setting('power_dpm_force_performance_level', nil)
    end

    # @param setting [String] - the dpm performance setting to adjust the dpm (manual or auto)
    # @return the result of write_kernel_setting
    # @raise [ArgumentError] unless setting is exactly manual or auto
    def dpm_force_performance_setting(setting = 'manual')
      # exact comparison: a substring match would let values such as
      # "automatic" through to the kernel setting
      raise ArgumentError, 'setting must be one of manual or auto' unless %w[manual auto].include?(setting.to_s)

      write_kernel_setting('power_dpm_force_performance_level', "#{setting}\n")
    end

    # Switches dpm back to auto, resets and commits the pp_od_clk_voltage
    # table and writes '2' to pwm1_enable.
    def reset_to_defaults
      dpm_force_performance_setting('auto')
      write_kernel_setting('pp_od_clk_voltage', 'r')
      write_kernel_setting('pp_od_clk_voltage', 'c')
      write_hwmon_data('pwm1_enable', '2')
    end

    # @return [Array]
    # reading from file "Sclk Limit: 2000 Mhz", "Mclk Limit: 2250 Mhz"
    # @example [2000, 2250]
    def clock_max_defaults
      read_kernel_setting('pp_od_clk_limits', '0 0').scan(/\d+/).map(&:to_i)
    end

    # @return [Integer]
    def max_core_clock
      clock_max_defaults.first
    end

    # @return [Integer]
    def min_core_clock
      voltage_table[0][:clk]
    end

    # @return [Integer]
    def max_mem_clock
      clock_max_defaults.last # or vddci_voltage_table.last[:clk]
    end

    # @return [Integer]
    def min_mem_clock
      vddci_voltage_table.first[:clk]
    end

    # @return [Integer]
    def max_mem_volt
      vddci_voltage_table.last[:volt]
    end

    # @return [Integer]
    def min_mem_volt
      vddci_voltage_table.first[:volt]
    end

    # @return [Integer] - the temperature of the asic chip
    def asic_temp
      read_hwmon_data('temp2_input', 0).to_i / 1000
    end

    # @return [Integer] - the temperature of the memory chips
    def mem_temp
      read_hwmon_data('temp3_input', 0).to_i / 1000
    end

    # Applies a memory clock / voltage pair to row 3 of the pp_od_clk_voltage
    # table. Does nothing unless experimental_on? is true.
    #
    # @param mem_clock [#to_i] - the memory clock to apply
    # @param mem_volt [#to_i] - the memory voltage to apply
    # @raise [ArgumentError] when either value is outside the min/max range
    # @note dpm is switched to manual before the values are validated
    def set_mem_clock_and_vddc(mem_clock, mem_volt)
      return unless experimental_on?

      mem_clock = mem_clock.to_i
      mem_volt = mem_volt.to_i
      # TODO: find max and min values and limit input
      dpm_force_performance_setting('manual')
      unless mem_clock.between?(min_mem_clock, max_mem_clock)
        raise ArgumentError, "MemClock value #{mem_clock} must be between #{min_mem_clock}-#{max_mem_clock}"
      end
      unless mem_volt.between?(min_mem_volt, max_mem_volt)
        raise ArgumentError, "MemVolt value #{mem_volt} must be between #{min_mem_volt}-#{max_mem_volt}"
      end

      write_kernel_setting('pp_od_clk_voltage', "r\n") # unlocks in order to write
      # set row in table (m = manual), 3 = row,
      write_kernel_setting('pp_od_clk_voltage', "m 3 #{mem_clock} #{mem_volt}\n")
      write_kernel_setting('pp_od_clk_voltage', "c\n") # locks file
      write_kernel_setting('pp_mclk_od', "3\n")
      logger.info("Successfully applied overclock #{mem_clock} #{mem_volt} to #{name} at #{pci_loc}")
    end

    private

    # Reads a pp_dpm_* setting and extracts the clock of the line marked with '*'.
    # @return [Integer, nil] nil when no line is marked
    def active_dpm_clock(setting)
      active = read_kernel_setting(setting, '').split("\n").find { |line| line.include?('*') }
      return nil if active.nil?

      active[/\d{2,6}/].to_i
    end

    # Parses one section of pp_od_clk_voltage into row hashes.
    # A missing (nil) section yields an empty array instead of raising.
    def parse_od_clock_section(section, type)
      section.to_s.split("\n").map do |line|
        pstate, clk, volt, = line.gsub(/:|Mhz|mV/, '').split(/\s{2,}/).map(&:to_i)
        { pstate: pstate, clk: clk, volt: volt, type: type }
      end
    end

    # Converts a 0-255 hwmon pwm reading into a rounded percentage;
    # non-positive readings are returned unchanged.
    def pwm_percentage(hwmon_file)
      raw = read_hwmon_data(hwmon_file, 0).to_i
      return raw unless raw.positive?

      ((raw / 255.0) * 100).round
    end
  end
end
453
+
454
+ # See https://www.kernel.org/doc/html/latest/gpu/amdgpu.html for sysfs kernel
455
+ # /sys/kernel/debug/dri/2/name
456
+ # amdgpu dev=0000:08:00.0 unique=0000:08:00.0
457
+
458
+ # Add percent overclock to core speed
459
+ # sudo echo "7" > /sys/class/drm/card0/device/pp_sclk_od
460
+
461
+ # Add percent overclock to mem speed
462
+ # sudo echo "4" > /sys/class/drm/card0/device/pp_mclk_od
463
+
464
+ # The way the current AMDGPU overclocking works for the core frequency
465
+ # is by writing an integer value between 0 and 20 to
466
+ # /sys/class/drm/card0/device/pp_sclk_od. That value represents an
467
+ # overclock of 0~20% above the GPU's core frequency. Similarly,
468
+ # writing a value to /sys/class/drm/card0/device/pp_mclk_od represents
469
+ # a percentage-based overclock to the memory frequency.
470
+ #
471
+ #
472
+ # You can change the frequencies and voltage by modifying
473
+ # the file /sys/class/drm/card0/device/pp_od_clk_voltage
474
+ #
475
+ # first: This holds the presets for pp_dpm_sclk and pp_dpm_mclk.
476
+ #
477
+ # Second check the current settings:
478
+ #
479
+ # sudo cat /sys/class/drm/card0/device/pp_od_clk_voltage
480
+ # You should see something similar to this:
481
+ #
482
+ # OD_SCLK:
483
+ # 0: 300MHz 750mV
484
+ # 1: 588MHz 765mV
485
+ # 2: 980MHz 987mV
486
+ # 3: 1100MHz 950mV
487
+ # 4: 1100MHz 950mV
488
+ # 5: 1100MHz 950mV
489
+ # 6: 1100MHz 950mV
490
+ # 7: 1100MHz 950mV
491
+ # OD_MCLK:
492
+ # 0: 300MHz 750mV
493
+ # 1: 1000MHz 800mV
494
+ # 2: 1970MHz 950mV
495
+ # OD_RANGE:
496
+ # SCLK: 300MHz 2000MHz
497
+ # MCLK: 300MHz 2250MHz
498
+ # VDDC: 750mV 1150mV
499
+ # Example to set 1280Mhz at 950mV (check your output above for possible ranges!):
500
+ #
501
+ # sudo echo "s 7 1280 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
502
+ # Then to apply the changes:
503
+ #
504
+ # sudo echo 0 > /sys/class/drm/card0/device/pp_sclk_od
505
+ # sudo echo 1 > /sys/class/drm/card0/device/pp_sclk_od
506
+ # These are all the settings I use on my crypto mining card which is an AMD Radeon RX570 in case it's useful to anyone.
507
+ #
508
+ # echo 1 > /sys/class/drm/card0/device/hwmon/hwmon0/pwm1_enable
509
+ # echo manual > /sys/class/drm/card0/device/power_dpm_force_performance_level
510
+ # echo 200 > /sys/class/drm/card0/device/hwmon/hwmon0/pwm1 # Fan speed
511
+ # echo 4 > /sys/class/drm/card0/device/pp_power_profile_mode # Compute Mode
512
+ #
513
+ # echo "s 3 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
514
+ # echo "s 4 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
515
+ # echo "s 5 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
516
+ # echo "s 6 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
517
+ # echo "s 7 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
518
+ #
519
+ # echo "m 2 1985 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
520
+ #
521
+ # echo 0 > /sys/class/drm/card0/device/pp_sclk_od
522
+ # echo 1 > /sys/class/drm/card0/device/pp_sclk_od
523
+ #
524
+ # echo 0 > /sys/class/drm/card0/device/pp_mclk_od
525
+ # echo 1 > /sys/class/drm/card0/device/pp_mclk_od