compute_unit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ComputeUnit
4
+ module Exceptions
5
+ class NoPermission < RuntimeError; end
6
+ class PermissionDenied < RuntimeError; end
7
+ class UnsupportedGPU < RuntimeError; end
8
+ class NotSupported < RuntimeError; end
9
+ class UnsupportedOSversion < RuntimeError; end
10
+ class NoWorkerName < RuntimeError; end
11
+ class NoComputeUnits < RuntimeError; end
12
+ class InvalidPCIDatabase < RuntimeError; end
13
+ end
14
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ComputeUnit
4
+ module Formatters
5
+ def micro_formatter(item, add_unit = false)
6
+ data = {}
7
+ item.each do |key, value|
8
+ if %i[hourly_cost hourly_earnings kwh_cost].include?(key)
9
+ v = (value * 1000000).round(4)
10
+ data[key] = add_unit ? "#{v} \u00B5BTC" : v
11
+ end
12
+ end
13
+ item.merge(data)
14
+ end
15
+
16
+ def value_micro_formatter(value, add_unit = false)
17
+ v = (value * 1000000).round(1)
18
+ add_unit ? "#{v} \u00B5BTC" : v
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,338 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'compute_unit/compute_base'
4
+ require 'compute_unit/cache_store'
5
+ module ComputeUnit
6
+ class Gpu < ComputeBase
7
+ attr_reader :pci_loc, :bios, :name
8
+ DEVICE_CLASS = '030000'
9
+ DEVICE_CLASS_NAME = 'GPU'
10
+ attr_accessor :power_limit, :use_opencl
11
+
12
+ def compute_type
13
+ type
14
+ end
15
+
16
+ # @return [OpenCL_Device]
17
+ def opencl_device
18
+ @opencl_device ||= self.class.opencl_devices.find_all { |cu| cu[:type] == make }[index] if use_opencl
19
+ end
20
+
21
+ # @return [String] - returns the raw data of the board name from opencl, return nil if no device
22
+ def opencl_board_name
23
+ @opencl_board_name ||= opencl_device&.board_name if use_opencl
24
+ end
25
+
26
+ # @return [Integer] - returns the number of compute units decteded by opencl
27
+ # not to be confused with stream processors. Can be helpful when determining which product vega56 or vega64
28
+ def opencl_units
29
+ @opencl_units ||= opencl_device.max_compute_units.to_i if use_opencl
30
+ end
31
+
32
+ # @return [String] - the device name
33
+ # ie. GeForce GTX 1070 or RX 580
34
+ # @note not really needed for Nvidia types since nvidia-smi returns really complete information
35
+ def opencl_name
36
+ @opencl_name ||= opencl_device.name if use_opencl
37
+ end
38
+
39
+ # @return [Array] - returns a list of device paths of all devices considered for display
40
+ # @note the devices are sorted by the device path
41
+ # @note this can mean AMD, NVIDIA, Intel or other crappy embedded devices
42
+ def self.devices
43
+ @devices ||= ComputeUnit::ComputeBase.devices.find_all do |device|
44
+ ComputeUnit::Device.device_class(device) == DEVICE_CLASS
45
+ end.sort
46
+ end
47
+
48
+ # @param device_path [String] - that pci bus path to the device
49
+ # @param opts [Hash]
50
+ # @option bios [String] the bios id
51
+ # @option model [String] the model name
52
+ # @option serial [String] the serial id of the device
53
+ # @option busid [String] the pci bus path of the device
54
+ # @option meta [Hash] metadata about the device
55
+ # @option index [Integer] the index of the device found in the device tree
56
+ # @option uuid [String] the uuid of the device
57
+ # @option use_opencl [Boolean] set to true if you want to get info about the device from opencl, defaults to false
58
+ def initialize(device_path, opts = {})
59
+ super(device_path, opts)
60
+ @type = :GPU
61
+ @bios = opts[:bios].upcase if opts[:bios]
62
+ @model = opts[:model]
63
+ @serial = opts[:serial]
64
+ @pci_loc = opts[:busid]
65
+ @meta = opts[:meta]
66
+ @index = opts[:index].to_i
67
+ @uuid = opts[:uuid] || opts[:serial]
68
+ @name = model
69
+ @power_offset = 0
70
+ @use_opencl = opts[:use_opencl] || false
71
+ end
72
+
73
+ def fan
74
+ raise NotImplementedError
75
+ end
76
+
77
+ def status
78
+ return 0 if utilization > 20 && power >= 50
79
+ return 2 if power < 20
80
+
81
+ 1
82
+ end
83
+
84
+ def power
85
+ raise NotImplementedError
86
+ end
87
+
88
+ def pstate
89
+ raise NotImplementedError
90
+ end
91
+
92
+ # @return [Integer] - a percentage value of the current fan limit
93
+ def fan_limit
94
+ fan
95
+ end
96
+
97
+ # @return [Integer] - a percentage value of the min fan limit
98
+ def fan_min_limit
99
+ nil
100
+ end
101
+
102
+ # @return [Integer] - a percentage value of the max fan limit
103
+ def fan_max_limit
104
+ nil
105
+ end
106
+
107
+ def power_limit
108
+ raise NotImplementedError
109
+ end
110
+
111
+ def power_max_limit
112
+ raise NotImplementedError
113
+ end
114
+
115
+ def memory_total
116
+ raise NotImplementedError
117
+ end
118
+
119
+ def memory_used
120
+ raise NotImplementedError
121
+ end
122
+
123
+ def memory_free
124
+ raise NotImplementedError
125
+ end
126
+
127
+ def utilization
128
+ raise NotImplementedError
129
+ end
130
+
131
+ # @return [Integer] - the memory speed
132
+ def memory_clock
133
+ 0
134
+ end
135
+
136
+ # @return [Integer] - the memory speed
137
+ def memory_volt
138
+ 0
139
+ end
140
+
141
+ # @return [Integer] - the core clock speed
142
+ def core_clock
143
+ 0
144
+ end
145
+
146
+ # @return [Numeric] - returns voltage of core in mV
147
+ def core_voltage
148
+ 0
149
+ end
150
+
151
+ # @return [Numeric] - returns voltage of core in mV
152
+ def configured_core_voltage
153
+ 0
154
+ end
155
+
156
+ def mem_info
157
+ {
158
+ index: "#{device_class_name}#{index}",
159
+ name: name,
160
+ volt: memory_volt,
161
+ clock: memory_clock,
162
+ memory_name: nil,
163
+ memory_type: nil,
164
+ memory_used: memory_used,
165
+ memory_free: memory_free,
166
+ memory_total: memory_total,
167
+ mem_temp: mem_temp
168
+ }
169
+ end
170
+
171
+ # @return [Hash] - hash of hardware status about the gpu
172
+ def status_info
173
+ {
174
+ index: "#{device_class_name}#{index}",
175
+ name: name,
176
+ bios: bios,
177
+ core_clock: core_clock,
178
+ memory_clock: memory_clock,
179
+ power: power,
180
+ fan: fan,
181
+ core_volt: core_voltage,
182
+ temp: temp,
183
+ mem_temp: mem_temp,
184
+ status: status
185
+ }
186
+ end
187
+
188
+ # @return [Hash] - hash of information about the gpu data
189
+ def hardware_info
190
+ {
191
+ uuid: uuid,
192
+ gpuId: "GPU#{index}",
193
+ syspath: device_path,
194
+ pciLoc: pci_loc,
195
+ name: name,
196
+ bios: bios,
197
+ subType: subtype,
198
+ make: make,
199
+ model: model,
200
+ vendor: vendor
201
+ }
202
+ end
203
+
204
+ # @return [Integer] - the temperature of the asic chip
205
+ def asic_temp
206
+ 0
207
+ end
208
+
209
+ # @return [Integer] - temperature of the memory
210
+ def mem_temp
211
+ 0
212
+ end
213
+
214
+ # @return [Integer] - the voltage reading of the card, maybe just amd cards (mV)
215
+ def vddgfx
216
+ 0
217
+ end
218
+
219
+ def temp
220
+ 0
221
+ end
222
+
223
+ def to_h
224
+ {
225
+ uuid: uuid,
226
+ gpuId: "GPU#{index}",
227
+ syspath: device_path,
228
+ pciLoc: pci_loc,
229
+ name: name,
230
+ bios: bios,
231
+ subType: subtype,
232
+ make: make,
233
+ model: model,
234
+ vendor: vendor,
235
+ # memory_name: nil,
236
+ # memory_type: nil,
237
+ # gpu_platform: nil,
238
+ power: power,
239
+ # power_limit: power_limit,
240
+ # power_max_limit: power_max_limit,
241
+ utilization: utilization,
242
+ # memory_used: memory_used ,
243
+ # memory_free: memory_free,
244
+ # memory_total: memory_total,
245
+ temperature: temp,
246
+ status: status,
247
+ pstate: pstate,
248
+ fanSpeed: fan,
249
+ type: compute_type,
250
+ maxTemp: nil,
251
+ mem: memory_clock,
252
+ cor: core_clock,
253
+ vlt: core_voltage,
254
+ mem_temp: mem_temp,
255
+ maxFan: nil,
256
+ dpm: nil,
257
+ vddci: nil,
258
+ maxPower: nil,
259
+ ocProfile: nil,
260
+ opencl_enabled: use_opencl
261
+ }
262
+ end
263
+
264
+ # @return [Array] - returns an array of gpu objects, sorted by index
265
+ def self.find_all(use_opencl = false)
266
+ require 'compute_unit/gpus/amd_gpu'
267
+ require 'compute_unit/gpus/nvidia_gpu'
268
+ g = ComputeUnit::AmdGpu.find_all(use_opencl) + ComputeUnit::NvidiaGpu.find_all(use_opencl)
269
+ g.sort_by(&:index)
270
+ end
271
+
272
+ # @return [CacheStore] - returns an instance of the cachestore for storign opencl cache
273
+ def self.opencl_cache
274
+ @opencl_cache ||= ComputeUnit::CacheStore.new('opencl_cache')
275
+ end
276
+
277
+ # @return [Array] - array of openstruct or nil
278
+ def self.opencl_devices_from_cache
279
+ data = opencl_cache.read_cache('opencl_compute_units', {})
280
+ data[ComputeUnit::Device.system_checksum]
281
+ end
282
+
283
+ # @returns [Array] - an array of openstruct objects
284
+ def self.opencl_devices_from_platform
285
+ require 'ostruct'
286
+ # opencl takes a second to load so we cache later in the process
287
+ # which is why we need the openstruct object here
288
+ # opencl can also freeze the system if it tries to enumerate a dead GPU
289
+ # opencl sould be used sparingly as a result and only read when absolutely
290
+ # neccessary and no dead GPUs.
291
+ # TODO: warn when dead gpus detected
292
+ begin
293
+ require 'opencl_ruby_ffi'
294
+ ComputeUnit::Logger.logger.debug('Searching for openCL devices')
295
+ OpenCL.platforms.map(&:devices).flatten.map do |d|
296
+ type = d.platform.name.include?('AMD') ? 'AMD' : 'Nvidia'
297
+ board_name = type == 'AMD' ? d.board_name_amd : ''
298
+ max_computes = d.respond_to?(:max_compute_units) ? d.max_compute_units : 0
299
+ OpenStruct.new(
300
+ name: d.name,
301
+ type: type,
302
+ board_name: board_name,
303
+ max_compute_units: max_computes
304
+ )
305
+ end
306
+ rescue OpenCL::Error::DEVICE_NOT_FOUND => e
307
+ ComputeUnit::Logger.logger.debug("OpenCL error: #{e.message}, are you root?")
308
+ []
309
+ rescue RuntimeError => e # OpenCL::Error::PLATFORM_NOT_FOUND_KHR,
310
+ ComputeUnit::Logger.logger.debug("OpenCL error: #{e.message}")
311
+ ComputeUnit::Logger.logger.debug("OpenCL error: #{e.backtrace}")
312
+ []
313
+ end
314
+ end
315
+
316
+ # @return [Hash] - a hash of voltages per the voltage table, nil if no table available
317
+ def voltage_table
318
+ []
319
+ end
320
+
321
+ # @return [Array] - array of devices paths either from amd or nvidia
322
+ def self.found_devices
323
+ @found_devices ||= ComputeUnit::AmdGpu.devices + ComputeUnit::NvidiaGpu.devices
324
+ end
325
+
326
+ # @return [Array] - returns an array of opencl devices
327
+ # overwrites cache if new devices are found
328
+ # OpenCL should only be used when necessary as it can freeze sometimes
329
+ # OpenCL indexes items differently
330
+ def self.opencl_devices
331
+ @opencl_devices ||= opencl_devices_from_cache || begin
332
+ items = opencl_devices_from_platform
333
+ opencl_cache.write_cache('opencl_compute_units', ComputeUnit::Device.system_checksum.to_s => items)
334
+ items
335
+ end
336
+ end
337
+ end
338
+ end
@@ -0,0 +1,525 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'compute_unit/gpu'
4
+ require 'digest'
5
+ require 'json'
6
+
7
+ module ComputeUnit
8
+ class AmdGpu < ComputeUnit::Gpu
9
+ MAKE = 'AMD'
10
+ VENDOR_ID = '1002'
11
+ SUBTYPE = 'amdgpu'
12
+ SYS_DEBUG_PATH = File.join(ComputeUnit::SYSFS_PATH, 'kernel', 'debug', 'dri')
13
+
14
+ def initialize(device_path, opts = {})
15
+ super(device_path, opts)
16
+ @pci_loc = File.basename(device_path)
17
+ @model = opts[:model] if opts[:use_opencl]
18
+
19
+ @uuid = "GPU#{index}"
20
+ end
21
+
22
+ def meta
23
+ {}
24
+ end
25
+
26
+ # @return [String] - the bios according to the vbios rom
27
+ # sometimes the kernel / driver extracted rom can be incorrect
28
+ # this is the bios gathered from the vbios itself.
29
+ def rom_bios
30
+ if !/\d{3}-/.match?(rom_metadata[2])
31
+ logger.warn("Invalid rom bios name for GPU#{index} using alternate name for #{rom_metadata[3]}")
32
+ rom_metadata[3]
33
+ elsif /\d{3}-/.match?(rom_metadata[2])
34
+ rom_metadata[2]
35
+ end
36
+ end
37
+
38
+ # @return [String::IO] - the contents of the rom file
39
+ def read_rom_data
40
+ if File.exist?(debug_rom_path)
41
+ IO.read(debug_rom_path, mode: 'rb')
42
+ elsif File.exist?(rom_path)
43
+ rom_data
44
+ else
45
+ ''
46
+ end
47
+ end
48
+
49
+ # @return [String] - the path to the readonly rom file
50
+ def debug_rom_path
51
+ @rom_path ||= File.join(SYS_DEBUG_PATH, index.to_s, 'amdgpu_vbios')
52
+ end
53
+
54
+ # @return [Array] - an array of readable strings from the rom file
55
+ def rom_metadata
56
+ @rom_metadata || begin
57
+ printable_chars = %r{[A-Za-z0-9`~!@#%^&*()-_=+|'";:/?.>,< \t\$\{\}\[\]\\]{10,}}
58
+ read_rom_data.scan(printable_chars)[0..9]
59
+ end
60
+ end
61
+
62
+ # @return [String] - returns the name of compute board
63
+ # for vegas we have to also get the compute units
64
+ def board_name
65
+ @board_name ||= begin
66
+ return nil unless opencl_board_name
67
+
68
+ name = opencl_board_name.sub(/Series|\(TM\)/, '').sub('Graphics', '').sub(/\s{2}/, ' ').strip
69
+ /vega/i.match?(name) ? "#{name} #{opencl_units}" : name
70
+ end
71
+ end
72
+
73
+ # @return [Array] - returns a list of device paths of all devices specific to the vendor id
74
+ def self.devices
75
+ ComputeUnit::Gpu.devices.find_all { |f| device_vendor(f) == VENDOR_ID }
76
+ end
77
+
78
+ def name
79
+ model
80
+ end
81
+
82
+ # @return [String] - the name of the device model (specific name)
83
+ def model
84
+ @model ||= begin
85
+ board_name || sysfs_model_name
86
+ end
87
+ end
88
+
89
+ def load
90
+ utilization
91
+ end
92
+
93
+ # @return [Integer] - returns temp of gpu in celius
94
+ def temp
95
+ read_hwmon_data('temp1_input', 0).to_i / 1000
96
+ end
97
+
98
+ # @return [Integer] - returns fan rpm speed, 0 if cannot be found
99
+ def fan
100
+ read_hwmon_data('fan1_input', 0).to_i
101
+ end
102
+
103
+ # @return [Numeric] - returns voltage of core in mV
104
+ def core_voltage
105
+ dpm_core_vddc.zero? ? vddgfx.to_i : dpm_core_vddc
106
+ end
107
+
108
+ def configured_core_voltage
109
+ vddc
110
+ end
111
+
112
+ # @return [Integer] - the memory speed
113
+ def memory_clock
114
+ data = read_kernel_setting('pp_dpm_mclk', '').split("\n")
115
+ item = data.find { |d| d.include?('*') }
116
+ item.nil? ? item : item.match(/\d{2,6}/).to_a.first.to_i
117
+ end
118
+
119
+ # @return [Integer] - the core clock speed
120
+ def core_clock
121
+ data = read_kernel_setting('pp_dpm_sclk', '').split("\n")
122
+ item = data.find { |d| d.include?('*') }
123
+ item.nil? ? item : item.match(/\d{2,6}/).to_a.first.to_i
124
+ end
125
+
126
+ # @return [Integer] - the core voltage reading of the GPU via HWMON
127
+ def vddgfx
128
+ read_hwmon_data('in0_input', 0).to_i
129
+ end
130
+
131
+ # currently configured gpu core voltage
132
+ # @return [Numeric] - returns voltage of core in mV
133
+ def vddc
134
+ read_kernel_setting('pp_voltage', 0).to_i
135
+ end
136
+
137
+ # currently running gpu core voltage
138
+ def dpm_core_vddc
139
+ read_kernel_setting('pp_core_vddc', 0).to_i
140
+ end
141
+
142
+ def subtype
143
+ SUBTYPE
144
+ end
145
+
146
+ def clock_limits
147
+ read_kernel_setting('pp_od_clk_limits', '')
148
+ end
149
+
150
+ def gpu_defaults
151
+ read_kernel_setting('gpu_defaults', '')
152
+ end
153
+
154
+ # @return [Array] - array of hashes of voltages {:pstate=>0, :sclk=>300, :volt=>750}
155
+ def voltage_table
156
+ data = read_kernel_setting('pp_od_clk_voltage', nil)
157
+ return [] if data.nil?
158
+
159
+ _, sclk, = data.split(/OD_[S,M]CLK:\s?\n/)
160
+ sclk.split("\n").map do |line|
161
+ pstate, clk, volt, = line.gsub(/:|Mhz|mV/, '').split(/\s{2,}/).map(&:to_i)
162
+ { pstate: pstate, clk: clk, volt: volt, type: :sclk }
163
+ end
164
+ end
165
+
166
+ # @return [Array] - array of hashes of voltages {:pstate=>0, :mclk=>300, :volt=>750}
167
+ def vddci_voltage_table
168
+ # not sure if this is what mclk is but left it here anyways
169
+ data = read_kernel_setting('pp_od_clk_voltage', nil)
170
+ return data if data.nil?
171
+
172
+ _, _, mclk = data.split(/OD_[S,M]CLK:\s?\n/)
173
+ mclk.split("\n").map do |line|
174
+ pstate, clk, volt, = line.gsub(/:|Mhz|mV/, '').split(/\s{2,}/).map(&:to_i)
175
+ { pstate: pstate, clk: clk, volt: volt, type: :mclk }
176
+ end
177
+ end
178
+
179
+ # @return [Float] the average power being used by the gpu
180
+ def power_average
181
+ # TODO: if a gpu crashes the average power can sometimes take 3000 ms to read!
182
+ read_hwmon_data('power1_average', 0).to_i / 1000000
183
+ end
184
+
185
+ # @return [Float] the power being used by the gpu
186
+ def power
187
+ pp_value = read_kernel_setting('pp_power_usage', 0).to_i
188
+ value = pp_value > 0 ? pp_value : power_average
189
+ value + power_offset
190
+ end
191
+
192
+ # @return [String] - the name of the bios which is unique for every card
193
+ def bios
194
+ @bios ||= begin
195
+ a = read_kernel_setting('vbios_version', 'unreadable').upcase
196
+ b = rom_bios
197
+ /\d{3}-/.match?(b) ? b : a
198
+ end
199
+ end
200
+
201
+ def pstate
202
+ -1
203
+ end
204
+
205
+ # @return [String] - the serial number of the card
206
+ def serial
207
+ 'unknown'
208
+ end
209
+
210
+ # @param value [Numeric] - the power limit that should be applied to the gpu
211
+ # @return [Numeric] - original passed in value after being set
212
+ def power_limit=(value)
213
+ max = power_max_limit
214
+ raise ArgumentError.new("Power Value #{value} cannot exceed #{max}") if value > max
215
+ raise ArgumentError.new("Value must be between 10 and #{max}") if value < 10
216
+
217
+ # hwmon expects the value to have 6 zeros
218
+ write_hwmon_data('power1_cap', value * 1000000)
219
+ # logger.info("GPU#{index} power set to #{value} Watts")
220
+ end
221
+
222
+ # @param value [Numeric] - the fan limit that should be applied to the gpu as a percentage
223
+ # @return [Numeric] - original passed in value after being set
224
+ def set_fan_limit(value, type = 'current')
225
+ write_hwmon_data('fan1_enable', '1')
226
+ hwmon_file = if type == 'min'
227
+ 'pwm1_min'
228
+ elsif type == 'max'
229
+ 'pwm1_max'
230
+ elsif type == 'current'
231
+ 'pwm1'
232
+ else
233
+ raise ArgumentError.new("Invalid fan setting type, must be one of 'current, min or max'")
234
+ end
235
+ raise ArgumentError.new('Fan limit cannot exceed 100') if value > 100
236
+ raise ArgumentError.new('Fan limit value must be between 20 and 100') if value < 20
237
+
238
+ # Value must be between 0-255
239
+ amount = (255 * (value / 100.0)).round
240
+ logger.debug("Setting #{type} Fan on GPU#{index} to #{amount}")
241
+ write_hwmon_data(hwmon_file, amount)
242
+ logger.info("GPU#{index} #{type} fan set to #{value} percent")
243
+ value
244
+ end
245
+
246
+ # @return [Numeric] - current fan limit as a percentage
247
+ # @note the OS values is between 0 - 255
248
+ def fan_limit
249
+ cur = read_hwmon_data('pwm1', 0).to_i
250
+ return cur unless cur > 0
251
+
252
+ ((cur / 255.0) * 100).round(0)
253
+ end
254
+
255
+ # @return [Numeric] - current fan limit as a percentage
256
+ # @note the OS values is between 0 - 255
257
+ def fan_max_limit
258
+ cur = read_hwmon_data('pwm1_max', 0).to_i
259
+ return cur unless cur > 0
260
+
261
+ ((cur / 255.0) * 100).round(0)
262
+ end
263
+
264
+ # @return [Numeric] - current fan limit as a percentage
265
+ # @note the OS values is between 0 - 255
266
+ def fan_min_limit
267
+ cur = read_hwmon_data('pwm1_min', 0).to_i
268
+ return cur unless cur > 0
269
+
270
+ ((cur / 255.0) * 100).round(0)
271
+ end
272
+
273
+ # @return [Numeric] - current power limit
274
+ def power_limit
275
+ read_hwmon_data('power1_cap', 0).to_i / 1000000
276
+ end
277
+
278
+ # @return [Numeric] - the maximum power that can be set
279
+ def power_max_limit
280
+ read_hwmon_data('power1_cap_max').to_i / 1000000
281
+ end
282
+
283
+ def memory_total
284
+ 0
285
+ end
286
+
287
+ def memory_used
288
+ 0
289
+ end
290
+
291
+ def memory_free
292
+ 0
293
+ end
294
+
295
+ def utilization
296
+ return 0 unless amdgpu_pm_info[:load]
297
+
298
+ amdgpu_pm_info[:load][:value].to_i || 0
299
+ end
300
+
301
+ def self.create_from_path(device_path, index, use_opencl = false)
302
+ opts = {
303
+ device_class_id: device_class(device_path),
304
+ device_id: device(device_path),
305
+ device_vendor_id: device_vendor(device_path),
306
+ subsystem_vendor_id: subsystem_vendor(device_path),
307
+ subsystem_device_id: subsystem_device(device_path),
308
+ use_opencl: use_opencl,
309
+ index: index
310
+ }
311
+ new(device_path, opts)
312
+ end
313
+
314
+ # @return [Array] - returns and array of gpu instances of AMD type only
315
+ def self.find_all(use_opencl = false)
316
+ devices.map.with_index do |device_path, _index|
317
+ found_index = ComputeUnit::Gpu.found_devices.index(device_path)
318
+ create_from_path(device_path, found_index, use_opencl)
319
+ end
320
+ end
321
+
322
+ def read_dri_debug_file(file_name, default = '')
323
+ File.read(File.join(debug_dri_dir, file_name))
324
+ rescue Errno::EINVAL
325
+ default
326
+ rescue Errno::ENOENT
327
+ default
328
+ rescue Errno::EACCES
329
+ logger.debug('run this command as root or with sudo, using default values')
330
+ default
331
+ end
332
+
333
+ # @return [String] - returns the path the debug dri directory
334
+ # ie. "/sys/kernel/debug/dri/0"
335
+ def debug_dri_dir
336
+ @debug_dri_dir ||= begin
337
+ # if the user does not have permission the path will be nil
338
+ path = Dir.glob(File.join(SYS_DEBUG_PATH, '*', 'name')).find { |file| File.read(file).include?(pci_loc) }
339
+ raise Errno::EACCES.new("Permission denied #{SYS_DEBUG_PATH}") unless path
340
+
341
+ File.dirname(path)
342
+ end
343
+ end
344
+
345
+ # @returns [Array] - list of pm info
346
+ # {:mclk=>{:value=>"1950", :unit=>"MHz"},
347
+ # :sclk=>{:value=>"1125", :unit=>"MHz"},
348
+ # :vddgfx=>{:value=>"950", :unit=>"mV"},
349
+ # :vddc=>{:value=>"61.49", :unit=>"W"},
350
+ # :vddci=>{:value=>"1.0", :unit=>"W"},
351
+ # :max_gpu=>{:value=>"81.243", :unit=>"W"},
352
+ # :average_gpu=>{:value=>"82.117", :unit=>"W"},
353
+ # :temperature=>{:value=>"41", :unit=>"C"},
354
+ # :load=>{:value=>"100", :unit=>"%"}}
355
+ def amdgpu_pm_info
356
+ @amdgpu_pm_info ||= begin
357
+ content = read_dri_debug_file('amdgpu_pm_info')
358
+ data = content.scan(/(\d+\.?\d*)\s+(\w*)\s\(([\w\s]*)\)?/) + content.scan(/(\w*):\s(\d+)\s(.*)/).map(&:rotate)
359
+ data_hash = {}
360
+ data.each do |value, unit, name|
361
+ data_hash[name.gsub(/\s/, '_').downcase.to_sym] = { value: value, unit: unit }
362
+ end
363
+ data_hash
364
+ end
365
+ end
366
+
367
+ # @return [String] - reads the setting after writing the setting and returns current value
368
+ def dpm_force_performance
369
+ read_kernel_setting('power_dpm_force_performance_level', nil)
370
+ end
371
+
372
+ # @param setting [String] - the dpm performance setting to adjust the dpm (manual or auto)
373
+ # @return [String] - reads the setting after writing the setting and returns current value
374
+ def dpm_force_performance_setting(setting = 'manual')
375
+ raise ArgumentError.new('setting must be one of manual or auto') unless setting =~ /manual|auto/
376
+
377
+ write_kernel_setting('power_dpm_force_performance_level', "#{setting}\n")
378
+ end
379
+
380
+ def reset_to_defaults
381
+ dpm_force_performance_setting('auto')
382
+ write_kernel_setting('pp_od_clk_voltage', 'r')
383
+ write_kernel_setting('pp_od_clk_voltage', 'c')
384
+ write_hwmon_data('pwm1_enable', '2')
385
+ end
386
+
387
+ # @return [Array]
388
+ # reading from file "Sclk Limit: 2000 Mhz", "Mclk Limit: 2250 Mhz"
389
+ # @example [2000, 2250]
390
+ def clock_max_defaults
391
+ read_kernel_setting('pp_od_clk_limits', '0 0').scan(/\d+/).map(&:to_i)
392
+ end
393
+
394
+ # @return [Integer]
395
+ def max_core_clock
396
+ clock_max_defaults.first
397
+ end
398
+
399
+ # @return [Integer]
400
+ def min_core_clock
401
+ voltage_table[0][:clk]
402
+ end
403
+
404
+ # @return [Integer]
405
+ def max_mem_clock
406
+ clock_max_defaults.last # or vddci_voltage_table.last[:clk]
407
+ end
408
+
409
+ # @return [Integer]
410
+ def min_mem_clock
411
+ vddci_voltage_table.first[:clk]
412
+ end
413
+
414
+ # @return [Integer]
415
+ def max_mem_volt
416
+ vddci_voltage_table.last[:volt]
417
+ end
418
+
419
+ # @return [Integer]
420
+ def min_mem_volt
421
+ vddci_voltage_table.first[:volt]
422
+ end
423
+
424
+ # @return [Integer] - the temperature of the asic chip
425
+ def asic_temp
426
+ read_hwmon_data('temp2_input', 0).to_i / 1000
427
+ end
428
+
429
+ # @return [Integer] - the temperature of the memory chips
430
+ def mem_temp
431
+ read_hwmon_data('temp3_input', 0).to_i / 1000
432
+ end
433
+
434
+ def set_mem_clock_and_vddc(mem_clock, mem_volt)
435
+ return unless experimental_on?
436
+
437
+ mem_clock = mem_clock.to_i
438
+ mem_volt = mem_volt.to_i
439
+ # TODO: find max and min values and limit input
440
+ dpm_force_performance_setting('manual')
441
+ raise ArgumentError.new("MemClock value #{mem_clock} must be between #{min_mem_clock}-#{max_mem_clock}") unless mem_clock.between?(min_mem_clock, max_mem_clock)
442
+ raise ArgumentError.new("MemVolt value #{mem_volt} must be between #{min_mem_volt}-#{max_mem_volt}") unless mem_volt.between?(min_mem_volt, max_mem_volt)
443
+
444
+ write_kernel_setting('pp_od_clk_voltage', "r\n") # unlocks in order to write
445
+ # set row in table (m = manual), 3 = row,
446
+ write_kernel_setting('pp_od_clk_voltage', "m 3 #{mem_clock} #{mem_volt}\n")
447
+ write_kernel_setting('pp_od_clk_voltage', "c\n") # locks file
448
+ write_kernel_setting('pp_mclk_od', "3\n")
449
+ logger.info("Successfully applied overclock #{mem_clock} #{mem_volt} to #{name} at #{pci_loc}")
450
+ end
451
+ end
452
+ end
453
+
454
+ # See https://www.kernel.org/doc/html/latest/gpu/amdgpu.html for the amdgpu sysfs kernel interfaces.
455
+ # /sys/kernel/debug/dri/2/name
456
+ # amdgpu dev=0000:08:00.0 unique=0000:08:00.0
457
+
458
+ # Add percent overclock to core speed
459
+ # sudo echo "7" > /sys/class/drm/card0/device/pp_sclk_od
460
+
461
+ # Add percent overclock to mem speed
462
+ # sudo echo "4" > /sys/class/drm/card0/device/pp_mclk_od
463
+
464
+ # The way the current AMDGPU overclocking works for the core frequency
465
+ # is by writing an integer value between 0 and 20 to
466
+ # /sys/class/drm/card0/device/pp_sclk_od. That value represents an
467
+ # overclock of 0~20% above the GPU's core frequency. Similarly,
468
+ # writing a value to /sys/class/drm/card0/device/pp_mclk_od represents
469
+ # a percentage-based overclock to the memory frequency.
470
+ #
471
+ #
472
+ # You can change the frequencies and voltage by modifying
473
+ # the file /sys/class/drm/card0/device/pp_od_clk_voltage
474
+ #
475
+ # first: This holds the presets for pp_dpm_sclk and pp_dpm_mclk.
476
+ #
477
+ # Second check the current settings:
478
+ #
479
+ # sudo cat /sys/class/drm/card0/device/pp_od_clk_voltage
480
+ # You should see something similar to this:
481
+ #
482
+ # OD_SCLK:
483
+ # 0: 300MHz 750mV
484
+ # 1: 588MHz 765mV
485
+ # 2: 980MHz 987mV
486
+ # 3: 1100MHz 950mV
487
+ # 4: 1100MHz 950mV
488
+ # 5: 1100MHz 950mV
489
+ # 6: 1100MHz 950mV
490
+ # 7: 1100MHz 950mV
491
+ # OD_MCLK:
492
+ # 0: 300MHz 750mV
493
+ # 1: 1000MHz 800mV
494
+ # 2: 1970MHz 950mV
495
+ # OD_RANGE:
496
+ # SCLK: 300MHz 2000MHz
497
+ # MCLK: 300MHz 2250MHz
498
+ # VDDC: 750mV 1150mV
499
+ # Example to set 1280Mhz at 950mV (check your output above for possible ranges!):
500
+ #
501
+ # sudo echo "s 7 1280 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
502
+ # Then to apply the changes:
503
+ #
504
+ # sudo echo 0 > /sys/class/drm/card0/device/pp_sclk_od
505
+ # sudo echo 1 > /sys/class/drm/card0/device/pp_sclk_od
506
+ # These are all the settings I use on my crypto mining card which is an AMD Radeon RX570 in case it's useful to anyone.
507
+ #
508
+ # echo 1 > /sys/class/drm/card0/device/hwmon/hwmon0/pwm1_enable
509
+ # echo manual > /sys/class/drm/card0/device/power_dpm_force_performance_level
510
+ # echo 200 > /sys/class/drm/card0/device/hwmon/hwmon0/pwm1 # Fan speed
511
+ # echo 4 > /sys/class/drm/card0/device/pp_power_profile_mode # Compute Mode
512
+ #
513
+ # echo "s 3 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
514
+ # echo "s 4 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
515
+ # echo "s 5 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
516
+ # echo "s 6 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
517
+ # echo "s 7 1100 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
518
+ #
519
+ # echo "m 2 1985 950" > /sys/class/drm/card0/device/pp_od_clk_voltage
520
+ #
521
+ # echo 0 > /sys/class/drm/card0/device/pp_sclk_od
522
+ # echo 1 > /sys/class/drm/card0/device/pp_sclk_od
523
+ #
524
+ # echo 0 > /sys/class/drm/card0/device/pp_mclk_od
525
+ # echo 1 > /sys/class/drm/card0/device/pp_mclk_od