compute_unit 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,223 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'compute_unit/gpu'
4
+ require 'csv'
5
module ComputeUnit
  # GPU implementation for Nvidia cards.
  #
  # Static card details are read from the driver's
  # /proc/driver/nvidia/gpus/<device>/information file, live metrics are
  # queried through the nvidia-smi command line tool.
  class NvidiaGpu < ComputeUnit::Gpu
    VENDOR_ID = '10de'
    MAKE = 'Nvidia'
    SUBTYPE = 'nvidia'
    NVIDIA_SMI = '/usr/bin/nvidia-smi'

    # @param device_path [String] the device path of the device
    # @param opts [Hash] extra attributes, merged over the values read from the
    #   driver information file
    def initialize(device_path, opts = {})
      data = self.class.read_information_file(device_path).merge(opts)
      data[:pci_loc] = device_path
      data[:busid] = data[:bus_location]
      data[:bios] = data[:video_bios].upcase if data[:video_bios]
      data[:uuid] = data[:gpu_uuid]
      super(device_path, data)
    end

    # @return [String] the subtype identifier of this gpu
    def subtype
      SUBTYPE
    end

    # Drops the cached nvidia-smi data so the next call to {#meta} fetches it again.
    def reset_metadata
      @meta = nil
    end

    # @return [Hash] the metadata from the nvidia-smi tool
    # return cached data or fetch new data
    def meta
      if expired_metadata?
        logger.debug("Expired Nvidia Data for #{uuid} ")
        @meta = metadata
      else
        @meta ||= metadata
      end
    end

    # @return [Hash] zeroed placeholder values, used when nvidia-smi cannot be queried
    # @note the same memoized hash instance is returned on every call
    def self.blank_data
      @blank_data ||= {
        'memory.used [MiB]' => '0',
        'memory.free [MiB]' => '0',
        'memory.total [MiB]' => '0',
        'utilization.gpu [%]' => '0',
        'temperature.gpu' => '0',
        'power.draw [W]' => '0',
        'power.limit [W]' => '0',
        'power.max_limit [W]' => '0',
        'pstate' => 7,
        'fan.speed [%]' => '0',
        'clocks.current.memory [MHz]' => '0',
        'clocks.current.sm [MHz]' => '0'
      }
    end

    # @note data returned from nvidia-smi
    # @return [Hash]
    #  "name": "GeForce GTX 1070 Ti",
    #  "vbios_version": "86.04.85.00.63",
    #  "uuid": "GPU-a583cb04-f9b5-68f3-50b9-2b4ba1c7d14e",
    #  "memory.used [MiB]": "2578 MiB",
    #  "memory.free [MiB]": "5534 MiB",
    #  "memory.total [MiB]": "8112 MiB",
    #  "utilization.gpu [%]": "100",
    #  "temperature.gpu": "53",
    #  "power.draw [W]": "129.21",
    #  "power.limit [W]": "130.00",
    #  "power.max_limit [W]": "217.00",
    #  "pstate": 2,
    #  "fan.speed [%]": "75"
    # Falls back to {.blank_data} when nvidia-smi exits with an error or
    # prints no data row.
    def metadata
      logger.debug("Calling #{NVIDIA_SMI}")
      data = `#{NVIDIA_SMI} --query-gpu=gpu_name,vbios_version,uuid,memory.used,memory.free,memory.total,utilization.gpu,temperature.gpu,power.draw,power.limit,power.max_limit,fan.speed,pstate,clocks.current.memory,clocks.current.sm -i #{index} --format=csv,nounits 2>&1`
      # NOTE(review): $CHILD_STATUS needs the 'English' library, which this file
      # does not require itself - presumably it is loaded elsewhere; confirm.
      unless $CHILD_STATUS.success?
        # error code 15
        logger.error(data.delete("\n"))
        return self.class.blank_data
      end
      rows = CSV.parse(data, headers: true, header_converters: ->(f) { f.strip },
                             converters: ->(f) { f ? f.strip : nil }).map(&:to_h)
      # A header-only answer yields no rows; use the placeholder values instead
      # of returning nil, which would break every accessor below.
      rows.first || self.class.blank_data
    end

    # @return [Integer] the current memory clock speed
    def memory_clock
      meta['clocks.current.memory [MHz]'].to_i
    end

    # @return [Integer] the current core clock speed
    def core_clock
      meta['clocks.current.sm [MHz]'].to_i
    end

    # @return [Integer] the fan speed
    def fan
      meta['fan.speed [%]'].to_i
    end

    # @return [Float] the power being used by the gpu
    def power
      meta['power.draw [W]'].strip.to_f + power_offset
    end

    # @return [Integer] the gpu temperature as reported by nvidia-smi
    def temp
      meta['temperature.gpu'].to_i
    end

    # @return [Integer] the numeric performance state, 0 when it cannot be read
    def pstate
      # nvidia-smi reports the state as "P0".."P12"; a plain to_i on that is
      # always 0, so pull the digits out first. Bare numbers (including the
      # Integer in blank_data) keep working.
      meta['pstate'].to_s[/\d+/].to_i
    end

    # @return [Float] the currently enforced power limit
    def power_limit
      meta['power.limit [W]'].strip.to_f
    end

    # @return [Float] the highest power limit the card accepts
    def power_max_limit
      meta['power.max_limit [W]'].strip.to_f
    end

    # @param value [Numeric] power in watts to set the gpu limit to
    # @raise [ArgumentError] when the value is below 1 or above the card's max limit
    def power_limit=(value)
      max_limit = power_max_limit
      # must be at least 1 watt and no more than the card's maximum
      raise ArgumentError, "Power value #{value.to_i} cannot exceed #{max_limit}" unless value.to_i.between?(1, max_limit.to_i)

      # Argument-vector form: no shell is involved, so the value (only its
      # integer prefix was validated above) cannot smuggle in extra commands.
      output = IO.popen([NVIDIA_SMI, '-i', index.to_s, '-pl', value.to_s], &:read)
      if $CHILD_STATUS.success?
        logger.info("GPU#{index} power set to #{value} Watts")
      else
        logger.warn("GPU#{index} failed setting power to #{value}\n#{output}")
      end
      value.to_i
    end

    # @param value [Numeric] - the fan limit that should be applied to the gpu as a percentage
    # @return [Numeric] - original passed in value after being set
    # @raise [NotImplementedError] always, fan control is not implemented for Nvidia
    def set_fan_limit(_value, _type = 'current')
      raise NotImplementedError, 'Not implemented for Nvidia'
    end

    # @return [String] total memory as reported by nvidia-smi
    def memory_total
      meta['memory.total [MiB]']
    end

    # @return [String] used memory as reported by nvidia-smi
    def memory_used
      meta['memory.used [MiB]']
    end

    # @return [String] free memory as reported by nvidia-smi
    def memory_free
      meta['memory.free [MiB]']
    end

    # @return [Integer] the gpu utilization percentage
    def utilization
      meta['utilization.gpu [%]'].sub(/%/, '').to_i
    end

    # @return [String] path of the driver information file for this device
    def information_file
      @information_file ||= begin
        device_name = File.basename(device_path)
        File.join('/proc/driver/nvidia/gpus', device_name, 'information')
      end
    end

    # @return [Array] - returns a list of device paths of all devices specific to the vendor id
    def self.devices
      ComputeUnit::Gpu.devices.find_all { |f| device_vendor(f) == VENDOR_ID }
    end

    # @param device_path [String] - the device path of the device
    # @param index [Integer] - the index of the device relative to other devices of the same class ie. GPU0
    # @param use_opencl [Boolean] - passed through to the instance as the :use_opencl option
    def self.create_from_path(device_path, index, use_opencl = false)
      opts = {
        device_class_id: device_class(device_path),
        device_id: device(device_path),
        device_vendor_id: device_vendor(device_path),
        subsystem_vendor_id: subsystem_vendor(device_path),
        subsystem_device_id: subsystem_device(device_path),
        use_opencl: use_opencl,
        index: index
      }
      new(device_path, opts)
    end

    # @return [Array] - returns an array of gpu instances of NVIDIA type only
    def self.find_all(use_opencl = false)
      devices.map do |device_path|
        # the index is the position among all found gpus, not only the Nvidia ones
        found_index = ComputeUnit::Gpu.found_devices.index(device_path)
        create_from_path(device_path, found_index, use_opencl)
      end
    end

    # Not supported on Nvidia; only logs a warning when experimental mode is on.
    def set_mem_clock_and_vddc(_mem_clock, _mem_volt)
      return unless experimental_on?

      logger.warn('Feature not enabled for nvidia')
    end

    # @return [Hash] - hash of card info given by the kernel
    #   {:model=>"GeForce GTX 1070",
    #   :irq=>"130",
    #   :gpu_uuid=>"GPU-0116fb5c-66f4-1cba-c216-97f4600a8152",
    #   :video_bios=>"86.04.50.40.4a",
    #   :bus_type=>"PCIe",
    #   :dma_size=>"47 bits",
    #   :dma_mask=>"0x7fffffffffff",
    #   :bus_location=>"0000:0d:00.0",
    #   :device_minor=>"7"}
    def self.read_information_file(device_path)
      device_name = File.basename(device_path)
      information_file = "/proc/driver/nvidia/gpus/#{device_name}/information"
      content = File.read(information_file)
      # every "Some Key:   value" line becomes a :some_key => "value" pair
      content.scan(/\n?([\w\s]*):\s+(.*)/).map { |key, value| [key.downcase.tr(' ', '_').to_sym, value] }.to_h
    end

    # more information can be found here: /sys/bus/pci/devices/0000:05:00.0

    # index|misc|busid|model|bios|dpm|cor|cor_offset|default_cor|max_cor|mem_clock|mem_offset|default_mem|temp|temp_throttle|shut_throttle|fan|fan_rpm|pwr|pwr_limit|default_pwr_limit|throttle_reason|sysfs_path
  end
end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'logger'
4
+
5
module ComputeUnit
  # Logging support for the library: a shared, memoized ::Logger plus a
  # +logger+ instance method for classes that mix this module in.
  #
  # Configuration comes from the environment:
  # * LOG_FILENAME - path of an already existing file to log to, otherwise STDOUT
  # * LOG_LEVEL    - warn, fatal, debug, info or error (case insensitive), otherwise info
  #
  # Messages are colorized through String#yellow, String#green and
  # String#fatal, which this module expects to be defined on String.
  module Logger
    # @return [String, IO] the path from LOG_FILENAME when that file exists, otherwise STDOUT
    def self.log_file
      path = ENV['LOG_FILENAME']
      path && File.exist?(path) ? path : STDOUT
    end

    # @return [::Logger] the shared logger, created on first use
    def self.logger
      @logger ||= begin
        # Resolve the destination once. Asking log_file again for every message
        # costs an ENV lookup plus a File.exist? call each time, and its answer
        # could differ from the destination this logger was opened on.
        destination = log_file
        to_stdout = destination == STDOUT
        log = ::Logger.new(destination)
        log.level = log_level
        log.progname = 'ComputeUnit'
        log.formatter = proc do |severity, datetime, progname, msg|
          # the timestamp is only written when logging to a file
          line = if to_stdout
                   "#{severity} - #{progname}: #{msg}\n"
                 else
                   "#{datetime} #{severity} - #{progname}: #{msg}\n"
                 end
          line.send(color(severity))
        end
        log
      end
    end

    # @return [::Logger] the shared logger, memoized per including object
    def logger
      @logger ||= Logger.logger
    end

    # @param severity [Integer, String] a ::Logger::Severity constant or its name
    # @return [Symbol] name of the String method used to colorize a message
    def self.color(severity)
      case severity
      when ::Logger::Severity::WARN, 'WARN'
        :yellow
      when ::Logger::Severity::FATAL, 'FATAL', ::Logger::Severity::ERROR, 'ERROR'
        :fatal
      else
        # INFO, DEBUG and anything unknown
        :green
      end
    end

    # @return [Integer] the ::Logger::Severity selected by LOG_LEVEL, INFO by default
    def self.log_level
      case ENV['LOG_LEVEL'].to_s.downcase
      when 'warn'
        ::Logger::Severity::WARN
      when 'fatal'
        ::Logger::Severity::FATAL
      when 'debug'
        ::Logger::Severity::DEBUG
      when 'error'
        ::Logger::Severity::ERROR
      else
        ::Logger::Severity::INFO
      end
    end
  end
end
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
class File
  # @param path [String] - the path to the file
  # @param n [Integer] - the number of lines to read from the path
  # @summary Reads N lines from the end of file, without reading the entire file into memory
  # @return [String] - the data read from the file, '' when the file does not exist
  # @note Lines are located by counting newlines backwards from the end of the
  #   file: the result starts right after the (n + 1)-th newline, or at the
  #   beginning of the file when there are fewer. A file that does not end in a
  #   newline therefore yields its unterminated last line plus n more lines.
  def self.tail(path, n = 1)
    return '' if n.negative? || !File.exist?(path)

    File.open(path, 'r') do |file|
      chunk_size = 512
      newlines_seen = 0
      offset = file.size # we start at the end

      while offset.positive?
        to_read = [chunk_size, offset].min
        offset -= to_read
        file.seek(offset)
        chunk = file.read(to_read)
        break if chunk.nil?

        # Walk the chunk backwards byte by byte. The start position is computed
        # at the moment the deciding newline is found, so it is correct even
        # when that newline is the first byte of a chunk or of the file.
        (chunk.bytesize - 1).downto(0) do |i|
          next unless chunk.getbyte(i) == 10 # "\n"

          newlines_seen += 1
          next unless newlines_seen > n

          file.seek(offset + i + 1)
          return file.read
        end
      end

      # fewer than n + 1 newlines in the file: return all of it
      file.rewind
      file.read
    end
  end
end
42
+
43
class String
  # Wraps the string in an ANSI escape sequence and resets the style afterwards.
  # @param color_code [Integer, String] the ANSI SGR color code
  # @return [String] a new, colorized string
  def colorize(color_code)
    start_sequence = "\e[#{color_code}m"
    reset_sequence = "\e[0m"
    "#{start_sequence}#{self}#{reset_sequence}"
  end

  # @return [String] the string in red (ANSI code 31)
  def red
    colorize(31)
  end

  # @return [String] the string in green (ANSI code 32)
  def green
    colorize(32)
  end

  # @return [String] the string in the color used for fatal messages (red)
  def fatal
    red
  end

  # @return [String] the string in yellow (ANSI code 33)
  def yellow
    colorize(33)
  end
end
64
+
65
class Hash
  # @return [Hash] a new hash with every key converted to a String; nested
  #   hashes are converted recursively
  def stringify_keys
    each_with_object({}) do |(key, value), hash|
      hash[key.to_s] = value.is_a?(Hash) ? value.stringify_keys : value
    end
  end

  # @yield [key] optional filter, only keys for which the block is truthy are converted
  # @return [Hash] a copy of the hash with its (top level) keys converted to symbols
  def symbolize_keys(&select)
    dup.symbolize_keys!(&select)
  end

  # Converts the (top level) keys to symbols in place.
  # @yield [key] optional filter, only keys for which the block is truthy are converted
  # @return [Hash] self
  def symbolize_keys!(&select)
    # keys is a snapshot, so the hash can safely be modified inside the loop
    keys.each do |key|
      next if select && !select[key]

      new_key = begin
        key.to_sym
      rescue StandardError
        # keys without #to_sym (an Integer for example) go through their string form
        key.to_s.to_sym
      end
      self[new_key] = delete(key)
    end
    self
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'compute_unit/exceptions'
4
+ require 'etc'
5
module ComputeUnit
  # Helpers around the privileges of the current user. Usable as
  # ComputeUnit::Utils.root? / ComputeUnit::Utils.check_for_root or mixed into a class.
  module Utils
    # @return [Boolean] - return true if the current user is root
    def self.root?
      ::Etc.getpwuid.name == 'root'
    end

    # @return [Boolean] - return true if the current user is root
    def root?
      # Delegate explicitly to the module method; a bare `root?` here would
      # call this very method again and never terminate.
      Utils.root?
    end

    # @return [Boolean] - returns true if user is root
    # @raise [ComputeUnit::Exceptions::NoPermission] if user does not have permission
    def check_for_root
      raise Exceptions::NoPermission, 'Please run this command as root or with sudo' unless root?

      true
    end
    module_function :check_for_root
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module ComputeUnit
  # Version of the compute_unit gem.
  VERSION = '0.1.0'
end
metadata ADDED
@@ -0,0 +1,142 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: compute_unit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Corey Osman
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-05-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: opencl_ruby_ffi
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.3.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 1.3.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.0'
69
+ description: |2+
70
+
71
+ A ruby library that uses the linux sysfs file system to search for compute unit devices such as
72
+ CPUs, GPUs and other ASIC compute devices. Allows programmatic access to collect real time metrics from the kernel or related driver toolchain.
73
+ Is meant to be used as a toolchain to build future tooling on. This library also makes use of the opencl toolchain and requires
74
+ the opencl_ruby_ffi gem.
75
+
76
+ email:
77
+ - opselite@blockops.party
78
+ executables:
79
+ - list_computes
80
+ - update_pcidb
81
+ extensions: []
82
+ extra_rdoc_files: []
83
+ files:
84
+ - ".gitignore"
85
+ - ".gitlab-ci.yml"
86
+ - ".rspec"
87
+ - ".rubocop.yml"
88
+ - ".rubocop_todo.yml"
89
+ - ".ruby_version"
90
+ - CHANGELOG.md
91
+ - CODE_OF_CONDUCT.md
92
+ - Gemfile
93
+ - LICENSE.txt
94
+ - README.md
95
+ - Rakefile
96
+ - bin/console
97
+ - bin/setup
98
+ - compute_unit.gemspec
99
+ - exe/list_computes
100
+ - exe/update_pcidb
101
+ - lib/compute_unit.rb
102
+ - lib/compute_unit/asic.rb
103
+ - lib/compute_unit/cache_store.rb
104
+ - lib/compute_unit/compute_base.rb
105
+ - lib/compute_unit/cpu.rb
106
+ - lib/compute_unit/device.rb
107
+ - lib/compute_unit/exceptions.rb
108
+ - lib/compute_unit/formatters.rb
109
+ - lib/compute_unit/gpu.rb
110
+ - lib/compute_unit/gpus/amd_gpu.rb
111
+ - lib/compute_unit/gpus/nvidia_gpu.rb
112
+ - lib/compute_unit/logger.rb
113
+ - lib/compute_unit/monkey_patches.rb
114
+ - lib/compute_unit/utils.rb
115
+ - lib/compute_unit/version.rb
116
+ homepage: https://gitlab.com/blockops/compute_unit
117
+ licenses:
118
+ - MIT
119
+ metadata:
120
+ homepage_uri: https://gitlab.com/blockops/compute_unit
121
+ source_code_uri: https://gitlab.com/blockops/compute_unit
122
+ changelog_uri: https://gitlab.com/blockops/compute_unit/CHANGELOG
123
+ post_install_message:
124
+ rdoc_options: []
125
+ require_paths:
126
+ - lib
127
+ required_ruby_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '2.5'
132
+ required_rubygems_version: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubygems_version: 3.0.3
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: A ruby library for compute unit devices
142
+ test_files: []