compute_unit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'compute_unit/gpu'
4
+ require 'csv'
5
+ module ComputeUnit
6
+ class NvidiaGpu < ComputeUnit::Gpu
7
+ VENDOR_ID = '10de'
8
+ MAKE = 'Nvidia'
9
+ SUBTYPE = 'nvidia'
10
+ NVIDIA_SMI = '/usr/bin/nvidia-smi'
11
+
12
# Builds the gpu from the attributes found in the nvidia driver information file.
#
# @param device_path [String] path of the device; its basename selects the information file
# @param opts [Hash] extra attributes, merged over the values read from the information file
def initialize(device_path, opts = {})
  attributes = self.class.read_information_file(device_path).merge(opts)
  video_bios = attributes[:video_bios]
  attributes[:pci_loc] = device_path
  attributes[:busid] = attributes[:bus_location]
  attributes[:bios] = video_bios.upcase if video_bios
  attributes[:uuid] = attributes[:gpu_uuid]
  super(device_path, attributes)
end
20
+
21
# @return [String] the subtype of this gpu, taken from the SUBTYPE constant
def subtype
  SUBTYPE
end
24
+
25
# Drops the cached nvidia-smi data held in @meta.
#
# @return [nil]
def reset_metadata
  @meta = nil
end
28
+
29
# Cached nvidia-smi data. The cache is filled on first use and replaced with a
# fresh #metadata call whenever #expired_metadata? reports it as stale.
#
# @return [Hash] the metadata from the nvidia-smi tool
def meta
  return @meta ||= metadata unless expired_metadata?

  logger.debug("Expired Nvidia Data for #{uuid} ")
  @meta = metadata
end
39
+
40
# Placeholder readings that #metadata returns when nvidia-smi cannot be queried.
# Every column is the string '0' except 'pstate', which is the integer 7.
#
# @return [Hash] memoized hash, the same object on every call
def self.blank_data
  @blank_data ||= [
    'memory.used [MiB]',
    'memory.free [MiB]',
    'memory.total [MiB]',
    'utilization.gpu [%]',
    'temperature.gpu',
    'power.draw [W]',
    'power.limit [W]',
    'power.max_limit [W]',
    'pstate',
    'fan.speed [%]',
    'clocks.current.memory [MHz]',
    'clocks.current.sm [MHz]'
  ].map { |column| [column, column == 'pstate' ? 7 : '0'] }.to_h
end
56
+
57
# Queries nvidia-smi for the live readings of this gpu.
#
# @note the query uses --format=csv,nounits, so values carry no unit suffix;
#   stderr is merged into the captured output
# @return [Hash] the first csv row keyed by the stripped column header, or the
#   class's blank_data when nvidia-smi exits with an error or prints no data row
# @example sample row
#   {
#     "name" => "GeForce GTX 1070 Ti",
#     "vbios_version" => "86.04.85.00.63",
#     "uuid" => "GPU-a583cb04-f9b5-68f3-50b9-2b4ba1c7d14e",
#     "memory.used [MiB]" => "2578",
#     "memory.free [MiB]" => "5534",
#     "memory.total [MiB]" => "8112",
#     "utilization.gpu [%]" => "100",
#     "temperature.gpu" => "53",
#     "power.draw [W]" => "129.21",
#     "power.limit [W]" => "130.00",
#     "power.max_limit [W]" => "217.00",
#     "pstate" => "P2",
#     "fan.speed [%]" => "75"
#   }
def metadata
  logger.debug("Calling #{NVIDIA_SMI}")
  output = `#{NVIDIA_SMI} --query-gpu=gpu_name,vbios_version,uuid,memory.used,memory.free,memory.total,utilization.gpu,temperature.gpu,power.draw,power.limit,power.max_limit,fan.speed,pstate,clocks.current.memory,clocks.current.sm -i #{index} --format=csv,nounits 2>&1`
  # Process.last_status is the status that $CHILD_STATUS aliases, but it works
  # without the English library having been required somewhere else.
  unless Process.last_status.success?
    # original author's note: error code 15
    logger.error(output.delete("\n"))
    return self.class.blank_data
  end

  strip = ->(field) { field ? field.strip : nil }
  first_row = CSV.parse(output, headers: true, header_converters: strip, converters: strip).first
  # A header without any data row would otherwise hand nil to every reader of #meta.
  first_row ? first_row.to_h : self.class.blank_data
end
86
+
87
# @return [Integer] the current memory clock speed, read from the
#   'clocks.current.memory [MHz]' column of #meta
def memory_clock
  reading = meta['clocks.current.memory [MHz]']
  reading.to_i
end
91
+
92
# @return [Integer] the current core clock speed, read from the
#   'clocks.current.sm [MHz]' column of #meta
def core_clock
  reading = meta['clocks.current.sm [MHz]']
  reading.to_i
end
96
+
97
# @return [Integer] the fan speed, read from the 'fan.speed [%]' column of #meta
def fan
  reading = meta['fan.speed [%]']
  reading.to_i
end
101
+
102
# @return [Float] the 'power.draw [W]' reading of #meta plus #power_offset
def power
  drawn = meta['power.draw [W]'].strip.to_f
  drawn + power_offset
end
106
+
107
# @return [Integer] the 'temperature.gpu' reading of #meta
def temp
  reading = meta['temperature.gpu']
  reading.to_i
end
110
+
111
# Performance state of the gpu as a plain number.
#
# nvidia-smi reports the state with a letter prefix ("P0".."P12"), for which a
# bare to_i would always give 0, so only the digits are converted. Integer
# readings, such as the 7 held in blank_data, pass through unchanged.
#
# @return [Integer] the performance state, 0 when the reading holds no digits
def pstate
  meta['pstate'].to_s[/\d+/].to_i
end
114
+
115
# @return [Float] the 'power.limit [W]' reading of #meta
def power_limit
  reading = meta['power.limit [W]']
  reading.strip.to_f
end
118
+
119
# @return [Float] the 'power.max_limit [W]' reading of #meta
def power_max_limit
  reading = meta['power.max_limit [W]']
  reading.strip.to_f
end
122
+
123
# Sets the power limit of this gpu through nvidia-smi (-pl). A failing
# nvidia-smi call is logged as a warning and does not raise.
#
# @param value [Numeric, String] power in watts to set the gpu limit to
# @return [Integer] the requested limit truncated to whole watts
# @raise [ArgumentError] when value is not a number or its whole-watt part lies
#   outside 1..power_max_limit
def power_limit=(value)
  # Only a parsed number is interpolated into the shell command below, so a
  # string such as "100; reboot" cannot smuggle in a second command.
  watts = begin
    value.is_a?(Numeric) ? value : Float(value)
  rescue ArgumentError, TypeError
    raise ArgumentError, "Power value #{value.inspect} is not a number"
  end
  max_watts = power_max_limit.to_i
  raise ArgumentError, "Power value #{watts.to_i} must be between 1 and #{max_watts}" unless watts.to_i.between?(1, max_watts)

  output = `#{NVIDIA_SMI} -i #{index} -pl #{watts}`
  # Process.last_status works without the English library being loaded.
  if Process.last_status.success?
    logger.info("GPU#{index} power set to #{watts} Watts")
  else
    logger.warn("GPU#{index} failed setting power to #{watts}\n#{output}")
  end
  watts.to_i
end
136
+
137
# Fan control is not available for Nvidia cards; this method always raises.
#
# @param _value [Numeric] - the fan limit as a percentage (unused)
# @param _type [String] - unused
# @raise [NotImplementedError] always
def set_fan_limit(_value, _type = 'current')
  raise NotImplementedError, 'Not implemented for Nvidia'
end
142
+
143
# @return [Object] the raw 'memory.total [MiB]' entry of #meta, unconverted
def memory_total
  meta.[]('memory.total [MiB]')
end
146
+
147
# @return [Object] the raw 'memory.used [MiB]' entry of #meta, unconverted
def memory_used
  meta.[]('memory.used [MiB]')
end
150
+
151
# @return [Object] the raw 'memory.free [MiB]' entry of #meta, unconverted
def memory_free
  meta.[]('memory.free [MiB]')
end
154
+
155
# @return [Integer] the 'utilization.gpu [%]' reading of #meta with the first
#   percent sign removed
def utilization
  reading = meta['utilization.gpu [%]']
  reading.sub('%', '').to_i
end
158
+
159
# @return [String] memoized path of the nvidia driver information file, built
#   from the basename of #device_path
def information_file
  @information_file ||= File.join('/proc/driver/nvidia/gpus', File.basename(device_path), 'information')
end
165
+
166
# @return [Array] - the device paths from ComputeUnit::Gpu.devices whose vendor id equals VENDOR_ID
def self.devices
  ComputeUnit::Gpu.devices.select do |path|
    device_vendor(path) == VENDOR_ID
  end
end
170
+
171
# Instantiates a gpu for the given path, pre-populating the pci identifiers.
#
# @param device_path [String] - the device path of the device
# @param index [Integer] - the index of the device relative to other devices of the same class ie. GPU0
# @param use_opencl [Boolean] - passed through as the :use_opencl attribute
# @return [Object] whatever new returns for the path and the assembled attributes
def self.create_from_path(device_path, index, use_opencl = false)
  attributes = {}
  attributes[:device_class_id] = device_class(device_path)
  attributes[:device_id] = device(device_path)
  attributes[:device_vendor_id] = device_vendor(device_path)
  attributes[:subsystem_vendor_id] = subsystem_vendor(device_path)
  attributes[:subsystem_device_id] = subsystem_device(device_path)
  attributes[:use_opencl] = use_opencl
  attributes[:index] = index
  new(device_path, attributes)
end
185
+
186
# Builds one instance per path returned by devices. Each instance is indexed by
# the position of its path in ComputeUnit::Gpu.found_devices.
#
# @param use_opencl [Boolean] - forwarded to create_from_path
# @return [Array] - the results of create_from_path, in the order of devices
def self.find_all(use_opencl = false)
  devices.map do |device_path|
    position = ComputeUnit::Gpu.found_devices.index(device_path)
    create_from_path(device_path, position, use_opencl)
  end
end
193
+
194
# Memory clock / voltage tuning is not available for nvidia. When
# #experimental_on? is true the call only logs a warning; otherwise it does nothing.
#
# @param _mem_clock [Object] unused
# @param _mem_volt [Object] unused
# @return [Object, nil] the result of the warn call, or nil when nothing was logged
def set_mem_clock_and_vddc(_mem_clock, _mem_volt)
  logger.warn('Feature not enabled for nvidia') if experimental_on?
end
199
+
200
# Reads /proc/driver/nvidia/gpus/<device name>/information and turns every
# "Label: value" line into a symbol => string pair.
#
# Lines are parsed one at a time and split on the first colon only. An empty
# value therefore stays empty instead of swallowing the following line, blank
# lines are skipped, and values that themselves contain colons are kept whole.
#
# @param device_path [String] device path; only its basename is used
# @return [Hash] - hash of card info given by the kernel
#   {:model=>"GeForce GTX 1070",
#    :irq=>"130",
#    :gpu_uuid=>"GPU-0116fb5c-66f4-1cba-c216-97f4600a8152",
#    :video_bios=>"86.04.50.40.4a",
#    :bus_type=>"PCIe",
#    :dma_size=>"47 bits",
#    :dma_mask=>"0x7fffffffffff",
#    :bus_location=>"0000:0d:00.0",
#    :device_minor=>"7"}
def self.read_information_file(device_path)
  device_name = File.basename(device_path)
  information_file = "/proc/driver/nvidia/gpus/#{device_name}/information"
  File.read(information_file).each_line.each_with_object({}) do |line, info|
    label, value = line.split(':', 2)
    next if value.nil? || label.strip.empty?

    info[label.strip.downcase.tr(' ', '_').to_sym] = value.strip
  end
end
218
+
219
+ # more information can be found here: /sys/bus/pci/devices/0000:05:00.0
220
+
221
+ # index|misc|busid|model|bios|dpm|cor|cor_offset|default_cor|max_cor|mem_clock|mem_offset|default_mem|temp|temp_throttle|shut_throttle|fan|fan_rpm|pwr|pwr_limit|default_pwr_limit|throttle_reason|sysfs_path
222
+ end
223
+ end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'logger'
4
+
5
module ComputeUnit
  # Shared logger for the library. The module-level methods build and configure
  # a single ::Logger; including the module gives an object a #logger reader.
  module Logger
    # @return [String, IO] ENV['LOG_FILENAME'] when it names an existing file, STDOUT otherwise
    def self.log_file
      path = ENV['LOG_FILENAME']
      path && File.exist?(path) ? path : STDOUT
    end

    # @return [::Logger] memoized logger writing to log_file. Every message is
    #   sent through the String method named by color; the timestamp is left
    #   out while the target is STDOUT.
    def self.logger
      @logger ||= ::Logger.new(Logger.log_file).tap do |log|
        log.level = log_level
        log.progname = 'ComputeUnit'
        log.formatter = proc do |severity, datetime, progname, msg|
          stamp = Logger.log_file == STDOUT ? '' : "#{datetime} "
          "#{stamp}#{severity} - #{progname}: #{msg}\n".send(color(severity))
        end
      end
    end

    # @return [::Logger] the module-level logger, cached on the including object
    def logger
      @logger ||= Logger.logger
    end

    # @param severity [Integer, String] a ::Logger::Severity value or its name
    # @return [Symbol] name of the String method used to colour a message of that severity
    def self.color(severity)
      levels = ::Logger::Severity
      case severity
      when levels::WARN, 'WARN' then :yellow
      when levels::FATAL, 'FATAL', levels::ERROR, 'ERROR' then :fatal
      else :green
      end
    end

    # @return [Integer] the severity named by ENV['LOG_LEVEL'] (case-insensitive);
    #   INFO when the variable is unset or not recognised
    def self.log_level
      levels = ::Logger::Severity
      {
        'warn' => levels::WARN,
        'fatal' => levels::FATAL,
        'debug' => levels::DEBUG,
        'info' => levels::INFO,
        'error' => levels::ERROR
      }.fetch(ENV['LOG_LEVEL'].to_s.downcase, levels::INFO)
    end
  end
end
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
class File
  # Reads the last n lines of a file by scanning backwards from its end in
  # 512-byte chunks, so only the scanned tail is held in memory.
  #
  # A newline that terminates the file closes the last line and is not counted
  # as a line boundary, so files with and without a trailing newline give the
  # same number of lines. The result never starts with the newline that
  # belongs to the preceding line.
  #
  # @param path [String] - the path to the file
  # @param n [Integer] - the number of lines to read from the path
  # @return [String] - the last n lines (the whole file when it has fewer), or
  #   '' when the file does not exist or n is not positive
  def self.tail(path, n = 1)
    return '' unless File.exist?(path) && n.positive?

    File.open(path, 'r') do |file|
      chunk_size = 512
      file.seek(0, IO::SEEK_END)
      pos = file.pos
      if pos.positive?
        file.seek(pos - 1)
        pos -= 1 if file.read(1) == "\n" # step over the terminating newline
      end

      start = 0 # byte offset of the first wanted line; 0 when the file is too short
      remaining = n
      while pos.positive? && remaining.positive?
        to_read = [chunk_size, pos].min
        pos -= to_read
        file.seek(pos)
        chunk = file.read(to_read)
        (to_read - 1).downto(0) do |i|
          next unless chunk.getbyte(i) == 10 # "\n"

          remaining -= 1
          next unless remaining.zero?

          start = pos + i + 1 # first byte after the boundary newline
          break
        end
      end
      file.seek(start)
      file.read
    end
  end
end
42
+
43
class String
  # Wraps the string in an ANSI escape sequence.
  #
  # @param color_code [Integer, String] the code placed between "\e[" and "m"
  # @return [String] a new string followed by the reset sequence "\e[0m"
  def colorize(color_code)
    opening = "\e[#{color_code}m"
    "#{opening}#{self}\e[0m"
  end

  # @return [String] the string wrapped with colour code 31
  def red
    colorize(31)
  end

  # @return [String] the string wrapped with colour code 32
  def green
    colorize(32)
  end

  # @return [String] same as #red
  def fatal
    red
  end

  # @return [String] the string wrapped with colour code 33
  def yellow
    colorize(33)
  end
end
64
+
65
class Hash
  # @return [Hash] a new hash whose keys are converted with to_s; nested Hash
  #   values are converted recursively
  def stringify_keys
    map do |key, value|
      [key.to_s, value.is_a?(Hash) ? value.stringify_keys : value]
    end.to_h
  end

  # Non-destructive variant of #symbolize_keys!, working on a dup of the hash.
  #
  # @yield [key] optional filter; only keys for which it is truthy are converted
  # @return [Hash] the converted copy
  def symbolize_keys(&select)
    dup.symbolize_keys!(&select)
  end

  # Replaces keys with symbols in place (top level only). A key without a
  # usable to_sym is converted through to_s first. Every converted entry is
  # deleted and re-inserted, so it moves behind the entries that were skipped.
  #
  # @yield [key] optional filter; only keys for which it is truthy are converted
  # @return [Hash] self
  def symbolize_keys!(&select)
    keys.each do |key|
      next if select && !select[key]

      symbol = begin
        key.to_sym
      rescue StandardError
        key.to_s.to_sym
      end
      self[symbol] = delete(key)
    end
    self
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'compute_unit/exceptions'
4
+ require 'etc'
5
module ComputeUnit
  # Helpers for checking the privileges of the current user.
  module Utils
    # @return [Boolean] - return true if the current user is root; false as well
    #   when the password database has no entry for the current user
    def self.root?
      ::Etc.getpwuid&.name == 'root'
    end

    # Instance-level counterpart for objects that include this module. It
    # delegates explicitly to the module method; an unqualified `root?` here
    # would call this same method again and never return.
    #
    # @return [Boolean] - return true if the current user is root
    def root?
      Utils.root?
    end

    # @return [Boolean] - returns true if user is root
    # @raise [Exceptions::NoPermission] if user does not have permission
    def check_for_root
      # Utils.root? is called explicitly so the check behaves the same whether
      # it is invoked as Utils.check_for_root or from an including object.
      raise Exceptions::NoPermission, 'Please run this command as root or with sudo' unless Utils.root?

      true
    end
    module_function :check_for_root
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module ComputeUnit
  # Version string of the compute_unit library.
  VERSION = '0.1.0'
end
metadata ADDED
@@ -0,0 +1,142 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: compute_unit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Corey Osman
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2020-05-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: opencl_ruby_ffi
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.3.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 1.3.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.0'
69
+ description: |2+
70
+
71
+ A Ruby library that uses the Linux sysfs file system to search for compute unit devices such as
72
+ CPUs, GPUs and other ASIC compute devices. Allows programmatic access to collect real-time metrics from the kernel or related driver toolchain.
73
+ It is meant to be used as a foundation to build future tooling on. This library also makes use of the OpenCL toolchain and requires
74
+ the opencl_ruby_ffi gem.
75
+
76
+ email:
77
+ - opselite@blockops.party
78
+ executables:
79
+ - list_computes
80
+ - update_pcidb
81
+ extensions: []
82
+ extra_rdoc_files: []
83
+ files:
84
+ - ".gitignore"
85
+ - ".gitlab-ci.yml"
86
+ - ".rspec"
87
+ - ".rubocop.yml"
88
+ - ".rubocop_todo.yml"
89
+ - ".ruby_version"
90
+ - CHANGELOG.md
91
+ - CODE_OF_CONDUCT.md
92
+ - Gemfile
93
+ - LICENSE.txt
94
+ - README.md
95
+ - Rakefile
96
+ - bin/console
97
+ - bin/setup
98
+ - compute_unit.gemspec
99
+ - exe/list_computes
100
+ - exe/update_pcidb
101
+ - lib/compute_unit.rb
102
+ - lib/compute_unit/asic.rb
103
+ - lib/compute_unit/cache_store.rb
104
+ - lib/compute_unit/compute_base.rb
105
+ - lib/compute_unit/cpu.rb
106
+ - lib/compute_unit/device.rb
107
+ - lib/compute_unit/exceptions.rb
108
+ - lib/compute_unit/formatters.rb
109
+ - lib/compute_unit/gpu.rb
110
+ - lib/compute_unit/gpus/amd_gpu.rb
111
+ - lib/compute_unit/gpus/nvidia_gpu.rb
112
+ - lib/compute_unit/logger.rb
113
+ - lib/compute_unit/monkey_patches.rb
114
+ - lib/compute_unit/utils.rb
115
+ - lib/compute_unit/version.rb
116
+ homepage: https://gitlab.com/blockops/compute_unit
117
+ licenses:
118
+ - MIT
119
+ metadata:
120
+ homepage_uri: https://gitlab.com/blockops/compute_unit
121
+ source_code_uri: https://gitlab.com/blockops/compute_unit
122
+ changelog_uri: https://gitlab.com/blockops/compute_unit/CHANGELOG
123
+ post_install_message:
124
+ rdoc_options: []
125
+ require_paths:
126
+ - lib
127
+ required_ruby_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '2.5'
132
+ required_rubygems_version: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubygems_version: 3.0.3
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: A ruby library for compute unit devices
142
+ test_files: []