gitlab-monitor 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -0
  3. data/.gitlab-ci.yml +18 -0
  4. data/.rubocop.yml +34 -0
  5. data/CONTRIBUTING.md +651 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +75 -0
  8. data/LICENSE +25 -0
  9. data/README.md +110 -0
  10. data/bin/gitlab-mon +17 -0
  11. data/config/gitlab-monitor.yml.example +112 -0
  12. data/gitlab-monitor.gemspec +33 -0
  13. data/lib/gitlab_monitor.rb +18 -0
  14. data/lib/gitlab_monitor/cli.rb +341 -0
  15. data/lib/gitlab_monitor/database.rb +13 -0
  16. data/lib/gitlab_monitor/database/base.rb +44 -0
  17. data/lib/gitlab_monitor/database/bloat.rb +74 -0
  18. data/lib/gitlab_monitor/database/bloat_btree.sql +84 -0
  19. data/lib/gitlab_monitor/database/bloat_table.sql +63 -0
  20. data/lib/gitlab_monitor/database/ci_builds.rb +527 -0
  21. data/lib/gitlab_monitor/database/remote_mirrors.rb +74 -0
  22. data/lib/gitlab_monitor/database/row_count.rb +164 -0
  23. data/lib/gitlab_monitor/database/tuple_stats.rb +53 -0
  24. data/lib/gitlab_monitor/git.rb +144 -0
  25. data/lib/gitlab_monitor/memstats.rb +98 -0
  26. data/lib/gitlab_monitor/memstats/mapping.rb +91 -0
  27. data/lib/gitlab_monitor/prober.rb +40 -0
  28. data/lib/gitlab_monitor/process.rb +122 -0
  29. data/lib/gitlab_monitor/prometheus.rb +64 -0
  30. data/lib/gitlab_monitor/sidekiq.rb +149 -0
  31. data/lib/gitlab_monitor/sidekiq_queue_job_stats.lua +42 -0
  32. data/lib/gitlab_monitor/util.rb +83 -0
  33. data/lib/gitlab_monitor/version.rb +5 -0
  34. data/lib/gitlab_monitor/web_exporter.rb +77 -0
  35. data/spec/cli_spec.rb +31 -0
  36. data/spec/database/bloat_spec.rb +99 -0
  37. data/spec/database/ci_builds_spec.rb +421 -0
  38. data/spec/database/row_count_spec.rb +37 -0
  39. data/spec/fixtures/smaps/sample.txt +10108 -0
  40. data/spec/git_process_proper_spec.rb +27 -0
  41. data/spec/git_spec.rb +52 -0
  42. data/spec/memstats_spec.rb +28 -0
  43. data/spec/prometheus_metrics_spec.rb +17 -0
  44. data/spec/spec_helper.rb +63 -0
  45. data/spec/util_spec.rb +15 -0
  46. metadata +225 -0
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GitLab
4
+ module Monitor
5
+ module MemStats
6
+ # Parses one entry in /proc/[pid]/smaps. For example:
7
+ #
8
+ # 00400000-00401000 r-xp 00000000 08:01 541055 /opt/gitlab/embedded/bin/ruby
9
+ # Size: 4 kB
10
+ # Rss: 4 kB
11
+ # Pss: 0 kB
12
+ # Shared_Clean: 4 kB
13
+ # Shared_Dirty: 0 kB
14
+ # Private_Clean: 0 kB
15
+ # Private_Dirty: 0 kB
16
+ # Referenced: 4 kB
17
+ # Anonymous: 0 kB
18
+ # AnonHugePages: 0 kB
19
+ # Shared_Hugetlb: 0 kB
20
+ # Private_Hugetlb: 0 kB
21
+ # Swap: 0 kB
22
+ # SwapPss: 0 kB
23
+ # KernelPageSize: 4 kB
24
+ # MMUPageSize: 4 kB
25
+ # Locked: 0 kB
26
+ # VmFlags: rd ex mr mw me dw sd
27
class Mapping
  # Names of the numeric smaps fields tracked per mapping (values are in kB,
  # as printed by the kernel).
  FIELDS = %w(size rss shared_clean shared_dirty private_clean private_dirty swap pss).freeze

  attr_reader :address_start
  attr_reader :address_end
  attr_reader :perms
  attr_reader :offset
  attr_reader :device_major
  attr_reader :device_minor
  attr_reader :inode
  attr_reader :region

  attr_accessor :size
  attr_accessor :rss
  attr_accessor :shared_clean
  attr_accessor :shared_dirty
  attr_accessor :private_dirty
  attr_accessor :private_clean
  attr_accessor :swap
  attr_accessor :pss

  # Builds a mapping from the lines of a single smaps entry.
  #
  # lines - Array of Strings; the first element is the header line
  #         ("start-end perms offset major:minor inode [path]") and the
  #         remaining elements are "Name: value kB" field lines.
  #         The header line is removed from the array (via #shift).
  def initialize(lines)
    # Every tracked field starts at zero so absent fields still report a number.
    FIELDS.each { |field| send("#{field}=", 0) }

    parse_first_line(lines.shift)
    lines.each { |field_line| parse_field_line(field_line) }
  end

  # Parses the header line of an smaps entry and stores the address range,
  # permissions, offset, device numbers, inode and region name.
  # A header without a path is reported as the "anonymous" region.
  def parse_first_line(line)
    # String#split without arguments already discards surrounding whitespace,
    # so the input string is never mutated (it may be frozen).
    parts = line.split

    @address_start, @address_end = parts[0].split("-")
    @perms = parts[1]
    @offset = parts[2]
    @device_major, @device_minor = parts[3].split(":")
    @inode = parts[4]
    @region = parts[5] || "anonymous"
  end

  # Parses one "Name: value kB" line and stores the value when the name
  # corresponds to one of this object's writable attributes. Blank lines and
  # unknown fields (e.g. VmFlags) are ignored.
  #
  # Raises ArgumentError/TypeError (from Kernel#Float) when a known field
  # carries a missing or non-numeric value.
  def parse_field_line(line)
    parts = line.split

    # split returns an empty array (never nil) for blank input.
    return if parts.empty?

    field = parts[0].downcase.sub(":", "")

    return unless respond_to?("#{field}=")

    send("#{field}=", Float(parts[1]).to_i)
  end
end
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,40 @@
1
+ module GitLab
2
+ module Monitor
3
+ # A class to combine multiple probers into one
4
class Prober
  # prober_opts - Hash keyed by probe name. Each value is a Hash read for:
  #               :class_name (optional, "::"-separated name resolved under
  #               GitLab::Monitor), :opts (one options object or an Array of
  #               them) and :methods (probe method names to invoke).
  #               The resolved class is stored back into each value as :class.
  # metrics     - Shared metrics container handed to every prober instance.
  def initialize(prober_opts, metrics: PrometheusMetrics.new)
    @prober_opts = prober_opts
    @metrics = metrics

    resolve_prober_classes
  end

  # Instantiates each configured prober once per options entry and invokes
  # every configured probe method on it.
  def probe_all
    @prober_opts.each do |_probe_name, params|
      Utils.wrap_in_array(params[:opts]).each do |opts|
        prober = params[:class].new(opts, metrics: @metrics)

        # Method names come from configuration, so only the prober's public
        # interface may be invoked. Array() tolerates a missing or single
        # :methods entry.
        Array(params[:methods]).each { |meth| prober.public_send(meth) }
      end
    end
  end

  # Writes the collected metrics, rendered via #to_s, to the given target.
  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  # Resolves every probe's class (explicit :class_name, or "<probe_name>_prober"
  # camel-cased by Utils) relative to GitLab::Monitor and memoizes it in the
  # probe's params under :class.
  def resolve_prober_classes
    @prober_opts.each do |probe_name, params|
      prober_class_name = params[:class_name] || Utils.camel_case_string("#{probe_name}_prober")

      params[:class] = prober_class_name.split("::").reduce(GitLab::Monitor) do |scope, const_name|
        scope.const_get(const_name)
      end
    end
  end
end
39
+ end
40
+ end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "memstats"
4
+
5
+ module GitLab
6
+ module Monitor
7
+ # A helper class to extract memory info from /proc/<pid>/status
8
+ #
9
+
10
+ # A helper class to extract stats from /proc/<pid>/stat
11
+ #
12
+ # See: man 5 proc
13
+ #
14
+ # It takes a pid
15
class ProcessStats
  # pid - the process id whose /proc/<pid>/stat file is read on construction.
  def initialize(pid)
    @pid = pid
    @user_hertz = retrieve_user_hertz
    @stats = populate_info
  end

  # Returns true when the stat file could be read. The other readers must
  # only be called on a valid instance.
  def valid?
    !@stats.nil?
  end

  # utime (field 14) plus stime (field 15), converted from clock ticks to
  # seconds.
  def cpu_time
    (@stats[14].to_i + @stats[15].to_i) / @user_hertz
  end

  # starttime (field 22) converted from clock ticks to seconds.
  # NOTE(review): per proc(5) this is measured from system boot, not from the
  # Unix epoch - confirm that consumers expect that.
  def start_time
    @stats[22].to_i / @user_hertz
  end

  def vsize
    # Virtual memory size in bytes.
    @stats[23].to_i
  end

  def rss
    # Resident Set Size: number of pages the process has in real memory.
    # NOTE(review): assumes 4 KiB pages; kernels configured with larger pages
    # would be under-reported.
    @stats[24].to_i * 4096
  end

  private

  # Reads and parses the stat file. Returns nil when the process does not
  # exist (ENOENT) or vanished while being read (ESRCH).
  def populate_info
    parse_stat(File.read("/proc/#{@pid}/stat"))
  rescue Errno::ENOENT, Errno::ESRCH
    nil
  end

  # Splits the content of a stat file into fields. The result is padded by
  # one leading element so indexes match the field numbers in the man page.
  #
  # The command name (field 2) is wrapped in parentheses and may itself
  # contain spaces or parentheses, so it is located via the first "(" and the
  # last ")" instead of being split on whitespace; otherwise every following
  # field would be shifted.
  def parse_stat(raw)
    comm_open = raw.index("(")
    comm_close = raw.rindex(")")

    # Unexpected layout: fall back to plain whitespace splitting.
    return [""].concat(raw.split(" ")) unless comm_open && comm_close && comm_open < comm_close

    pid_field = raw[0...comm_open].strip
    comm_field = raw[comm_open..comm_close]
    remaining = raw[(comm_close + 1)..-1].split(" ")

    ["", pid_field, comm_field].concat(remaining)
  end

  # Clock ticks per second used to convert tick counters into seconds;
  # falls back to 100 when the clock resolution cannot be queried.
  def retrieve_user_hertz
    Process.clock_getres(:TIMES_BASED_CLOCK_PROCESS_CPUTIME_ID, :hertz)
  rescue Errno::EINVAL
    100.0
  end
end
59
+
60
+ # Probes a process for info then writes metrics to a target
61
class ProcessProber
  # options - Hash read for:
  #           :name           - process name used (downcased) as the "name" label
  #           :pid_or_pattern - a numeric pid (String or Integer), or a pattern
  #                             passed to Utils.pgrep to find matching pids
  #           :quantiles      - when truthy, values are recorded as quantile
  #                             samples and the per-pid label is omitted
  # metrics - metrics container receiving the samples.
  def initialize(options, metrics: PrometheusMetrics.new)
    @metrics = metrics
    @name = options[:name]

    pid_or_pattern = options[:pid_or_pattern]
    # to_s lets an Integer pid (e.g. an unquoted number in the config) be
    # recognised; \A and \z anchor the whole value, not just one line of it.
    @pids = if pid_or_pattern.to_s =~ /\A\d+\z/
              [pid_or_pattern]
            else
              Utils.pgrep(pid_or_pattern)
            end
    @use_quantiles = options.fetch(:quantiles, false)
  end

  # Records CPU time, resident/virtual memory and start time for every pid
  # whose stat information could be read. Returns self.
  def probe_stat
    @pids.each do |pid|
      stats = ProcessStats.new(pid)
      next unless stats.valid?

      labels = labels_for(pid)

      @metrics.add("process_cpu_seconds_total", stats.cpu_time, @use_quantiles, **labels)
      @metrics.add("process_resident_memory_bytes", stats.rss, @use_quantiles, **labels)
      @metrics.add("process_virtual_memory_bytes", stats.vsize, @use_quantiles, **labels)
      @metrics.add("process_start_time_seconds", stats.start_time, @use_quantiles, **labels)
    end

    self
  end

  # Records how many pids were selected for this prober. Returns self.
  def probe_count
    @metrics.add("process_count", @pids.count, name: @name.downcase)

    self
  end

  # Records the aggregated smaps totals of every pid, one metric per
  # MemStats::Mapping field, multiplied by 1024 to report bytes.
  # Returns self.
  def probe_smaps
    @pids.each do |pid|
      stats = ::GitLab::Monitor::MemStats::Aggregator.new(pid)

      next unless stats.valid?

      labels = labels_for(pid)

      ::GitLab::Monitor::MemStats::Mapping::FIELDS.each do |field|
        value = stats.totals[field]
        next unless value >= 0

        @metrics.add("process_smaps_#{field}_bytes", value * 1024, @use_quantiles, **labels)
      end
    end

    self
  end

  # Writes the collected metrics, rendered via #to_s, to the given target.
  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  # Labels attached to per-process samples. The pid label is left out in
  # quantile mode so samples from all pids land in the same series.
  def labels_for(pid)
    labels = { name: @name.downcase }
    labels[:pid] = pid unless @use_quantiles
    labels
  end
end
121
+ end
122
+ end
@@ -0,0 +1,64 @@
1
+ require "quantile"
2
+
3
+ module GitLab
4
+ module Monitor
5
+ # Prometheus metrics container
6
+ #
7
+ # Provides a simple API to `add` metrics and then turn them `to_s` which will just
8
+ # dump all the metrics in prometheus format
9
+ #
10
+ # The add method also can take any arbitrary amount of labels in a `key: value` format.
11
class PrometheusMetrics
  # include_timestamp - when true, every rendered sample line ends with the
  #                     millisecond timestamp taken when the sample was added.
  def initialize(include_timestamp: true)
    @metrics = Hash.new { |h, k| h[k] = [] }
    @quantiles = Hash.new { |h, k| h[k] = [] }
    @include_timestamp = include_timestamp
  end

  # Records one sample.
  #
  # name     - metric name
  # value    - sample value
  # quantile - when truthy, the value is kept as an observation for the
  #            (name, labels) series and only its quantiles are rendered
  # labels   - arbitrary `key: value` labels
  #
  # Returns self so calls can be chained.
  def add(name, value, quantile = false, **labels)
    if quantile
      @quantiles[{ name: name, labels: labels }] << value
    else
      @metrics[name] << { value: value, labels: labels, timestamp: current_timestamp }
    end

    self
  end

  # Renders all samples, one per line, as
  # `name{label="value",...} value [timestamp]`.
  #
  # Quantile series are computed on every call from the stored observations
  # and are not written back into the container, so rendering twice yields
  # the same set of lines.
  def to_s
    merged_measurements.flat_map do |name, measurements|
      measurements.map { |measurement| format_line(name, measurement) }
    end.join
  end

  private

  def current_timestamp
    (Time.now.to_f * 1000).to_i
  end

  # Plain samples followed by the quantile samples, grouped by metric name in
  # first-seen order.
  def merged_measurements
    merged = Hash.new { |h, k| h[k] = [] }

    @metrics.each { |name, measurements| merged[name].concat(measurements) }
    quantile_measurements.each { |name, measurement| merged[name] << measurement }

    merged
  end

  # Builds [name, measurement] pairs, one per estimator invariant, for every
  # quantile series. The stored label hash is copied (merge) rather than
  # modified, because it is part of a Hash key.
  def quantile_measurements
    @quantiles.flat_map do |data, observations|
      estimator = Quantile::Estimator.new
      observations.each { |value| estimator.observe(value) }

      estimator.invariants.map do |invariant|
        labels = data[:labels].merge(quantile: "#{(invariant.quantile * 100).to_i}th")
        sample = { value: estimator.query(invariant.quantile), labels: labels, timestamp: current_timestamp }

        [data[:name], sample]
      end
    end
  end

  def format_line(name, measurement)
    labels = (measurement[:labels] || {}).map do |label, value|
      "#{label}=\"#{escape_label_value(value)}\""
    end.join(",")

    series = labels.empty? ? name.to_s : "#{name}{#{labels}}"
    fields = [series, measurement[:value]]
    fields << measurement[:timestamp] if @include_timestamp

    "#{fields.join(' ')}\n"
  end

  # Escapes backslash, double quote and line feed, as the Prometheus text
  # exposition format requires inside label values.
  def escape_label_value(value)
    value.to_s.gsub(/[\\"\n]/) { |char| char == "\n" ? "\\n" : "\\#{char}" }
  end
end
63
+ end
64
+ end
@@ -0,0 +1,149 @@
1
+ require "sidekiq/api"
2
+ require "digest"
3
+
4
+ module GitLab
5
+ module Monitor
6
+ # A prober for Sidekiq queues
7
+ #
8
+ # It takes the Redis URL Sidekiq is connected to
9
class SidekiqProber
  # Lua script evaluated in Redis for each queue, and its SHA1 as used by EVALSHA.
  QUEUE_JOB_STATS_SCRIPT = File.read(File.expand_path("#{__FILE__}/../sidekiq_queue_job_stats.lua")).freeze
  QUEUE_JOB_STATS_SHA = Digest::SHA1.hexdigest(QUEUE_JOB_STATS_SCRIPT).freeze

  # opts    - Hash read for :redis_url and (optionally) :redis_enable_client.
  # metrics - metrics container receiving the samples.
  #
  # Configures the Sidekiq client connection and, when Redis is reachable,
  # makes sure the job stats script is loaded.
  def initialize(opts, metrics: PrometheusMetrics.new)
    @opts = opts
    @metrics = metrics

    Sidekiq.configure_client do |config|
      config.redis = redis_options
    end

    ensure_queue_job_stats_script_loaded
  end

  # Records size, latency and paused state of every queue. Returns self.
  def probe_queues
    return self unless connected?

    Sidekiq::Queue.all.each do |queue|
      @metrics.add("sidekiq_queue_size", queue.size, name: queue.name)
      @metrics.add("sidekiq_queue_latency_seconds", queue.latency, name: queue.name)
      @metrics.add("sidekiq_queue_paused", queue.paused? ? 1 : 0, name: queue.name)
    end

    self
  end

  # Records the number of enqueued jobs per job class, summed over all
  # queues. Nothing is recorded when the stats script fails on any queue.
  # Returns self.
  def probe_jobs
    return self unless connected?

    job_stats = collect_job_stats
    return self unless job_stats

    job_stats.each do |class_name, count|
      @metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
    end

    self
  end

  # Records the number of currently running jobs per job class. Returns self.
  def probe_workers
    return self unless connected?

    worker_stats = Hash.new(0)

    Sidekiq::Workers.new.each do |_pid, _tid, work|
      worker_stats[work["payload"]["class"]] += 1
    end

    worker_stats.each do |class_name, count|
      @metrics.add("sidekiq_running_jobs", count, name: class_name)
    end

    self
  end

  # Records the number of jobs waiting in the retry set per job class.
  # Returns self.
  def probe_retries
    return self unless connected?

    retry_stats = Hash.new(0)

    Sidekiq::RetrySet.new.each do |job|
      retry_stats[job.klass] += 1
    end

    retry_stats.each do |class_name, count|
      @metrics.add("sidekiq_to_be_retried_jobs", count, name: class_name)
    end

    self
  end

  # Records the size of the dead set. Returns self.
  def probe_dead
    return self unless connected?

    @metrics.add("sidekiq_dead_jobs", Sidekiq::Stats.new.dead_size)

    self
  end

  # Writes the collected metrics, rendered via #to_s, to the given target.
  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  # Runs the job stats script against every queue and returns a Hash of
  # job class => enqueued count, or nil when Redis rejects the script.
  def collect_job_stats
    Sidekiq::Queue.all.each_with_object({}) do |queue, totals|
      Sidekiq.redis do |conn|
        stats = conn.evalsha(QUEUE_JOB_STATS_SHA, ["queue:#{queue.name}"])

        # The same job class can be enqueued on several queues; add the
        # counts up instead of letting the last queue win.
        totals.merge!(stats.to_h) { |_class_name, seen, added| seen + added }
      end
    end
  rescue Redis::CommandError # Could happen if the script exceeded the maximum run time (5 seconds by default)
    # FIXME: Should we call SCRIPT KILL?
    nil
  end

  def redis_options
    options = {
      url: @opts[:redis_url],
      namespace: "resque:gitlab",
      connect_timeout: 1,
      reconnect_attempts: 0
    }

    options[:id] = nil unless redis_enable_client?
    options
  end

  # Defaults to true when :redis_enable_client is not configured.
  def redis_enable_client?
    return true if @opts[:redis_enable_client].nil?

    @opts[:redis_enable_client]
  end

  # Checks once whether Redis answers a simple command. Both outcomes are
  # remembered, so an unreachable server costs a single connection attempt
  # per prober instance instead of one per probe.
  def connected?
    return @connected if defined?(@connected)

    @connected = begin
      Sidekiq.redis do |conn|
        conn.get("foo")
      end
      true
    rescue Redis::CannotConnectError, Redis::TimeoutError
      # Maybe we're trying to connect to a slave
      false
    end
  end

  def ensure_queue_job_stats_script_loaded
    return unless connected?

    Sidekiq.redis do |conn|
      # Using administrative commands on conn directly (which is a Redis::Namespace)
      # will be removed in redis-namespace 2.0.
      next if conn.redis.script(:exists, QUEUE_JOB_STATS_SHA)

      conn.redis.script(:load, QUEUE_JOB_STATS_SCRIPT)
    end
  end
end
148
+ end
149
+ end