gitlab-monitor 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -0
  3. data/.gitlab-ci.yml +18 -0
  4. data/.rubocop.yml +34 -0
  5. data/CONTRIBUTING.md +651 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +75 -0
  8. data/LICENSE +25 -0
  9. data/README.md +110 -0
  10. data/bin/gitlab-mon +17 -0
  11. data/config/gitlab-monitor.yml.example +112 -0
  12. data/gitlab-monitor.gemspec +33 -0
  13. data/lib/gitlab_monitor.rb +18 -0
  14. data/lib/gitlab_monitor/cli.rb +341 -0
  15. data/lib/gitlab_monitor/database.rb +13 -0
  16. data/lib/gitlab_monitor/database/base.rb +44 -0
  17. data/lib/gitlab_monitor/database/bloat.rb +74 -0
  18. data/lib/gitlab_monitor/database/bloat_btree.sql +84 -0
  19. data/lib/gitlab_monitor/database/bloat_table.sql +63 -0
  20. data/lib/gitlab_monitor/database/ci_builds.rb +527 -0
  21. data/lib/gitlab_monitor/database/remote_mirrors.rb +74 -0
  22. data/lib/gitlab_monitor/database/row_count.rb +164 -0
  23. data/lib/gitlab_monitor/database/tuple_stats.rb +53 -0
  24. data/lib/gitlab_monitor/git.rb +144 -0
  25. data/lib/gitlab_monitor/memstats.rb +98 -0
  26. data/lib/gitlab_monitor/memstats/mapping.rb +91 -0
  27. data/lib/gitlab_monitor/prober.rb +40 -0
  28. data/lib/gitlab_monitor/process.rb +122 -0
  29. data/lib/gitlab_monitor/prometheus.rb +64 -0
  30. data/lib/gitlab_monitor/sidekiq.rb +149 -0
  31. data/lib/gitlab_monitor/sidekiq_queue_job_stats.lua +42 -0
  32. data/lib/gitlab_monitor/util.rb +83 -0
  33. data/lib/gitlab_monitor/version.rb +5 -0
  34. data/lib/gitlab_monitor/web_exporter.rb +77 -0
  35. data/spec/cli_spec.rb +31 -0
  36. data/spec/database/bloat_spec.rb +99 -0
  37. data/spec/database/ci_builds_spec.rb +421 -0
  38. data/spec/database/row_count_spec.rb +37 -0
  39. data/spec/fixtures/smaps/sample.txt +10108 -0
  40. data/spec/git_process_proper_spec.rb +27 -0
  41. data/spec/git_spec.rb +52 -0
  42. data/spec/memstats_spec.rb +28 -0
  43. data/spec/prometheus_metrics_spec.rb +17 -0
  44. data/spec/spec_helper.rb +63 -0
  45. data/spec/util_spec.rb +15 -0
  46. metadata +225 -0
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module GitLab
4
+ module Monitor
5
+ module MemStats
6
# Parses one entry in /proc/[pid]/smaps. For example:
#
#   00400000-00401000 r-xp 00000000 08:01 541055 /opt/gitlab/embedded/bin/ruby
#   Size:                  4 kB
#   Rss:                   4 kB
#   Pss:                   0 kB
#   Shared_Clean:          4 kB
#   Shared_Dirty:          0 kB
#   Private_Clean:         0 kB
#   Private_Dirty:         0 kB
#   Referenced:            4 kB
#   Anonymous:             0 kB
#   AnonHugePages:         0 kB
#   Shared_Hugetlb:        0 kB
#   Private_Hugetlb:       0 kB
#   Swap:                  0 kB
#   SwapPss:               0 kB
#   KernelPageSize:        4 kB
#   MMUPageSize:           4 kB
#   Locked:                0 kB
#   VmFlags: rd ex mr mw me dw sd
#
# The first line fills the read-only header attributes (addresses, perms,
# offset, device, inode, region). Every following "Name: value ..." line whose
# lower-cased name is listed in FIELDS is stored as an Integer; all other
# lines (including blank ones) are ignored.
class Mapping
  # Counters extracted from an entry. Each one is initialised to 0 so a field
  # missing from the input still reads as a number.
  FIELDS = %w(size rss shared_clean shared_dirty private_clean private_dirty swap pss).freeze

  attr_reader :address_start, :address_end, :perms, :offset,
              :device_major, :device_minor, :inode, :region

  attr_accessor :size, :rss, :shared_clean, :shared_dirty,
                :private_dirty, :private_clean, :swap, :pss

  # @param lines [Array<String>] the header line followed by the field lines
  #   of one smaps entry. Neither the array nor its strings are modified.
  def initialize(lines)
    FIELDS.each { |field| public_send("#{field}=", 0) }

    header, *field_lines = lines
    parse_first_line(header)
    field_lines.each { |line| parse_field_line(line) }
  end

  # Parses the header line ("start-end perms offset major:minor inode [path]").
  #
  # The split is limited to six parts so that a path containing spaces
  # (for example one ending in " (deleted)") is kept whole in +region+.
  # Entries without a path get the region "anonymous".
  def parse_first_line(line)
    parts = line.strip.split(" ", 6)

    @address_start, @address_end = parts[0].split("-")
    @perms = parts[1]
    @offset = parts[2]
    @device_major, @device_minor = parts[3].split(":")
    @inode = parts[4]
    @region = parts[5] || "anonymous"
  end

  # Parses one "Name: value unit" line and stores the value when the name is
  # one of FIELDS.
  #
  # @raise [ArgumentError, TypeError] from Float() when a known field carries
  #   a missing or non-numeric value.
  def parse_field_line(line)
    name, value = line.split
    return if name.nil? # blank line: String#split returns [], never nil

    field = name.downcase.sub(":", "")
    return unless FIELDS.include?(field)

    public_send("#{field}=", Float(value).to_i)
  end
end
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,40 @@
1
+ module GitLab
2
+ module Monitor
3
# Combines several probers behind one object that shares a single metrics
# container.
#
# +prober_opts+ maps a probe name to a params hash. This class reads
# params[:class_name] (optional), params[:opts] and params[:methods], and it
# writes the resolved prober class back into params[:class].
class Prober
  def initialize(prober_opts, metrics: PrometheusMetrics.new)
    @prober_opts = prober_opts
    @metrics = metrics

    resolve_prober_classes
  end

  # For every configured probe, builds one prober instance per options entry
  # and invokes each of the configured methods on it.
  def probe_all
    @prober_opts.each_value do |config|
      Utils.wrap_in_array(config[:opts]).each do |probe_options|
        instance = config[:class].new(probe_options, metrics: @metrics)
        config[:methods].each { |method_name| instance.send(method_name) }
      end
    end
  end

  # Writes the string form of the collected metrics to +target+.
  def write_to(target)
    rendered = @metrics.to_s
    target.write(rendered)
  end

  private

  # Stores the prober class for each probe under params[:class]. The class
  # name is params[:class_name] when given, otherwise it is derived from the
  # probe name; it is looked up relative to GitLab::Monitor.
  def resolve_prober_classes
    @prober_opts.each do |probe_name, config|
      qualified_name = config[:class_name] || Utils.camel_case_string("#{probe_name}_prober")

      config[:class] = qualified_name.split("::").inject(GitLab::Monitor) do |namespace, const_name|
        namespace.const_get(const_name)
      end
    end
  end
end
39
+ end
40
+ end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "memstats"
4
+
5
+ module GitLab
6
+ module Monitor
7
# A helper class to read process statistics from /proc/<pid>/stat
#
# See: man 5 proc
#
# It takes a pid. The parsed fields are kept in an array padded by one
# element so that indexes match the field numbers in the man page
# (14 utime, 15 stime, 22 starttime, 23 vsize, 24 rss).
class ProcessStats
  # Page size used when the system value cannot be queried.
  FALLBACK_PAGE_SIZE = 4096

  def initialize(pid)
    @pid = pid
    @user_hertz = retrieve_user_hertz
    @stats = populate_info
  end

  # False when the stat file could not be read (e.g. the process is gone).
  # The other readers must only be called when this returns true.
  def valid?
    !@stats.nil?
  end

  # Fields 14 + 15 divided by the clock tick rate.
  def cpu_time
    (@stats[14].to_i + @stats[15].to_i) / @user_hertz
  end

  # Field 22 divided by the clock tick rate.
  def start_time
    @stats[22].to_i / @user_hertz
  end

  def vsize
    # Virtual memory size in bytes.
    @stats[23].to_i
  end

  def rss
    # Resident Set Size: number of pages the process has in real memory,
    # converted to bytes with the system page size.
    @stats[24].to_i * page_size
  end

  private

  def populate_info
    parse_stat(File.read("/proc/#{@pid}/stat"))
  rescue Errno::ENOENT, Errno::ESRCH
    # ENOENT: no such process; ESRCH: the process exited while being read.
    nil
  end

  # Splits the content of a stat file into fields, padded by one element to
  # make field numbers match the man page.
  #
  # Field 2 (comm) is wrapped in parentheses and may itself contain spaces or
  # parentheses, so it is delimited by the first "(" and the LAST ")" instead
  # of by whitespace; a plain split would shift every later field.
  def parse_stat(raw)
    comm_start = raw.index("(")
    comm_end = raw.rindex(")")

    # No parenthesised comm found: fall back to a plain whitespace split.
    return [""].concat(raw.split(" ")) unless comm_start && comm_end && comm_start < comm_end

    pid_field = raw[0...comm_start].strip
    comm = raw[comm_start..comm_end]
    remainder = raw[(comm_end + 1)..-1].split(" ")

    ["", pid_field, comm].concat(remainder)
  end

  # System page size in bytes, looked up once per instance.
  def page_size
    @page_size ||= begin
      require "etc" # stdlib; loaded lazily because only #rss needs it
      Etc.sysconf(Etc::SC_PAGESIZE) || FALLBACK_PAGE_SIZE
    rescue LoadError, StandardError
      FALLBACK_PAGE_SIZE
    end
  end

  def retrieve_user_hertz
    Process.clock_getres(:TIMES_BASED_CLOCK_PROCESS_CPUTIME_ID, :hertz)
  rescue Errno::EINVAL
    100.0
  end
end
59
+
60
# Probes a process for info then writes metrics to a target
#
# Options read by this class:
#   :name           - label value for the metrics (lower-cased)
#   :pid_or_pattern - either a pid (all digits) or a pattern handed to
#                     Utils.pgrep to find the pids
#   :quantiles      - when true, values are added as quantile observations
#                     and the per-process :pid label is omitted (default false)
class ProcessProber
  # Whole-string match: a value is a pid only if it consists solely of digits.
  PID_FORMAT = /\A\d+\z/

  def initialize(options, metrics: PrometheusMetrics.new)
    @metrics = metrics
    @name = options[:name]
    @pids = resolve_pids(options[:pid_or_pattern])
    @use_quantiles = options.fetch(:quantiles, false)
  end

  # Adds CPU, memory and start-time metrics for every pid whose stat file
  # could be read.
  def probe_stat
    @pids.each do |pid|
      stats = ProcessStats.new(pid)
      next unless stats.valid?

      labels = labels_for(pid)

      @metrics.add("process_cpu_seconds_total", stats.cpu_time, @use_quantiles, **labels)
      @metrics.add("process_resident_memory_bytes", stats.rss, @use_quantiles, **labels)
      @metrics.add("process_virtual_memory_bytes", stats.vsize, @use_quantiles, **labels)
      @metrics.add("process_start_time_seconds", stats.start_time, @use_quantiles, **labels)
    end

    self
  end

  # Adds the number of matched pids.
  def probe_count
    @metrics.add("process_count", @pids.count, name: @name.downcase)

    self
  end

  # Adds one metric per smaps field for every pid with readable stats.
  # Totals are multiplied by 1024 to report them as bytes.
  def probe_smaps
    @pids.each do |pid|
      stats = ::GitLab::Monitor::MemStats::Aggregator.new(pid)

      next unless stats.valid?

      labels = labels_for(pid)

      ::GitLab::Monitor::MemStats::Mapping::FIELDS.each do |field|
        value = stats.totals[field]

        if value >= 0
          @metrics.add("process_smaps_#{field}_bytes", value * 1024, @use_quantiles, **labels)
        end
      end
    end

    self
  end

  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  # Returns the list of pids to probe. The value is converted with to_s before
  # matching so that a pid configured as an Integer is recognised as a pid
  # too (Integer does not implement =~ on current Rubies).
  def resolve_pids(pid_or_pattern)
    return [pid_or_pattern] if pid_or_pattern.to_s =~ PID_FORMAT

    Utils.pgrep(pid_or_pattern)
  end

  # Labels shared by the per-process metrics of +pid+.
  def labels_for(pid)
    labels = { name: @name.downcase }
    labels[:pid] = pid unless @use_quantiles
    labels
  end
end
121
+ end
122
+ end
@@ -0,0 +1,64 @@
1
+ require "quantile"
2
+
3
+ module GitLab
4
+ module Monitor
5
# Prometheus metrics container
#
# Provides a simple API to `add` metrics and then turn them `to_s` which will just
# dump all the metrics in prometheus format
#
# The add method also can take any arbitrary amount of labels in a `key: value` format.
#
# Values added with `quantile` set are not emitted directly: on `to_s` they
# are fed to a Quantile::Estimator per (name, labels) series and one line per
# estimator invariant is emitted with an extra `quantile` label.
class PrometheusMetrics
  # Characters that must be backslash-escaped inside a label value.
  LABEL_VALUE_ESCAPES = { "\\" => "\\\\", "\"" => "\\\"", "\n" => "\\n" }.freeze

  def initialize(include_timestamp: true)
    @metrics = Hash.new { |h, k| h[k] = [] }
    @quantiles = Hash.new { |h, k| h[k] = [] }
    @include_timestamp = include_timestamp
  end

  # Records one value.
  #
  # @param name [String, Symbol] metric name
  # @param value [Numeric] measured value
  # @param quantile [Boolean] when true the value is an observation for a
  #   quantile series instead of a line of its own
  # @param labels [Hash] label names and values
  # @return [PrometheusMetrics] self, so calls can be chained
  def add(name, value, quantile = false, **labels)
    if quantile
      @quantiles[{ name: name, labels: labels }] << value
    else
      @metrics[name] << build_measurement(value, labels)
    end

    self
  end

  # Renders all metrics, one line each. Rendering does not change the
  # container, so calling it repeatedly never duplicates quantile lines.
  def to_s
    buffer = +""
    all_measurements.each do |name, measurements|
      measurements.each do |measurement|
        buffer << format_line(name, measurement)
      end
    end
    buffer
  end

  private

  def build_measurement(value, labels)
    { value: value, labels: labels, timestamp: (Time.now.to_f * 1000).to_i }
  end

  # Plain measurements followed by the quantile-derived ones, grouped by
  # metric name. Built on copies so @metrics itself is left untouched.
  def all_measurements
    combined = {}
    @metrics.each { |name, measurements| combined[name] = measurements.dup }

    quantile_measurements.each do |name, measurement|
      (combined[name] ||= []) << measurement
    end

    combined
  end

  # One [name, measurement] pair per series and estimator invariant. Labels
  # are merged into a new hash: the original hash is part of a key of
  # @quantiles and must not be mutated.
  def quantile_measurements
    @quantiles.flat_map do |series, values|
      estimator = Quantile::Estimator.new
      values.each { |value| estimator.observe(value) }

      estimator.invariants.map do |invariant|
        labels = series[:labels].merge(quantile: "#{(invariant.quantile * 100).to_i}th")

        [series[:name], build_measurement(estimator.query(invariant.quantile), labels)]
      end
    end
  end

  def format_line(name, measurement)
    labels = (measurement[:labels] || {}).map do |label, value|
      "#{label}=\"#{escape_label_value(value)}\""
    end.join(",")

    line = name.to_s.dup
    line << "{#{labels}}" unless labels.empty?
    line << " #{measurement[:value]}"
    line << " #{measurement[:timestamp]}" if @include_timestamp
    line << "\n"
  end

  # Escapes backslash, double quote and newline so the value stays inside
  # its quotes in the rendered line.
  def escape_label_value(value)
    value.to_s.gsub(/[\\"\n]/, LABEL_VALUE_ESCAPES)
  end
end
63
+ end
64
+ end
@@ -0,0 +1,149 @@
1
+ require "sidekiq/api"
2
+ require "digest"
3
+
4
+ module GitLab
5
+ module Monitor
6
# A prober for Sidekiq queues
#
# It takes the Redis URL Sidekiq is connected to
#
# Options read by this class:
#   :redis_url           - URL passed to the Sidekiq client's Redis config
#   :redis_enable_client - when explicitly false, the Redis :id option is set
#                          to nil; treated as true when absent
#
# Every probe_* method returns self and does nothing when Redis cannot be
# reached.
class SidekiqProber
  QUEUE_JOB_STATS_SCRIPT = File.read(File.expand_path("#{__FILE__}/../sidekiq_queue_job_stats.lua")).freeze
  QUEUE_JOB_STATS_SHA = Digest::SHA1.hexdigest(QUEUE_JOB_STATS_SCRIPT).freeze

  def initialize(opts, metrics: PrometheusMetrics.new)
    @opts = opts
    @metrics = metrics

    # NOTE: this configures the process-wide Sidekiq client.
    Sidekiq.configure_client do |config|
      config.redis = redis_options
    end

    ensure_queue_job_stats_script_loaded
  end

  # Adds size, latency and paused state for every queue.
  def probe_queues
    return self unless connected?

    Sidekiq::Queue.all.each do |queue|
      @metrics.add("sidekiq_queue_size", queue.size, name: queue.name)
      @metrics.add("sidekiq_queue_latency_seconds", queue.latency, name: queue.name)
      @metrics.add("sidekiq_queue_paused", queue.paused? ? 1 : 0, name: queue.name)
    end

    self
  end

  # Adds the number of enqueued jobs per job class, as reported by the
  # queue-job-stats script for each queue. No metrics are added at all if
  # the script fails for any queue.
  def probe_jobs # rubocop:disable Metrics/MethodLength
    return self unless connected?

    job_stats = {}

    Sidekiq::Queue.all.each do |queue|
      begin
        Sidekiq.redis do |conn|
          stats = conn.evalsha(QUEUE_JOB_STATS_SHA, ["queue:#{queue.name}"])
          # A job class can be enqueued in more than one queue: add the
          # counts up instead of letting the last queue overwrite the others.
          job_stats.merge!(stats.to_h) { |_class_name, total, count| total.to_i + count.to_i }
        end
      rescue Redis::CommandError # Could happen if the script exceeded the maximum run time (5 seconds by default)
        # FIXME: Should we call SCRIPT KILL?
        return self
      end
    end

    job_stats.each do |class_name, count|
      @metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
    end

    self
  end

  # Adds the number of currently running jobs per job class.
  def probe_workers
    return self unless connected?

    worker_stats = Hash.new(0)

    Sidekiq::Workers.new.each do |_pid, _tid, work|
      worker_stats[work["payload"]["class"]] += 1
    end

    worker_stats.each do |class_name, count|
      @metrics.add("sidekiq_running_jobs", count, name: class_name)
    end

    self
  end

  # Adds the number of jobs in the retry set per job class.
  def probe_retries
    return self unless connected?

    retry_stats = Hash.new(0)

    Sidekiq::RetrySet.new.each do |job|
      retry_stats[job.klass] += 1
    end

    retry_stats.each do |class_name, count|
      @metrics.add("sidekiq_to_be_retried_jobs", count, name: class_name)
    end

    self
  end

  # Adds the size of the dead set.
  def probe_dead
    return self unless connected?

    @metrics.add("sidekiq_dead_jobs", Sidekiq::Stats.new.dead_size)

    self
  end

  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  def redis_options
    options = {
      url: @opts[:redis_url],
      namespace: "resque:gitlab",
      connect_timeout: 1,
      reconnect_attempts: 0
    }

    options[:id] = nil unless redis_enable_client?
    options
  end

  def redis_enable_client?
    return true if @opts[:redis_enable_client].nil?

    @opts[:redis_enable_client]
  end

  # True once a test command has succeeded; the success is remembered, a
  # failure is not, so a later call tries again.
  def connected?
    @connected ||= begin
      Sidekiq.redis do |conn|
        conn.get("foo")
      end
      true
    end
  rescue Redis::CannotConnectError, Redis::TimeoutError
    # Maybe we're trying connecting to a slave
    false
  end

  # Loads the queue-job-stats script into Redis unless it is already cached
  # there under QUEUE_JOB_STATS_SHA.
  def ensure_queue_job_stats_script_loaded
    return unless connected?

    Sidekiq.redis do |conn|
      # Using administrative commands on conn directly (which is a Redis::Namespace)
      # will be removed in redis-namespace 2.0.
      next if conn.redis.script(:exists, QUEUE_JOB_STATS_SHA)
      conn.redis.script(:load, QUEUE_JOB_STATS_SCRIPT)
    end
  end
end
148
+ end
149
+ end