gitlab-monitor 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +2 -0
- data/.gitlab-ci.yml +18 -0
- data/.rubocop.yml +34 -0
- data/CONTRIBUTING.md +651 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +75 -0
- data/LICENSE +25 -0
- data/README.md +110 -0
- data/bin/gitlab-mon +17 -0
- data/config/gitlab-monitor.yml.example +112 -0
- data/gitlab-monitor.gemspec +33 -0
- data/lib/gitlab_monitor.rb +18 -0
- data/lib/gitlab_monitor/cli.rb +341 -0
- data/lib/gitlab_monitor/database.rb +13 -0
- data/lib/gitlab_monitor/database/base.rb +44 -0
- data/lib/gitlab_monitor/database/bloat.rb +74 -0
- data/lib/gitlab_monitor/database/bloat_btree.sql +84 -0
- data/lib/gitlab_monitor/database/bloat_table.sql +63 -0
- data/lib/gitlab_monitor/database/ci_builds.rb +527 -0
- data/lib/gitlab_monitor/database/remote_mirrors.rb +74 -0
- data/lib/gitlab_monitor/database/row_count.rb +164 -0
- data/lib/gitlab_monitor/database/tuple_stats.rb +53 -0
- data/lib/gitlab_monitor/git.rb +144 -0
- data/lib/gitlab_monitor/memstats.rb +98 -0
- data/lib/gitlab_monitor/memstats/mapping.rb +91 -0
- data/lib/gitlab_monitor/prober.rb +40 -0
- data/lib/gitlab_monitor/process.rb +122 -0
- data/lib/gitlab_monitor/prometheus.rb +64 -0
- data/lib/gitlab_monitor/sidekiq.rb +149 -0
- data/lib/gitlab_monitor/sidekiq_queue_job_stats.lua +42 -0
- data/lib/gitlab_monitor/util.rb +83 -0
- data/lib/gitlab_monitor/version.rb +5 -0
- data/lib/gitlab_monitor/web_exporter.rb +77 -0
- data/spec/cli_spec.rb +31 -0
- data/spec/database/bloat_spec.rb +99 -0
- data/spec/database/ci_builds_spec.rb +421 -0
- data/spec/database/row_count_spec.rb +37 -0
- data/spec/fixtures/smaps/sample.txt +10108 -0
- data/spec/git_process_proper_spec.rb +27 -0
- data/spec/git_spec.rb +52 -0
- data/spec/memstats_spec.rb +28 -0
- data/spec/prometheus_metrics_spec.rb +17 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/util_spec.rb +15 -0
- metadata +225 -0
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module GitLab
|
4
|
+
module Monitor
|
5
|
+
module MemStats
|
6
|
+
# Parses one entry in /proc/[pid]/smaps. For example:
#
# 00400000-00401000 r-xp 00000000 08:01 541055 /opt/gitlab/embedded/bin/ruby
# Size:                  4 kB
# Rss:                   4 kB
# Pss:                   0 kB
# Shared_Clean:          4 kB
# Shared_Dirty:          0 kB
# Private_Clean:         0 kB
# Private_Dirty:         0 kB
# Referenced:            4 kB
# Anonymous:             0 kB
# AnonHugePages:         0 kB
# Shared_Hugetlb:        0 kB
# Private_Hugetlb:       0 kB
# Swap:                  0 kB
# SwapPss:               0 kB
# KernelPageSize:        4 kB
# MMUPageSize:           4 kB
# Locked:                0 kB
# VmFlags: rd ex mr mw me dw sd
#
# Only the counters named in FIELDS are kept; every other field line
# (Referenced, VmFlags, ...) is ignored. Values are stored as the integer
# that appears in the file, i.e. without applying the "kB" unit.
class Mapping
  FIELDS = %w(size rss shared_clean shared_dirty private_clean private_dirty swap pss).freeze

  attr_reader :address_start
  attr_reader :address_end
  attr_reader :perms
  attr_reader :offset
  attr_reader :device_major
  attr_reader :device_minor
  attr_reader :inode
  attr_reader :region

  attr_accessor :size
  attr_accessor :rss
  attr_accessor :shared_clean
  attr_accessor :shared_dirty
  attr_accessor :private_dirty
  attr_accessor :private_clean
  attr_accessor :swap
  attr_accessor :pss

  # lines - Array of Strings: the mapping header line followed by its
  #         "Name: value kB" field lines.
  #
  # The array and its strings are only read, never modified, so frozen
  # input is accepted.
  def initialize(lines)
    # Every counter starts at zero so fields missing from the input still read as 0.
    FIELDS.each { |field| send("#{field}=", 0) }

    header, *field_lines = lines

    parse_first_line(header)
    field_lines.each { |line| parse_field_line(line) }
  end

  # Parses the header line:
  #   "<start>-<end> <perms> <offset> <major>:<minor> <inode> [<path>]"
  #
  # All parts are kept as Strings. When no path column is present the region
  # is reported as "anonymous". The line is split on whitespace, so a path
  # that itself contains whitespace is cut at its first space.
  def parse_first_line(line)
    parts = line.split

    @address_start, @address_end = parts[0].split("-")
    @perms = parts[1]
    @offset = parts[2]
    @device_major, @device_minor = parts[3].split(":")
    @inode = parts[4]
    @region = parts[5] || "anonymous"
  end

  # Parses one "Name:   <number> kB" line and stores the number in the
  # accessor whose name is the downcased field name.
  #
  # Blank lines and fields without a matching writer are skipped.
  # Raises ArgumentError/TypeError (from Kernel#Float) when a known field
  # carries a missing or non-numeric value.
  def parse_field_line(line)
    name, raw_value = line.split

    # String#split yields [] for a blank line, so there is no name to look at.
    return unless name

    field = name.downcase.sub(":", "")

    return unless respond_to?("#{field}=")

    send("#{field}=", Float(raw_value).to_i)
  end
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module GitLab
|
2
|
+
module Monitor
|
3
|
+
# A class to combine multiple probers into one
#
# prober_opts maps a probe name to a params Hash read with these keys:
#   :class_name - optional constant path below GitLab::Monitor; when absent
#                 it is derived from the probe name via
#                 Utils.camel_case_string("<probe_name>_prober")
#   :opts       - one options object, or several, handed to the prober's constructor
#   :methods    - names of the probe methods to invoke on every prober instance
#
# The resolved class is written back into each params Hash under :class.
class Prober
  def initialize(prober_opts, metrics: PrometheusMetrics.new)
    @prober_opts = prober_opts
    @metrics = metrics

    resolve_prober_classes
  end

  # Builds one prober per :opts entry and calls each configured method on it.
  # All probers share the metrics container given to this instance.
  def probe_all
    @prober_opts.each_value do |params|
      Utils.wrap_in_array(params[:opts]).each do |opts|
        prober = params[:class].new(opts, metrics: @metrics)

        # Method names come from configuration: public_send keeps private and
        # Kernel methods out of reach (plain send would dispatch to them).
        params[:methods].each { |meth| prober.public_send(meth) }
      end
    end
  end

  # Writes the string form of the collected metrics to target
  # (any object responding to #write).
  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  # Looks up each prober class, walking "A::B" style names one constant at a
  # time starting from GitLab::Monitor, and stores it under params[:class].
  def resolve_prober_classes
    @prober_opts.each do |probe_name, params|
      prober_class_name = params[:class_name] || Utils.camel_case_string("#{probe_name}_prober")

      params[:class] = prober_class_name.split("::").reduce(GitLab::Monitor) do |scope, const_name|
        scope.const_get(const_name)
      end
    end
  end
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "etc"

require_relative "memstats"
|
4
|
+
|
5
|
+
module GitLab
|
6
|
+
module Monitor
|
7
|
+
# A helper class to extract process stats from /proc/<pid>/stat
#
# See: man 5 proc
#
# It takes a pid
class ProcessStats
  # Used only when the system page size cannot be determined.
  DEFAULT_PAGE_SIZE = 4096

  def initialize(pid)
    @pid = pid
    @user_hertz = retrieve_user_hertz
    @page_size = retrieve_page_size
    @stats = populate_info
  end

  # False when the stat file could not be read (e.g. the process is gone).
  # The accessors below must only be called on a valid instance.
  def valid?
    !@stats.nil?
  end

  # utime (field 14) plus stime (field 15), converted from clock ticks to seconds.
  def cpu_time
    (@stats[14].to_i + @stats[15].to_i) / @user_hertz
  end

  # starttime (field 22) converted from clock ticks to seconds. Per proc(5)
  # this is measured from system boot, not from the Unix epoch.
  def start_time
    @stats[22].to_i / @user_hertz
  end

  def vsize
    # Virtual memory size in bytes.
    @stats[23].to_i
  end

  def rss
    # Resident Set Size: number of pages the process has in real memory,
    # converted to bytes with the system page size.
    @stats[24].to_i * @page_size
  end

  private

  # Returns the stat fields as an Array of Strings, or nil when the process
  # does not exist (anymore).
  def populate_info
    parse_stat(File.read("/proc/#{@pid}/stat"))
  rescue Errno::ENOENT, Errno::ESRCH
    # ESRCH is reported when the process exits while its stat file is being read.
    nil
  end

  # Splits the raw stat line into fields.
  #
  # Field 2 (comm) is wrapped in parentheses and may itself contain spaces
  # or parentheses, so it is delimited by the first " (" and the LAST ")"
  # rather than by whitespace; only the remainder is split on spaces.
  def parse_stat(raw)
    comm_end = raw.rindex(")")

    # Pad the array by one element to make field numbers match the man page.
    return [""].concat(raw.split(" ")) unless comm_end

    pid_field, comm = raw[0...comm_end].split(" (", 2)

    ["", pid_field, "(#{comm})"].concat(raw[(comm_end + 1)..-1].split(" "))
  end

  def retrieve_user_hertz
    Process.clock_getres(:TIMES_BASED_CLOCK_PROCESS_CPUTIME_ID, :hertz)
  rescue Errno::EINVAL
    100.0
  end

  def retrieve_page_size
    Etc.sysconf(Etc::SC_PAGESIZE) || DEFAULT_PAGE_SIZE
  end
end
|
59
|
+
|
60
|
+
# Probes a process for info then writes metrics to a target
#
# Options read from the options Hash:
#   :name           - label value for the "name" label (downcased)
#   :pid_or_pattern - a PID (Integer or all-digit String), or a pattern that
#                     is handed to Utils.pgrep to find the PIDs
#   :quantiles      - when truthy, values are recorded as quantile
#                     observations and the per-process "pid" label is omitted
class ProcessProber
  def initialize(options, metrics: PrometheusMetrics.new)
    @metrics = metrics
    @name = options[:name]
    @pids = resolve_pids(options[:pid_or_pattern])
    @use_quantiles = options.fetch(:quantiles, false)
  end

  # Adds CPU, memory and start-time metrics for every PID whose stats are readable.
  def probe_stat
    @pids.each do |pid|
      stats = ProcessStats.new(pid)
      next unless stats.valid?

      labels = labels_for(pid)

      @metrics.add("process_cpu_seconds_total", stats.cpu_time, @use_quantiles, **labels)
      @metrics.add("process_resident_memory_bytes", stats.rss, @use_quantiles, **labels)
      @metrics.add("process_virtual_memory_bytes", stats.vsize, @use_quantiles, **labels)
      @metrics.add("process_start_time_seconds", stats.start_time, @use_quantiles, **labels)
    end

    self
  end

  # Adds the number of PIDs found for this prober.
  def probe_count
    @metrics.add("process_count", @pids.count, name: @name.downcase)

    self
  end

  # Adds one metric per MemStats::Mapping field for every PID whose smaps
  # totals are available. Totals are multiplied by 1024 before being reported
  # under a "_bytes" metric name.
  def probe_smaps
    @pids.each do |pid|
      stats = ::GitLab::Monitor::MemStats::Aggregator.new(pid)

      next unless stats.valid?

      labels = labels_for(pid)

      ::GitLab::Monitor::MemStats::Mapping::FIELDS.each do |field|
        value = stats.totals[field]

        if value >= 0
          @metrics.add("process_smaps_#{field}_bytes", value * 1024, @use_quantiles, **labels)
        end
      end
    end

    self
  end

  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  # Returns the list of PIDs to probe.
  #
  # The value is stringified before matching so Integer PIDs (as produced by
  # YAML for `pid_or_pattern: 1234`) are recognised, and the match is anchored
  # to the whole string so multi-line input is never mistaken for a PID.
  def resolve_pids(pid_or_pattern)
    return [pid_or_pattern] if pid_or_pattern.to_s =~ /\A\d+\z/

    Utils.pgrep(pid_or_pattern)
  end

  # Labels shared by the per-process metrics. The pid label is left out in
  # quantile mode so observations from all PIDs land in one series.
  def labels_for(pid)
    labels = { name: @name.downcase }
    labels[:pid] = pid unless @use_quantiles
    labels
  end
end
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require "quantile"
|
2
|
+
|
3
|
+
module GitLab
|
4
|
+
module Monitor
|
5
|
+
# Prometheus metrics container
#
# Provides a simple API to `add` metrics and then turn them `to_s` which will just
# dump all the metrics in prometheus format
#
# The add method also can take any arbitrary amount of labels in a `key: value` format.
#
# Values added with the quantile flag are not emitted one by one: they are
# collected per name/labels pair and rendered by `to_s` as one line per
# quantile, carrying an extra `quantile` label.
class PrometheusMetrics
  # Characters that must be backslash-escaped inside a label value of the
  # text exposition format.
  LABEL_VALUE_ESCAPES = { "\\" => "\\\\", "\"" => "\\\"", "\n" => "\\n" }.freeze

  def initialize(include_timestamp: true)
    @metrics = Hash.new { |h, k| h[k] = [] }
    @quantiles = Hash.new { |h, k| h[k] = [] }
    @include_timestamp = include_timestamp
  end

  # Records one value for the metric `name`.
  #
  # quantile - when truthy the value is stored as an observation for
  #            quantile estimation instead of being emitted directly.
  # labels   - arbitrary `key: value` labels attached to the sample.
  #
  # Returns self so calls can be chained.
  def add(name, value, quantile = false, **labels)
    if quantile
      @quantiles[{ name: name, labels: labels }] << value
    else
      @metrics[name] << { value: value, labels: labels, timestamp: current_timestamp }
    end

    self
  end

  # Renders every recorded sample, one per line. Plain samples come first in
  # insertion order, then the quantile estimates.
  #
  # Rendering does not modify the container, so calling it repeatedly (or
  # adding more observations in between) never duplicates quantile lines.
  def to_s
    buffer = +""

    measurements_with_quantiles.each do |name, measurements|
      measurements.each { |measurement| buffer << format_measurement(name, measurement) }
    end

    buffer
  end

  private

  def current_timestamp
    (Time.now.to_f * 1000).to_i
  end

  # Plain samples merged with the freshly computed quantile samples, grouped
  # by metric name. Built on every call; @metrics itself is left untouched.
  def measurements_with_quantiles
    combined = Hash.new { |hash, key| hash[key] = [] }

    @metrics.each { |name, measurements| combined[name].concat(measurements) }
    quantile_measurements.each { |name, measurement| combined[name] << measurement }

    combined
  end

  # One [name, measurement] pair per estimator invariant for every recorded
  # quantile series. Labels are merged into a new Hash because the original
  # Hash is part of a @quantiles key and must not be mutated.
  def quantile_measurements
    @quantiles.flat_map do |series, observations|
      estimator = Quantile::Estimator.new
      observations.each { |value| estimator.observe(value) }

      estimator.invariants.map do |invariant|
        labels = series[:labels].merge(quantile: "#{(invariant.quantile * 100).to_i}th")

        [
          series[:name],
          { value: estimator.query(invariant.quantile), labels: labels, timestamp: current_timestamp }
        ]
      end
    end
  end

  def format_measurement(name, measurement)
    labels = (measurement[:labels] || {}).map do |label, value|
      "#{label}=\"#{escape_label_value(value)}\""
    end.join(",")

    label_part = labels.empty? ? "" : "{#{labels}}"
    timestamp_part = @include_timestamp ? " #{measurement[:timestamp]}" : ""

    "#{name}#{label_part} #{measurement[:value]}#{timestamp_part}\n"
  end

  def escape_label_value(value)
    value.to_s.gsub(/[\\"\n]/, LABEL_VALUE_ESCAPES)
  end
end
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
require "sidekiq/api"
|
2
|
+
require "digest"
|
3
|
+
|
4
|
+
module GitLab
|
5
|
+
module Monitor
|
6
|
+
# A prober for Sidekiq queues
#
# It takes the Redis URL Sidekiq is connected to
#
# Options read from opts:
#   :redis_url           - URL of the Redis instance
#   :redis_enable_client - optional; when set to false the Redis client id
#                          option is cleared (see #redis_options)
#
# Every probe method returns self, and adds nothing when Redis cannot be reached.
class SidekiqProber
  QUEUE_JOB_STATS_SCRIPT = File.read(File.expand_path("#{__FILE__}/../sidekiq_queue_job_stats.lua")).freeze
  QUEUE_JOB_STATS_SHA = Digest::SHA1.hexdigest(QUEUE_JOB_STATS_SCRIPT).freeze

  def initialize(opts, metrics: PrometheusMetrics.new)
    @opts = opts
    @metrics = metrics

    Sidekiq.configure_client do |config|
      config.redis = redis_options
    end

    ensure_queue_job_stats_script_loaded
  end

  # Size, latency and paused state of every queue.
  def probe_queues
    return self unless connected?

    Sidekiq::Queue.all.each do |queue|
      @metrics.add("sidekiq_queue_size", queue.size, name: queue.name)
      @metrics.add("sidekiq_queue_latency_seconds", queue.latency, name: queue.name)
      @metrics.add("sidekiq_queue_paused", queue.paused? ? 1 : 0, name: queue.name)
    end

    self
  end

  # Enqueued job counts, computed by the Lua script for each queue.
  #
  # NOTE(review): results of different queues are merged with Hash#merge!, so
  # when the same key is returned for two queues the later queue's value
  # replaces the earlier one. Presumably keys are job class names and values
  # are counts; confirm against the Lua script before changing this to a sum.
  def probe_jobs # rubocop:disable Metrics/MethodLength
    return self unless connected?

    job_stats = {}

    Sidekiq::Queue.all.each do |queue|
      begin
        Sidekiq.redis do |conn|
          stats = conn.evalsha(QUEUE_JOB_STATS_SHA, ["queue:#{queue.name}"])
          job_stats.merge!(stats.to_h)
        end
      rescue Redis::CommandError # Could happen if the script exceeded the maximum run time (5 seconds by default)
        # FIXME: Should we call SCRIPT KILL?
        # Gives up entirely: nothing collected so far is reported.
        return self
      end
    end

    job_stats.each do |class_name, count|
      @metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
    end

    self
  end

  # Number of currently running jobs per job class.
  def probe_workers
    return self unless connected?

    worker_stats = Hash.new(0)

    Sidekiq::Workers.new.each do |_pid, _tid, work|
      worker_stats[work["payload"]["class"]] += 1
    end

    worker_stats.each do |class_name, count|
      @metrics.add("sidekiq_running_jobs", count, name: class_name)
    end

    self
  end

  # Number of jobs waiting in the retry set per job class.
  def probe_retries
    return self unless connected?

    retry_stats = Hash.new(0)

    Sidekiq::RetrySet.new.each do |job|
      retry_stats[job.klass] += 1
    end

    retry_stats.each do |class_name, count|
      @metrics.add("sidekiq_to_be_retried_jobs", count, name: class_name)
    end

    self
  end

  # Size of the dead set.
  def probe_dead
    return self unless connected?

    @metrics.add("sidekiq_dead_jobs", Sidekiq::Stats.new.dead_size)

    self
  end

  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  def redis_options
    options = {
      url: @opts[:redis_url],
      namespace: "resque:gitlab",
      connect_timeout: 1,
      reconnect_attempts: 0
    }

    options[:id] = nil unless redis_enable_client?
    options
  end

  # Defaults to true when the option is absent (or nil).
  def redis_enable_client?
    return true if @opts[:redis_enable_client].nil?

    @opts[:redis_enable_client]
  end

  # Whether Redis answered a trivial command.
  #
  # The outcome is remembered for the lifetime of this prober, including a
  # failure, so an unreachable Redis is tried once per instance and not once
  # per probe method.
  def connected?
    return @connected if defined?(@connected)

    @connected = begin
      Sidekiq.redis do |conn|
        conn.get("foo")
      end
      true
    rescue Redis::CannotConnectError, Redis::TimeoutError
      # Maybe we're trying connecting to a slave
      false
    end
  end

  # Loads the job-stats Lua script into Redis unless it is already cached there.
  def ensure_queue_job_stats_script_loaded
    return unless connected?

    Sidekiq.redis do |conn|
      # Using administrative commands on conn directly (which is a Redis::Namespace)
      # will be removed in redis-namespace 2.0.
      next if conn.redis.script(:exists, QUEUE_JOB_STATS_SHA)

      conn.redis.script(:load, QUEUE_JOB_STATS_SCRIPT)
    end
  end
end
|
148
|
+
end
|
149
|
+
end
|