gitlab-monitor 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +2 -0
- data/.gitlab-ci.yml +18 -0
- data/.rubocop.yml +34 -0
- data/CONTRIBUTING.md +651 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +75 -0
- data/LICENSE +25 -0
- data/README.md +110 -0
- data/bin/gitlab-mon +17 -0
- data/config/gitlab-monitor.yml.example +112 -0
- data/gitlab-monitor.gemspec +33 -0
- data/lib/gitlab_monitor.rb +18 -0
- data/lib/gitlab_monitor/cli.rb +341 -0
- data/lib/gitlab_monitor/database.rb +13 -0
- data/lib/gitlab_monitor/database/base.rb +44 -0
- data/lib/gitlab_monitor/database/bloat.rb +74 -0
- data/lib/gitlab_monitor/database/bloat_btree.sql +84 -0
- data/lib/gitlab_monitor/database/bloat_table.sql +63 -0
- data/lib/gitlab_monitor/database/ci_builds.rb +527 -0
- data/lib/gitlab_monitor/database/remote_mirrors.rb +74 -0
- data/lib/gitlab_monitor/database/row_count.rb +164 -0
- data/lib/gitlab_monitor/database/tuple_stats.rb +53 -0
- data/lib/gitlab_monitor/git.rb +144 -0
- data/lib/gitlab_monitor/memstats.rb +98 -0
- data/lib/gitlab_monitor/memstats/mapping.rb +91 -0
- data/lib/gitlab_monitor/prober.rb +40 -0
- data/lib/gitlab_monitor/process.rb +122 -0
- data/lib/gitlab_monitor/prometheus.rb +64 -0
- data/lib/gitlab_monitor/sidekiq.rb +149 -0
- data/lib/gitlab_monitor/sidekiq_queue_job_stats.lua +42 -0
- data/lib/gitlab_monitor/util.rb +83 -0
- data/lib/gitlab_monitor/version.rb +5 -0
- data/lib/gitlab_monitor/web_exporter.rb +77 -0
- data/spec/cli_spec.rb +31 -0
- data/spec/database/bloat_spec.rb +99 -0
- data/spec/database/ci_builds_spec.rb +421 -0
- data/spec/database/row_count_spec.rb +37 -0
- data/spec/fixtures/smaps/sample.txt +10108 -0
- data/spec/git_process_proper_spec.rb +27 -0
- data/spec/git_spec.rb +52 -0
- data/spec/memstats_spec.rb +28 -0
- data/spec/prometheus_metrics_spec.rb +17 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/util_spec.rb +15 -0
- metadata +225 -0
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module GitLab
|
4
|
+
module Monitor
|
5
|
+
module MemStats
|
6
|
+
# Parses one entry in /proc/[pid]/smaps. For example:
#
# 00400000-00401000 r-xp 00000000 08:01 541055   /opt/gitlab/embedded/bin/ruby
# Size:                  4 kB
# Rss:                   4 kB
# Pss:                   0 kB
# Shared_Clean:          4 kB
# Shared_Dirty:          0 kB
# Private_Clean:         0 kB
# Private_Dirty:         0 kB
# Referenced:            4 kB
# Anonymous:             0 kB
# AnonHugePages:         0 kB
# Shared_Hugetlb:        0 kB
# Private_Hugetlb:       0 kB
# Swap:                  0 kB
# SwapPss:               0 kB
# KernelPageSize:        4 kB
# MMUPageSize:           4 kB
# Locked:                0 kB
# VmFlags: rd ex mr mw me dw sd
#
# The first line describes the mapping itself; every following line is a
# "Name: value" pair. Only the counters named in FIELDS are kept (as integers,
# in the unit printed by the kernel); every other line is ignored.
class Mapping
  # Counters that are extracted from the field lines. Each one has an accessor
  # of the same name and starts out at 0.
  FIELDS = %w(size rss shared_clean shared_dirty private_clean private_dirty swap pss).freeze

  attr_reader :address_start
  attr_reader :address_end
  attr_reader :perms
  attr_reader :offset
  attr_reader :device_major
  attr_reader :device_minor
  attr_reader :inode
  attr_reader :region

  attr_accessor :size
  attr_accessor :rss
  attr_accessor :shared_clean
  attr_accessor :shared_dirty
  attr_accessor :private_dirty
  attr_accessor :private_clean
  attr_accessor :swap
  attr_accessor :pss

  # lines - Array of Strings: the mapping header line followed by its field
  #         lines. The header line is removed from the array (`shift`), as
  #         before; the strings themselves are no longer modified, so frozen
  #         strings are accepted.
  def initialize(lines)
    FIELDS.each { |field| send("#{field}=", 0) }

    parse_first_line(lines.shift)
    lines.each { |line| parse_field_line(line) }
  end

  # Parses the mapping header line, e.g.
  # "00400000-00401000 r-xp 00000000 08:01 541055 /opt/gitlab/embedded/bin/ruby".
  # A mapping without a path is reported with the region "anonymous".
  def parse_first_line(line)
    # String#split without arguments already ignores leading and trailing
    # whitespace, so no in-place strip! of the caller's string is needed.
    parts = line.split

    @address_start, @address_end = parts[0].split("-")
    @perms = parts[1]
    @offset = parts[2]
    @device_major, @device_minor = parts[3].split(":")
    @inode = parts[4]
    @region = parts[5] || "anonymous"
  end

  # Parses one "Name: value kB" line and stores the value when Name
  # (case-insensitive) is one of FIELDS. Blank lines and unknown names are
  # skipped.
  #
  # Raises ArgumentError or TypeError (from Kernel#Float) when a known field
  # carries a missing or non-numeric value.
  def parse_field_line(line)
    name, raw_value = line.split

    # split returns an empty array (never nil) for a blank line.
    return unless name

    field = name.downcase.sub(":", "")

    # Whitelist instead of respond_to? so that file content can never select
    # an arbitrary "...=" method to be sent to this object.
    return unless FIELDS.include?(field)

    send("#{field}=", Float(raw_value).to_i)
  end
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module GitLab
|
2
|
+
module Monitor
|
3
|
+
# Runs a configured set of probers against one shared metrics container.
#
# prober_opts maps a probe name to a params hash. The keys read here are
# :class_name (optional), :opts (one options object or a list of them) and
# :methods (the probe methods to invoke). The resolved prober class is stored
# back into each params hash under :class.
class Prober
  def initialize(prober_opts, metrics: PrometheusMetrics.new)
    @prober_opts = prober_opts
    @metrics = metrics

    resolve_prober_classes
  end

  # Builds one prober per options entry and calls every configured probe
  # method on it; all probers record into the same metrics container.
  def probe_all
    @prober_opts.each_value do |params|
      prober_class = params[:class]
      probe_methods = params[:methods]

      Utils.wrap_in_array(params[:opts]).each do |opts|
        instance = prober_class.new(opts, metrics: @metrics)
        probe_methods.each { |probe_method| instance.send(probe_method) }
      end
    end
  end

  # Writes the collected metrics, rendered with to_s, to target.
  def write_to(target)
    rendered = @metrics.to_s
    target.write(rendered)
  end

  private

  # Looks up each prober's class below GitLab::Monitor. The class name comes
  # from :class_name when given, otherwise from the camel-cased
  # "<probe_name>_prober".
  def resolve_prober_classes
    @prober_opts.each do |probe_name, params|
      class_path = params[:class_name]
      class_path ||= Utils.camel_case_string("#{probe_name}_prober")

      resolved = GitLab::Monitor
      class_path.split("::").each do |const_name|
        resolved = resolved.const_get(const_name)
      end

      params[:class] = resolved
    end
  end
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "memstats"
|
4
|
+
|
5
|
+
module GitLab
|
6
|
+
module Monitor
|
7
|
+
# A helper class to extract CPU and memory statistics from /proc/<pid>/stat
#
# See: man 5 proc
#
# It takes a pid. When the stat file cannot be read (for example because the
# process does not exist), the object is still created but `valid?` returns
# false and the other readers must not be called.
class ProcessStats
  # Multiplier applied to the page count in field 24 to obtain bytes.
  # NOTE(review): this hard-codes a 4 KiB page size, as the original code did;
  # confirm it matches the hosts this runs on.
  PAGE_SIZE_BYTES = 4096

  # Splits the raw content of /proc/<pid>/stat into an Array whose indexes
  # match the 1-based field numbers of the man page (index 0 is a padding
  # empty string).
  #
  # Field 2 is the command name in parentheses and may itself contain spaces
  # or parentheses, e.g. "(tmux: server)", so it is delimited by the first
  # "(" and the last ")" instead of by whitespace.
  def self.parse_stat(raw)
    comm_start = raw.index("(")
    comm_end = raw.rindex(")")

    fields = if comm_start && comm_end && comm_start < comm_end
               [raw[0...comm_start].strip, raw[comm_start..comm_end]]
                 .concat(raw[(comm_end + 1)..-1].split(" "))
             else
               # Not the expected layout: fall back to plain whitespace splitting.
               raw.split(" ")
             end

    [""].concat(fields)
  end

  def initialize(pid)
    @pid = pid
    @user_hertz = retrieve_user_hertz
    @stats = populate_info
  end

  # True when the stat file could be read.
  def valid?
    !@stats.nil?
  end

  # utime + stime (fields 14 and 15, in clock ticks) converted to seconds.
  def cpu_time
    (@stats[14].to_i + @stats[15].to_i) / @user_hertz
  end

  # starttime (field 22, in clock ticks) converted to seconds.
  def start_time
    @stats[22].to_i / @user_hertz
  end

  def vsize
    # Virtual memory size in bytes.
    @stats[23].to_i
  end

  def rss
    # Resident Set Size: number of pages the process has in real memory.
    @stats[24].to_i * PAGE_SIZE_BYTES
  end

  private

  # Returns the parsed stat fields, or nil when the process is gone.
  def populate_info
    self.class.parse_stat(File.read("/proc/#{@pid}/stat"))
  rescue Errno::ENOENT, Errno::ESRCH
    # ENOENT: no such process. ESRCH: the process exited while its stat file
    # was being read.
    nil
  end

  # Clock ticks per second used to convert the tick-based fields; falls back
  # to 100.0 when the clock resolution is not available.
  def retrieve_user_hertz
    Process.clock_getres(:TIMES_BASED_CLOCK_PROCESS_CPUTIME_ID, :hertz)
  rescue Errno::EINVAL
    100.0
  end
end
|
59
|
+
|
60
|
+
# Probes one or more processes for info then writes metrics to a target
#
# Options read from the options hash:
#   :name           - value of the "name" label (downcased).
#   :pid_or_pattern - a pid (Integer or all-digit String) that is probed
#                     directly, or any other value, which is handed to
#                     Utils.pgrep to look up the pids.
#   :quantiles      - when truthy, samples are recorded as quantile
#                     observations without a per-process "pid" label
#                     (default: false).
class ProcessProber
  def initialize(options, metrics: PrometheusMetrics.new)
    @metrics = metrics
    @name = options[:name]
    @pids = resolve_pids(options[:pid_or_pattern])
    @use_quantiles = options.fetch(:quantiles, false)
  end

  # Records CPU time, resident/virtual memory and start time for every pid
  # whose /proc stat entry can be read. Returns self.
  def probe_stat
    @pids.each do |pid|
      stats = ProcessStats.new(pid)
      next unless stats.valid?

      labels = labels_for(pid)

      @metrics.add("process_cpu_seconds_total", stats.cpu_time, @use_quantiles, **labels)
      @metrics.add("process_resident_memory_bytes", stats.rss, @use_quantiles, **labels)
      @metrics.add("process_virtual_memory_bytes", stats.vsize, @use_quantiles, **labels)
      @metrics.add("process_start_time_seconds", stats.start_time, @use_quantiles, **labels)
    end

    self
  end

  # Records how many pids were resolved for this prober. Returns self.
  def probe_count
    @metrics.add("process_count", @pids.count, name: @name.downcase)

    self
  end

  # Records the aggregated smaps counters of every pid, one metric per entry
  # of MemStats::Mapping::FIELDS, multiplied by 1024. Returns self.
  def probe_smaps
    @pids.each do |pid|
      stats = ::GitLab::Monitor::MemStats::Aggregator.new(pid)

      next unless stats.valid?

      labels = labels_for(pid)

      ::GitLab::Monitor::MemStats::Mapping::FIELDS.each do |field|
        value = stats.totals[field]

        if value >= 0
          @metrics.add("process_smaps_#{field}_bytes", value * 1024, @use_quantiles, **labels)
        end
      end
    end

    self
  end

  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  # Returns the list of pids to probe. The value is converted with to_s before
  # matching so Integer pids are accepted, and \A/\z anchor the whole value
  # rather than a single line of it.
  def resolve_pids(pid_or_pattern)
    return [pid_or_pattern] if pid_or_pattern.to_s =~ /\A\d+\z/

    Utils.pgrep(pid_or_pattern)
  end

  # Labels attached to every per-process sample. The pid label is left out in
  # quantile mode.
  def labels_for(pid)
    labels = { name: @name.downcase }
    labels[:pid] = pid unless @use_quantiles
    labels
  end
end
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require "quantile"
|
2
|
+
|
3
|
+
module GitLab
|
4
|
+
module Monitor
|
5
|
+
# Prometheus metrics container
#
# Provides a simple API to `add` metrics and then turn them `to_s` which will just
# dump all the metrics in prometheus format
#
# The add method also can take any arbitrary amount of labels in a `key: value` format.
#
# Samples added with `quantile` set are not emitted individually: when the
# container is rendered, all samples sharing a name and label set are fed to a
# Quantile::Estimator and one line per estimator invariant is emitted, with an
# extra `quantile` label such as "50th".
class PrometheusMetrics
  # Characters that must be escaped inside a label value in the text format.
  LABEL_VALUE_ESCAPES = { "\\" => "\\\\", "\"" => "\\\"", "\n" => "\\n" }.freeze

  # include_timestamp - when true, every rendered line ends with the
  #                     millisecond timestamp of the sample.
  def initialize(include_timestamp: true)
    @metrics = Hash.new { |h, k| h[k] = [] }
    @quantiles = Hash.new { |h, k| h[k] = [] }
    @include_timestamp = include_timestamp
  end

  # Records one sample.
  #
  # name     - metric name.
  # value    - sample value.
  # quantile - when truthy, the value is kept as a quantile observation for
  #            (name, labels) instead of being emitted directly.
  # labels   - label names and values.
  #
  # Returns self so calls can be chained.
  def add(name, value, quantile = false, **labels)
    if quantile
      @quantiles[{ name: name, labels: labels }] << value
    else
      @metrics[name] << { value: value, labels: labels, timestamp: current_timestamp_ms }
    end

    self
  end

  # Renders every sample, plus the quantile estimates, one per line. Rendering
  # does not change the stored samples, so it can be repeated safely.
  def to_s
    buffer = +""

    measurements_with_quantiles.each do |name, measurements|
      measurements.each do |measurement|
        buffer << format_measurement(name, measurement)
      end
    end

    buffer
  end

  private

  def current_timestamp_ms
    (Time.now.to_f * 1000).to_i
  end

  # Returns a fresh name => measurements hash holding the plain samples
  # followed by the estimates derived from the quantile observations.
  #
  # The result is built in a new hash, and the quantile label is merged into a
  # copy of the labels: the stored labels hash is part of a key of @quantiles
  # and must not be modified.
  def measurements_with_quantiles
    combined = Hash.new { |h, k| h[k] = [] }
    @metrics.each { |name, measurements| combined[name].concat(measurements) }

    @quantiles.each do |data, observations|
      estimator = Quantile::Estimator.new
      observations.each { |value| estimator.observe(value) }

      estimator.invariants.each do |invariant|
        labels = data[:labels].merge(quantile: "#{(invariant.quantile * 100).to_i}th")

        combined[data[:name]] << {
          value: estimator.query(invariant.quantile),
          labels: labels,
          timestamp: current_timestamp_ms
        }
      end
    end

    combined
  end

  # Formats one sample as `name{label="value",...} value [timestamp]\n`.
  def format_measurement(name, measurement)
    line = +name.to_s

    labels = (measurement[:labels] || {}).map do |label, value|
      "#{label}=\"#{escape_label_value(value)}\""
    end.join(",")

    line << "{#{labels}}" unless labels.empty?
    line << " #{measurement[:value]}"
    line << " #{measurement[:timestamp]}" if @include_timestamp
    line << "\n"
  end

  # Backslash, double quote and line feed would otherwise terminate or corrupt
  # the quoted label value.
  def escape_label_value(value)
    value.to_s.gsub(/[\\"\n]/, LABEL_VALUE_ESCAPES)
  end
end
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
require "sidekiq/api"
|
2
|
+
require "digest"
|
3
|
+
|
4
|
+
module GitLab
|
5
|
+
module Monitor
|
6
|
+
# A prober for Sidekiq queues
#
# It takes the Redis URL Sidekiq is connected to
#
# Options read from opts:
#   :redis_url           - URL handed to the Sidekiq client configuration.
#   :redis_enable_client - when explicitly set to a falsy value, the Redis
#                          connection id is set to nil; defaults to true.
#
# Every probe method returns self and records nothing when Redis cannot be
# reached.
class SidekiqProber
  QUEUE_JOB_STATS_SCRIPT = File.read(File.expand_path("#{__FILE__}/../sidekiq_queue_job_stats.lua")).freeze
  QUEUE_JOB_STATS_SHA = Digest::SHA1.hexdigest(QUEUE_JOB_STATS_SCRIPT).freeze

  def initialize(opts, metrics: PrometheusMetrics.new)
    @opts = opts
    @metrics = metrics

    Sidekiq.configure_client do |config|
      config.redis = redis_options
    end

    ensure_queue_job_stats_script_loaded
  end

  # Records size, latency and paused state of every queue.
  def probe_queues
    return self unless connected?

    Sidekiq::Queue.all.each do |queue|
      @metrics.add("sidekiq_queue_size", queue.size, name: queue.name)
      @metrics.add("sidekiq_queue_latency_seconds", queue.latency, name: queue.name)
      @metrics.add("sidekiq_queue_paused", queue.paused? ? 1 : 0, name: queue.name)
    end

    self
  end

  # Records the number of enqueued jobs per job class across all queues.
  # Nothing is recorded when the stats script fails on any queue.
  def probe_jobs
    return self unless connected?

    job_stats = enqueued_job_counts
    return self unless job_stats

    job_stats.each do |class_name, count|
      @metrics.add("sidekiq_enqueued_jobs", count, name: class_name)
    end

    self
  end

  # Records the number of currently running jobs per job class.
  def probe_workers
    return self unless connected?

    worker_stats = Hash.new(0)

    Sidekiq::Workers.new.each do |_pid, _tid, work|
      job_klass = work["payload"]["class"]

      worker_stats[job_klass] += 1
    end

    worker_stats.each do |class_name, count|
      @metrics.add("sidekiq_running_jobs", count, name: class_name)
    end

    self
  end

  # Records the number of jobs waiting in the retry set per job class.
  def probe_retries
    return self unless connected?

    retry_stats = Hash.new(0)

    Sidekiq::RetrySet.new.each do |job|
      retry_stats[job.klass] += 1
    end

    retry_stats.each do |class_name, count|
      @metrics.add("sidekiq_to_be_retried_jobs", count, name: class_name)
    end

    self
  end

  # Records the size of the dead set.
  def probe_dead
    return self unless connected?

    @metrics.add("sidekiq_dead_jobs", Sidekiq::Stats.new.dead_size)

    self
  end

  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  # Runs the job stats script once per queue and returns a
  # job class => enqueued count hash, or nil when the script fails.
  def enqueued_job_counts
    job_stats = {}

    Sidekiq::Queue.all.each do |queue|
      begin
        Sidekiq.redis do |conn|
          stats = conn.evalsha(QUEUE_JOB_STATS_SHA, ["queue:#{queue.name}"])

          # A job class can be enqueued in more than one queue: add the counts
          # up instead of letting the last queue overwrite the earlier ones.
          job_stats.merge!(stats.to_h) { |_class_name, total, count| total + count }
        end
      rescue Redis::CommandError # Could happen if the script exceeded the maximum run time (5 seconds by default)
        # FIXME: Should we call SCRIPT KILL?
        return nil
      end
    end

    job_stats
  end

  def redis_options
    options = {
      url: @opts[:redis_url],
      namespace: "resque:gitlab",
      connect_timeout: 1,
      reconnect_attempts: 0
    }

    options[:id] = nil unless redis_enable_client?
    options
  end

  def redis_enable_client?
    return true if @opts[:redis_enable_client].nil?

    @opts[:redis_enable_client]
  end

  # True when Redis answers a trivial command. A successful check is
  # remembered; a failed one is not, so the next call tries again.
  def connected?
    @connected ||= begin
      Sidekiq.redis do |conn|
        conn.get("foo")
      end
      true
    end
  rescue Redis::CannotConnectError, Redis::TimeoutError
    # Maybe we're trying connecting to a slave
    false
  end

  def ensure_queue_job_stats_script_loaded
    return unless connected?

    Sidekiq.redis do |conn|
      # Using administrative commands on conn directly (which is a Redis::Namespace)
      # will be removed in redis-namespace 2.0.
      next if conn.redis.script(:exists, QUEUE_JOB_STATS_SHA)

      conn.redis.script(:load, QUEUE_JOB_STATS_SCRIPT)
    end
  end
end
|
148
|
+
end
|
149
|
+
end
|