gitlab-monitor 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -0
  3. data/.gitlab-ci.yml +18 -0
  4. data/.rubocop.yml +34 -0
  5. data/CONTRIBUTING.md +651 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +75 -0
  8. data/LICENSE +25 -0
  9. data/README.md +110 -0
  10. data/bin/gitlab-mon +17 -0
  11. data/config/gitlab-monitor.yml.example +112 -0
  12. data/gitlab-monitor.gemspec +33 -0
  13. data/lib/gitlab_monitor.rb +18 -0
  14. data/lib/gitlab_monitor/cli.rb +341 -0
  15. data/lib/gitlab_monitor/database.rb +13 -0
  16. data/lib/gitlab_monitor/database/base.rb +44 -0
  17. data/lib/gitlab_monitor/database/bloat.rb +74 -0
  18. data/lib/gitlab_monitor/database/bloat_btree.sql +84 -0
  19. data/lib/gitlab_monitor/database/bloat_table.sql +63 -0
  20. data/lib/gitlab_monitor/database/ci_builds.rb +527 -0
  21. data/lib/gitlab_monitor/database/remote_mirrors.rb +74 -0
  22. data/lib/gitlab_monitor/database/row_count.rb +164 -0
  23. data/lib/gitlab_monitor/database/tuple_stats.rb +53 -0
  24. data/lib/gitlab_monitor/git.rb +144 -0
  25. data/lib/gitlab_monitor/memstats.rb +98 -0
  26. data/lib/gitlab_monitor/memstats/mapping.rb +91 -0
  27. data/lib/gitlab_monitor/prober.rb +40 -0
  28. data/lib/gitlab_monitor/process.rb +122 -0
  29. data/lib/gitlab_monitor/prometheus.rb +64 -0
  30. data/lib/gitlab_monitor/sidekiq.rb +149 -0
  31. data/lib/gitlab_monitor/sidekiq_queue_job_stats.lua +42 -0
  32. data/lib/gitlab_monitor/util.rb +83 -0
  33. data/lib/gitlab_monitor/version.rb +5 -0
  34. data/lib/gitlab_monitor/web_exporter.rb +77 -0
  35. data/spec/cli_spec.rb +31 -0
  36. data/spec/database/bloat_spec.rb +99 -0
  37. data/spec/database/ci_builds_spec.rb +421 -0
  38. data/spec/database/row_count_spec.rb +37 -0
  39. data/spec/fixtures/smaps/sample.txt +10108 -0
  40. data/spec/git_process_proper_spec.rb +27 -0
  41. data/spec/git_spec.rb +52 -0
  42. data/spec/memstats_spec.rb +28 -0
  43. data/spec/prometheus_metrics_spec.rb +17 -0
  44. data/spec/spec_helper.rb +63 -0
  45. data/spec/util_spec.rb +15 -0
  46. metadata +225 -0
@@ -0,0 +1,74 @@
1
+ module GitLab
2
+ module Monitor
3
+ module Database
4
# Collects update timestamps for the enabled remote mirrors of a given set
# of projects.
class RemoteMirrorsCollector < Base
  # `%s` is substituted with the comma-separated list of project ids.
  QUERY = <<~SQL.freeze
    SELECT project_id, url,
    EXTRACT(EPOCH FROM last_successful_update_at) AS last_successful_update_at,
    EXTRACT(EPOCH FROM last_update_at) AS last_update_at
    FROM remote_mirrors WHERE project_id IN (%s) AND enabled = 't'
  SQL

  # args - Hash of options. The whole hash is forwarded to Base; this class
  #        additionally reads args[:project_ids].
  def initialize(args)
    super(args)

    @project_ids = args[:project_ids]
  end

  # Runs QUERY for the configured project ids.
  #
  # Returns the result of the query, or nil when no project ids were given
  # or when the table/columns are missing.
  def run
    ids = @project_ids
    return if ids.nil? || ids.empty?

    execute(format(QUERY, ids.join(",")))
  end

  private

  # Executes the query on a pooled connection; a missing table or column
  # yields nil instead of raising.
  def execute(query)
    with_connection_pool { |conn| conn.exec(query) }
  rescue PG::UndefinedTable, PG::UndefinedColumn
    nil
  end
end
35
+
36
# Prober invoked when gathering metrics: turns the rows returned by
# RemoteMirrorsCollector into two timestamp metrics per remote mirror.
class RemoteMirrorsProber
  # opts    - Hash; :connection_string and :project_ids are passed on to the
  #           collector.
  # metrics - object receiving the samples through #add.
  def initialize(opts, metrics: PrometheusMetrics.new)
    @metrics = metrics
    @collector = RemoteMirrorsCollector.new(
      connection_string: opts[:connection_string],
      project_ids: opts[:project_ids]
    )
  end

  # Adds one sample per metric and per returned row, labelled with the
  # project id and the mirror url.
  #
  # Returns self, also when the database connection is bad.
  def probe_db
    @collector.run.to_a.each do |row| # nil.to_a is [], so a nil result adds nothing
      labels = { project_id: row["project_id"], url: row["url"] }

      # [metric name, result column], in emission order
      [
        %w(project_remote_mirror_last_successful_update_time_seconds last_successful_update_at),
        %w(project_remote_mirror_last_update_time_seconds last_update_at)
      ].each do |metric_name, column|
        @metrics.add(metric_name, row[column].to_i, **labels)
      end
    end

    self
  rescue PG::ConnectionBad
    self
  end

  # Writes the string form of the collected metrics to target.
  def write_to(target)
    target.write(@metrics.to_s)
  end
end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,164 @@
1
+ require "set"
2
+
3
+ module GitLab
4
+ module Monitor
5
+ module Database
6
# A helper class that executes the queries it is given and returns the row
# count of each one.
#
# This class works under the assumption you do COUNT(*) queries; define
# queries in the QUERIES constant. If in doubt how these work, read
# #construct_query.
class RowCountCollector < Base
  WHERE_MIRROR_ENABLED = <<~SQL.freeze
    projects.mirror = true
    AND project_mirror_data.retry_count <= 14
    AND (projects.visibility_level = 20 OR plans.name IN ('early_adopter', 'bronze', 'silver', 'gold'))
  SQL

  # Shared skeleton of the mirror queries. :check is a guard query: the
  # count is only executed when the guard returns at least one row.
  MIRROR_QUERY = {
    select: :projects,
    joins: <<~SQL,
      INNER JOIN project_mirror_data ON project_mirror_data.project_id = projects.id
      INNER JOIN namespaces ON projects.namespace_id = namespaces.id
      LEFT JOIN plans ON namespaces.plan_id = plans.id
    SQL
    check: "SELECT 1 FROM information_schema.tables WHERE table_name='plans'"
  }.freeze

  QUERIES = {
    mirrors_ready_to_sync: MIRROR_QUERY.merge( # EE only
      where: <<~SQL
        #{WHERE_MIRROR_ENABLED}
        AND project_mirror_data.status NOT IN ('scheduled', 'started')
        AND project_mirror_data.next_execution_timestamp <= NOW()
      SQL
    ),
    mirrors_not_updated_recently: MIRROR_QUERY.merge( # EE only
      where: <<~SQL
        #{WHERE_MIRROR_ENABLED}
        AND project_mirror_data.status NOT IN ('scheduled', 'started')
        AND (project_mirror_data.next_execution_timestamp - project_mirror_data.last_update_at) <= '30 minutes'::interval
        AND project_mirror_data.last_update_at < NOW() - '30 minutes'::interval
      SQL
    ),
    mirrors_updated_very_recently: MIRROR_QUERY.merge( # EE only
      where: <<~SQL
        #{WHERE_MIRROR_ENABLED}
        AND project_mirror_data.status NOT IN ('scheduled', 'started')
        AND project_mirror_data.last_update_at >= NOW() - '30 seconds'::interval
      SQL
    ),
    mirrors_behind_schedule: MIRROR_QUERY.merge( # EE only
      where: <<~SQL
        #{WHERE_MIRROR_ENABLED}
        AND project_mirror_data.status NOT IN ('scheduled', 'started')
        AND project_mirror_data.next_execution_timestamp <= NOW() - '10 seconds'::interval
      SQL
    ),
    mirrors_scheduled_or_started: MIRROR_QUERY.merge( # EE only
      where: <<~SQL
        #{WHERE_MIRROR_ENABLED}
        AND project_mirror_data.status IN ('scheduled', 'started')
      SQL
    ),
    mirrors_scheduled: MIRROR_QUERY.merge( # EE only
      where: <<~SQL
        #{WHERE_MIRROR_ENABLED}
        AND project_mirror_data.status = 'scheduled'
      SQL
    ),
    mirrors_started: MIRROR_QUERY.merge( # EE only
      where: <<~SQL
        #{WHERE_MIRROR_ENABLED}
        AND project_mirror_data.status = 'started'
      SQL
    ),
    soft_deleted_projects: { select: :projects, where: "pending_delete=true" },
    orphaned_projects: {
      select: :projects,
      joins: "LEFT JOIN namespaces ON projects.namespace_id = namespaces.id",
      where: "namespaces.id IS NULL"
    },
    uploads: { select: :uploads }
  }.freeze

  # args - Hash of options. The whole hash is forwarded to Base; this class
  #        additionally reads args[:selected_queries], an optional list of
  #        QUERIES keys (strings or symbols). nil means "run every query".
  def initialize(args)
    super(args)

    selected = args[:selected_queries]
    @selected_queries = selected.nil? ? nil : Set.new(selected.map(&:to_sym))
  end

  # Runs every selected query whose guard check (if any) passes.
  #
  # Returns a Hash (default value 0) mapping query name to its count.
  def run
    results = Hash.new(0)
    check_results = {} # guard query => outcome, so a shared guard runs once per call

    QUERIES.each do |key, query_hash|
      # Filter on the selection first: an unselected query must not cost a
      # database round trip for its guard check.
      next unless selected?(key)
      next unless check_passes?(query_hash[:check], check_results)

      results[key] = count_from_query_hash(query_hash)
    end

    results
  end

  private

  # True when no selection was configured or when key is part of it.
  def selected?(key)
    @selected_queries.nil? || @selected_queries.include?(key)
  end

  # True when there is no guard query or when the guard succeeds. Outcomes
  # are remembered in cache, keyed by the guard SQL.
  def check_passes?(check, cache)
    return true unless check

    cache[check] = successful_check?(check) unless cache.key?(check)
    cache[check]
  end

  # Returns the "count" column of the first result row, or 0 when the
  # query could not be executed (missing table or column).
  def count_from_query_hash(query_hash)
    result = execute(construct_query(query_hash))
    return 0 unless result

    result[0]["count"]
  end

  # Wraps query in SELECT EXISTS (...). Returns true/false, or nil when the
  # query could not be executed.
  def successful_check?(query)
    result = execute("SELECT EXISTS (#{query})")
    return unless result

    result[0]["exists"] == "t"
  end

  # Executes the query on a pooled connection; a missing table or column
  # yields nil instead of raising.
  def execute(query)
    with_connection_pool do |conn|
      conn.exec(query)
    end
  rescue PG::UndefinedTable, PG::UndefinedColumn
    nil
  end

  # Builds the COUNT(*) statement for a query hash:
  #   :select - table to count from (required)
  #   :joins  - optional JOIN clauses
  #   :where  - optional WHERE condition
  # This is a private helper (it is defined below `private`).
  def construct_query(query)
    query_string = "SELECT COUNT(*) FROM #{query[:select]} "
    query_string << "#{query[:joins]} " if query[:joins]
    query_string << "WHERE #{query[:where]}" if query[:where]
    query_string << ";"
  end
end
136
+
137
# Prober invoked when gathering metrics: publishes every count returned by
# RowCountCollector as a "gitlab_database_rows" sample.
class RowCountProber
  # opts    - Hash; :connection_string and :selected_queries are passed on
  #           to the collector.
  # metrics - object receiving the samples through #add.
  def initialize(opts, metrics: PrometheusMetrics.new)
    @metrics = metrics
    @collector = RowCountCollector.new(
      connection_string: opts[:connection_string],
      selected_queries: opts[:selected_queries]
    )
  end

  # Adds one sample per collected query, labelled with the query name.
  #
  # Returns self, also when the database connection is bad.
  def probe_db
    @collector.run.each do |query_name, row_count|
      @metrics.add("gitlab_database_rows", row_count.to_i, query_name: query_name.to_s)
    end

    self
  rescue PG::ConnectionBad
    self
  end

  # Writes the string form of the collected metrics to target.
  def write_to(target)
    target.write(@metrics.to_s)
  end
end
162
+ end
163
+ end
164
+ end
@@ -0,0 +1,53 @@
1
+ module GitLab
2
+ module Monitor
3
+ module Database
4
# A helper class to collect tuple stats from the database
#
# It takes a connection string (e.g. "dbname=test port=5432")
class TupleStatsCollector < Base
  COLUMNS = %w(relname seq_tup_read idx_tup_fetch n_tup_ins n_tup_upd n_tup_del n_tup_hot_upd n_dead_tup seq_scan)
            .join(",")
  QUERY = <<~SQL.freeze
    SELECT #{COLUMNS}
    FROM pg_stat_user_tables
    WHERE relname IN (SELECT tablename FROM pg_tables WHERE tableowner = 'gitlab')
    GROUP BY #{COLUMNS}
  SQL

  # Runs QUERY and indexes the rows by table name.
  #
  # Returns a Hash mapping each "relname" to the rest of its row (the
  # "relname" entry itself is removed from the row).
  def run
    with_connection_pool do |conn|
      stats_by_table = {}

      conn.exec(QUERY).each do |row|
        table_name = row.delete("relname")
        stats_by_table[table_name] = row
      end

      stats_by_table
    end
  end
end
25
+
26
# Probes the DB specified by opts[:connection_string] for tuple stats, then converts them to metrics
class TuplesProber
  # opts    - Hash; :connection_string is passed on to the collector.
  # metrics - object receiving the samples through #add.
  def initialize(opts, metrics: PrometheusMetrics.new)
    @metrics = metrics
    @collector = TupleStatsCollector.new(connection_string: opts[:connection_string])
  end

  # Adds one "gitlab_database_stat_table_<column>" sample per table and
  # statistics column, labelled with the table name.
  #
  # Returns self, also when the database connection is bad.
  def probe_db
    result = @collector.run

    result.each do |table_name, tuple_stats|
      tuple_stats.each do |column_name, value|
        # Emit a number, like the other database probers do (`to_i`), rather
        # than a string. A nil column value becomes 0 instead of "".
        @metrics.add("gitlab_database_stat_table_#{column_name}", value.to_i, table_name: table_name)
      end
    end

    self
  rescue PG::ConnectionBad
    self
  end

  # Writes the string form of the collected metrics to target.
  def write_to(target)
    target.write(@metrics.to_s)
  end
end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,144 @@
1
+ require "open3"
2
+
3
+ module GitLab
4
+ module Monitor
5
# Git monitoring helping class
#
# Takes a repository path for construction and provides 2 main methods:
# - pull
# - push
#
# Both methods run the git command inside the tracker and return whatever
# TimeTracker#track returns (GitProber reads `.time` from that value).
class Git
  # repo - path of an existing directory in which the commands are run.
  #
  # Raises RuntimeError when the directory does not exist.
  def initialize(repo)
    fail "Repository #{repo} does not exists" unless Dir.exist? repo
    @repo = repo
    @tracker = TimeTracker.new
  end

  # Runs `git pull -q` in the repository.
  def pull
    @tracker.track { execute("git", "pull", "-q") }
  end

  # Creates an empty commit, then runs `git push -q`.
  def push
    empty_commit
    @tracker.track { execute("git", "push", "-q") }
  end

  # Creates an empty commit with the given message.
  def empty_commit(message = "Beep")
    # The message is handed over as its own argv entry, so quotes or shell
    # metacharacters in it can neither break nor extend the command.
    @tracker.track { execute("git", "commit", "--allow-empty", "-m", message) }
  end

  private

  # Runs the command (program followed by its arguments, no shell involved)
  # in the repository directory, capturing stdout and stderr together.
  #
  # Returns a CommandResult. Raises RuntimeError when the result reports a
  # failure.
  def execute(*command)
    result = CommandResult.new(*Open3.capture2e(*command, chdir: @repo))
    fail "Command #{command.join(' ')} failed with status #{result.status}\n#{result.stdout}" if result.failed?
    result
  end
end
41
+
42
# Result of a command
#
# Provides some handy methods for checking if the execution failed and a simple to_s that will
# return the command output
#
# stdout - the captured output of the command
# status - the process status object; only #exitstatus is read from it
CommandResult = Struct.new(:stdout, :status) do
  # Returns true unless the command exited with status 0. A nil exit
  # status also counts as a failure instead of raising NoMethodError.
  def failed?
    status != 0
  end

  # Returns the exit status reported by the stored process status object.
  def status
    self[:status].exitstatus
  end

  # Returns the command output.
  def to_s
    stdout
  end
end
59
+
60
# Builds a Git helper for opts[:source], times pull and push operations and
# records the durations as metrics.
#
# Optionally takes a metrics object which by default is a PrometheusMetrics,
# useful to change the metrics writer to something else.
class GitProber
  # opts    - Hash; :source is the repository handed to Git.new, :labels an
  #           optional Hash of labels attached to every sample.
  # metrics - object receiving the samples through #add.
  def initialize(opts, metrics: PrometheusMetrics.new)
    @metrics = metrics
    @labels = opts[:labels] || {}
    @git = Git.new(opts[:source])
  end

  # Times a pull and records it in milliseconds. Returns self.
  def probe_pull
    record_milliseconds("git_pull_time_milliseconds", @git.pull)
  end

  # Times a push and records it in milliseconds. Returns self.
  def probe_push
    record_milliseconds("git_push_time_milliseconds", @git.push)
  end

  # Writes the string form of the collected metrics to target.
  def write_to(target)
    target.write(@metrics.to_s)
  end

  private

  # Converts tracked.time to whole milliseconds and adds it under
  # metric_name with the configured labels. Returns self.
  def record_milliseconds(metric_name, tracked)
    milliseconds = (tracked.time * 1000).to_i
    @metrics.add(metric_name, milliseconds, **@labels)
    self
  end
end
85
+
86
# A special prober for git processes
class GitProcessProber
  # Global git options whose value is passed as a separate argument; that
  # value must never be mistaken for the subcommand.
  OPTIONS_WITH_VALUE = %w(-C -c --git-dir --work-tree --namespace --config-env).freeze

  # A subcommand candidate: does not start with a dash, then only lowercase
  # letters and dashes. Anchored on the whole argument.
  SUBCOMMAND_PATTERN = /\A[^-][a-z\-]*\z/

  # opts    - Hash; :quantiles is passed on to every ProcessProber.
  # metrics - object receiving the samples through #add.
  def initialize(opts, metrics: PrometheusMetrics.new)
    @opts = opts
    @metrics = metrics
  end

  # Probes every process matched by Utils.pgrep("^git "): runs
  # ProcessProber#probe_stat for each one under the name
  # "git <subcommand>" and adds a "process_count" sample per name.
  #
  # Returns self.
  def probe_git
    counts = Hash.new(0)

    Utils.pgrep("^git ").each do |pid|
      subcommand = self.class.extract_subcommand(read_cmdline(pid))
      next unless subcommand # Unlikely, but just to be safe

      name = "git #{subcommand}"
      counts[name] += 1

      ProcessProber.new(
        {
          name: name,
          pid_or_pattern: pid,
          quantiles: @opts[:quantiles]
        },
        metrics: @metrics
      ).probe_stat
    end

    counts.each do |name, count|
      @metrics.add("process_count", count, name: name)
    end

    self
  end

  # Writes the string form of the collected metrics to target.
  def write_to(target)
    target.write(@metrics.to_s)
  end

  # Extracts the git subcommand from the content of a /proc/<pid>/cmdline
  # file.
  #
  # cmd - String of NUL-separated arguments, the first one being "git".
  #
  # Returns the first argument after "git" that looks like a subcommand and
  # is not the value of a global option, or nil when there is none (or when
  # cmd is empty).
  def self.extract_subcommand(cmd)
    return if cmd.empty?

    arguments = cmd.split("\u0000").drop(1) # cmdline is NUL-separated; drop "git"
    value_expected = false

    arguments.find do |argument|
      if value_expected
        # This argument is the value of the preceding option, skip it.
        value_expected = false
        next false
      end

      value_expected = OPTIONS_WITH_VALUE.include?(argument)
      argument =~ SUBCOMMAND_PATTERN
    end
  end

  private

  # Returns the raw cmdline of pid, or "" when it cannot be read.
  def read_cmdline(pid)
    File.read("/proc/#{pid}/cmdline")
  rescue StandardError
    "" # Process file is gone (race condition)
  end
end
143
+ end
144
+ end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "memstats/mapping"
4
+
5
+ # Ported from https://github.com/discourse/discourse/blob/master/script/memstats.rb
6
+ #
7
+ # Aggregates and prints useful information from /proc/[pid]/smaps
8
+ #
9
+ # pss - Roughly the amount of memory that is "really" being used by the pid
10
+ # swap - Amount of swap this process is currently using
11
+ #
12
+ # Reference:
13
+ # http://www.mjmwired.net/kernel/Documentation/filesystems/proc.txt#361
14
+ #
15
+ # Example:
16
+ # # ./memstats.rb 4386
17
+ # Process: 4386
18
+ # Command Line: /usr/bin/mongod -f /etc/mongo/mongod.conf
19
+ # Memory Summary:
20
+ # private_clean 107,132 kB
21
+ # private_dirty 2,020,676 kB
22
+ # pss 2,127,860 kB
23
+ # rss 2,128,536 kB
24
+ # shared_clean 728 kB
25
+ # shared_dirty 0 kB
26
+ # size 149,281,668 kB
27
+ # swap 1,719,792 kB
28
+ module GitLab
29
+ module Monitor
30
+ module MemStats
31
# Aggregates all metrics for a single PID in /proc/<pid>/smaps
class Aggregator
  attr_accessor :pid, :totals

  # pid - identifier used to build the path /proc/<pid>/smaps.
  #
  # The file is read immediately. When that fails, #valid? is false and
  # every field in #totals has Float::NAN added to it.
  def initialize(pid)
    @pid = pid
    @totals = Hash.new(0)
    @mappings = []
    @valid = true

    populate_info
  end

  # Returns false when reading or parsing the smaps file raised an error.
  def valid?
    @valid
  end

  private

  attr_accessor :mappings

  # Builds a Mapping from the collected lines and adds each of its FIELDS
  # to totals. Returns the Mapping.
  def consume_mapping(map_lines, totals)
    m = Mapping.new(map_lines)

    Mapping::FIELDS.each do |field|
      totals[field] += m.send(field)
    end

    m
  end

  # Adds NaN to every field of totals.
  def create_memstats_not_available(totals)
    Mapping::FIELDS.each do |field|
      totals[field] += Float::NAN
    end
  end

  # Reads the smaps file line by line, grouping each header line with the
  # "Key: value" lines that follow it, and aggregates every group.
  def populate_info # rubocop:disable Metrics/MethodLength
    File.open("/proc/#{@pid}/smaps") do |smaps|
      map_lines = []

      smaps.each_line do |raw_line|
        line = raw_line.strip

        case line
        when /\w+:\s+/
          # "Key: value" line belonging to the current mapping
          map_lines << line
        when /[0-9a-f]+:[0-9a-f]+\s+/
          # Header of the next mapping: the previous one is complete
          flush_mapping(map_lines)
          map_lines << line
        else
          break
        end
      end

      # The last mapping has no following header to trigger its
      # aggregation, so it is flushed here.
      flush_mapping(map_lines)
    end
  rescue => e
    # Diagnostics go to stderr rather than stdout.
    warn "Error: #{e}"
    @valid = false
    create_memstats_not_available(totals)
  end

  # Aggregates the pending lines (if any) as one mapping and empties the
  # buffer.
  def flush_mapping(map_lines)
    return if map_lines.empty?

    mappings << consume_mapping(map_lines, totals)
    map_lines.clear
  end
end
96
+ end
97
+ end
98
+ end