gitlab-monitor 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -0
  3. data/.gitlab-ci.yml +18 -0
  4. data/.rubocop.yml +34 -0
  5. data/CONTRIBUTING.md +651 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +75 -0
  8. data/LICENSE +25 -0
  9. data/README.md +110 -0
  10. data/bin/gitlab-mon +17 -0
  11. data/config/gitlab-monitor.yml.example +112 -0
  12. data/gitlab-monitor.gemspec +33 -0
  13. data/lib/gitlab_monitor.rb +18 -0
  14. data/lib/gitlab_monitor/cli.rb +341 -0
  15. data/lib/gitlab_monitor/database.rb +13 -0
  16. data/lib/gitlab_monitor/database/base.rb +44 -0
  17. data/lib/gitlab_monitor/database/bloat.rb +74 -0
  18. data/lib/gitlab_monitor/database/bloat_btree.sql +84 -0
  19. data/lib/gitlab_monitor/database/bloat_table.sql +63 -0
  20. data/lib/gitlab_monitor/database/ci_builds.rb +527 -0
  21. data/lib/gitlab_monitor/database/remote_mirrors.rb +74 -0
  22. data/lib/gitlab_monitor/database/row_count.rb +164 -0
  23. data/lib/gitlab_monitor/database/tuple_stats.rb +53 -0
  24. data/lib/gitlab_monitor/git.rb +144 -0
  25. data/lib/gitlab_monitor/memstats.rb +98 -0
  26. data/lib/gitlab_monitor/memstats/mapping.rb +91 -0
  27. data/lib/gitlab_monitor/prober.rb +40 -0
  28. data/lib/gitlab_monitor/process.rb +122 -0
  29. data/lib/gitlab_monitor/prometheus.rb +64 -0
  30. data/lib/gitlab_monitor/sidekiq.rb +149 -0
  31. data/lib/gitlab_monitor/sidekiq_queue_job_stats.lua +42 -0
  32. data/lib/gitlab_monitor/util.rb +83 -0
  33. data/lib/gitlab_monitor/version.rb +5 -0
  34. data/lib/gitlab_monitor/web_exporter.rb +77 -0
  35. data/spec/cli_spec.rb +31 -0
  36. data/spec/database/bloat_spec.rb +99 -0
  37. data/spec/database/ci_builds_spec.rb +421 -0
  38. data/spec/database/row_count_spec.rb +37 -0
  39. data/spec/fixtures/smaps/sample.txt +10108 -0
  40. data/spec/git_process_proper_spec.rb +27 -0
  41. data/spec/git_spec.rb +52 -0
  42. data/spec/memstats_spec.rb +28 -0
  43. data/spec/prometheus_metrics_spec.rb +17 -0
  44. data/spec/spec_helper.rb +63 -0
  45. data/spec/util_spec.rb +15 -0
  46. metadata +225 -0
@@ -0,0 +1,13 @@
1
module GitLab
  module Monitor
    # Database-related classes.
    #
    # Each collector/prober is registered lazily so that requiring
    # "gitlab_monitor/database" stays cheap; the file is only loaded
    # on first constant reference.
    module Database
      {
        Base: "base",
        CiBuildsProber: "ci_builds",
        TuplesProber: "tuple_stats",
        RowCountProber: "row_count",
        BloatProber: "bloat",
        RemoteMirrorsProber: "remote_mirrors"
      }.each do |const_name, file_name|
        autoload const_name, "gitlab_monitor/database/#{file_name}"
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
require "pg"
require "connection_pool"

module GitLab
  module Monitor
    module Database
      # An abstract class for interacting with the DB.
      #
      # It takes a connection string (e.g. "dbname=test port=5432") and
      # hands out pooled PG connections. One pool is kept per distinct
      # connection string and shared across all collector instances.
      class Base
        # Lazily-built map of connection string => ConnectionPool.
        # The pool is intentionally small (3 connections, 5s checkout
        # timeout) since probes run sequentially.
        def self.connection_pool
          @connection_pool ||= Hash.new do |h, connection_string|
            h[connection_string] = ConnectionPool.new(size: 3, timeout: 5) do
              PG.connect(connection_string)
            end
          end
        end

        # @param args [Hash] expects :connection_string
        def initialize(args)
          @connection_string = args[:connection_string]
        end

        # Subclasses must implement the actual collection logic.
        #
        # Fixed: the original raised the undefined constant `NotImplemented`
        # (a NameError at raise time) instead of the standard
        # NotImplementedError.
        def run
          raise NotImplementedError, "#{self.class}#run must be implemented"
        end

        # The pool associated with this instance's connection string.
        def connection_pool
          self.class.connection_pool[@connection_string]
        end

        # Checks a connection out of the pool and yields it. If the server
        # connection went away (PG::UnableToSend), the socket is reset
        # before re-raising so the next checkout gets a usable connection.
        def with_connection_pool
          connection_pool.with do |conn|
            begin
              yield conn
            rescue PG::UnableToSend
              conn.reset
              raise
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
module GitLab
  module Monitor
    module Database
      # Helper to collect bloat metrics.
      class BloatCollector < Base
        # Runs the bloat estimation query for +type+ (:btree or :table)
        # and returns a hash of result rows keyed by "object_name".
        def run(type = :btree)
          result = {}
          execute(self.class.query_for(type)).each do |row|
            result[row["object_name"]] = row
          end
          result
        end

        private

        # Executes +query+ on a pooled connection.
        def execute(query)
          with_connection_pool { |conn| conn.exec(query) }
        end

        class << self
          # Reads (and memoizes) the SQL file for the given bloat type,
          # e.g. bloat_btree.sql next to this file.
          def query_for(type)
            @queries ||= {}
            @queries[type] ||= begin
              file = File.join(__dir__, "bloat_#{type}.sql")
              fail "Unknown bloat query file: #{file}" unless File.exist?(file)

              File.read(file)
            end
          end
        end
      end

      # Prober class to gather bloat metrics
      class BloatProber
        METRIC_KEYS = %w(bloat_ratio bloat_size extra_size real_size).freeze

        attr_reader :metrics, :collector, :bloat_types

        def initialize(opts,
                       metrics: PrometheusMetrics.new,
                       collector: BloatCollector.new(connection_string: opts[:connection_string]))
          @metrics = metrics
          @collector = collector
          @bloat_types = opts[:bloat_types] || %i(btree table)
        end

        # Probes every configured bloat type.
        def probe_db
          bloat_types.each { |type| probe_for_type(type) }
        end

        # Writes the accumulated metrics text to +target+.
        def write_to(target)
          target.write(metrics.to_s)
        end

        private

        # Collects one bloat type and records one gauge per METRIC_KEYS
        # column, labelled with the bloated object's name. A dead database
        # connection is tolerated silently (no metrics recorded).
        def probe_for_type(type)
          collector.run(type).each_pair do |query_name, data|
            METRIC_KEYS.each do |key|
              metrics.add("gitlab_database_bloat_#{type}_#{key}", data[key], query_name: query_name)
            end
          end

          self
        rescue PG::ConnectionBad
          self
        end
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
-- Originally from: https://github.com/ioguix/pgsql-bloat-estimation/blob/master/btree/btree_bloat.sql
-- WARNING: executed with a non-superuser role, the query inspect only index on tables you are granted to read.
-- WARNING: rows with is_na = 't' are known to have bad statistics ("name" type is not supported).
-- This query is compatible with PostgreSQL 8.2 and after
-- NOTE: columns consumed downstream by BloatCollector: object_name (row key),
-- plus bloat_ratio, bloat_size, extra_size, real_size.
SELECT current_database(), nspname AS schemaname, tblname, idxname AS object_name, bs*(relpages)::bigint AS real_size,
  bs*(relpages-est_pages)::bigint AS extra_size,
  100 * (relpages-est_pages)::float / relpages AS extra_ratio,
  fillfactor,
  CASE WHEN relpages > est_pages_ff
    THEN bs*(relpages-est_pages_ff)
    ELSE 0
  END AS bloat_size,
  100 * (relpages-est_pages_ff)::float / relpages AS bloat_ratio,
  is_na
  -- , 100-(sub.pst).avg_leaf_density, est_pages, index_tuple_hdr_bm, maxalign, pagehdr, nulldatawidth, nulldatahdrwidth, sub.reltuples, sub.relpages -- (DEBUG INFO)
FROM (
  SELECT coalesce(1 +
      ceil(reltuples/floor((bs-pageopqdata-pagehdr)/(4+nulldatahdrwidth)::float)), 0 -- ItemIdData size + computed avg size of a tuple (nulldatahdrwidth)
    ) AS est_pages,
    coalesce(1 +
      ceil(reltuples/floor((bs-pageopqdata-pagehdr)*fillfactor/(100*(4+nulldatahdrwidth)::float))), 0
    ) AS est_pages_ff,
    bs, nspname, table_oid, tblname, idxname, relpages, fillfactor, is_na
    -- , stattuple.pgstatindex(quote_ident(nspname)||'.'||quote_ident(idxname)) AS pst, index_tuple_hdr_bm, maxalign, pagehdr, nulldatawidth, nulldatahdrwidth, reltuples -- (DEBUG INFO)
  FROM (
    SELECT maxalign, bs, nspname, tblname, idxname, reltuples, relpages, relam, table_oid, fillfactor,
      ( index_tuple_hdr_bm +
          maxalign - CASE -- Add padding to the index tuple header to align on MAXALIGN
            WHEN index_tuple_hdr_bm%maxalign = 0 THEN maxalign
            ELSE index_tuple_hdr_bm%maxalign
          END
        + nulldatawidth + maxalign - CASE -- Add padding to the data to align on MAXALIGN
            WHEN nulldatawidth = 0 THEN 0
            WHEN nulldatawidth::integer%maxalign = 0 THEN maxalign
            ELSE nulldatawidth::integer%maxalign
          END
      )::numeric AS nulldatahdrwidth, pagehdr, pageopqdata, is_na
      -- , index_tuple_hdr_bm, nulldatawidth -- (DEBUG INFO)
    FROM (
      SELECT
        i.nspname, i.tblname, i.idxname, i.reltuples, i.relpages, i.relam, a.attrelid AS table_oid,
        current_setting('block_size')::numeric AS bs, fillfactor,
        CASE -- MAXALIGN: 4 on 32bits, 8 on 64bits (and mingw32 ?)
          WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' THEN 8
          ELSE 4
        END AS maxalign,
        /* per page header, fixed size: 20 for 7.X, 24 for others */
        24 AS pagehdr,
        /* per page btree opaque data */
        16 AS pageopqdata,
        /* per tuple header: add IndexAttributeBitMapData if some cols are null-able */
        CASE WHEN max(coalesce(s.null_frac,0)) = 0
          THEN 2 -- IndexTupleData size
          ELSE 2 + (( 32 + 8 - 1 ) / 8) -- IndexTupleData size + IndexAttributeBitMapData size ( max num filed per index + 8 - 1 /8)
        END AS index_tuple_hdr_bm,
        /* data len: we remove null values save space using it fractionnal part from stats */
        sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024)) AS nulldatawidth,
        max( CASE WHEN a.atttypid = 'pg_catalog.name'::regtype THEN 1 ELSE 0 END ) > 0 AS is_na
      FROM pg_attribute AS a
      JOIN (
        SELECT nspname, tbl.relname AS tblname, idx.relname AS idxname, idx.reltuples, idx.relpages, idx.relam,
          indrelid, indexrelid, indkey::smallint[] AS attnum,
          coalesce(substring(
            array_to_string(idx.reloptions, ' ')
            from 'fillfactor=([0-9]+)')::smallint, 90) AS fillfactor
        FROM pg_index
        JOIN pg_class idx ON idx.oid=pg_index.indexrelid
        JOIN pg_class tbl ON tbl.oid=pg_index.indrelid
        JOIN pg_namespace ON pg_namespace.oid = idx.relnamespace
        WHERE pg_index.indisvalid AND tbl.relkind = 'r' AND idx.relpages > 0
      ) AS i ON a.attrelid = i.indexrelid
      JOIN pg_stats AS s ON s.schemaname = i.nspname
        AND ((s.tablename = i.tblname AND s.attname = pg_catalog.pg_get_indexdef(a.attrelid, a.attnum, TRUE)) -- stats from tbl
        OR (s.tablename = i.idxname AND s.attname = a.attname))-- stats from functionnal cols
      JOIN pg_type AS t ON a.atttypid = t.oid
      WHERE a.attnum > 0
      GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9
    ) AS s1
  ) AS s2
  JOIN pg_am am ON s2.relam = am.oid WHERE am.amname = 'btree'
) AS sub
WHERE NOT is_na
AND nspname = 'public'
ORDER BY 2,3,4;
@@ -0,0 +1,63 @@
1
-- Originally from: https://github.com/ioguix/pgsql-bloat-estimation/blob/master/table/table_bloat.sql
/* WARNING: executed with a non-superuser role, the query inspect only tables you are granted to read.
 * This query is compatible with PostgreSQL 9.0 and more
 */
-- NOTE: columns consumed downstream by BloatCollector: object_name (row key),
-- plus bloat_ratio, bloat_size, extra_size, real_size.
SELECT current_database(), schemaname, tblname AS object_name, bs*tblpages AS real_size,
  (tblpages-est_tblpages)*bs AS extra_size,
  CASE WHEN tblpages - est_tblpages > 0
    THEN 100 * (tblpages - est_tblpages)/tblpages::float
    ELSE 0
  END AS extra_ratio, fillfactor,
  CASE WHEN tblpages - est_tblpages_ff > 0
    THEN (tblpages-est_tblpages_ff)*bs
    ELSE 0
  END AS bloat_size,
  CASE WHEN tblpages - est_tblpages_ff > 0
    THEN 100 * (tblpages - est_tblpages_ff)/tblpages::float
    ELSE 0
  END AS bloat_ratio, is_na
  -- , (pst).free_percent + (pst).dead_tuple_percent AS real_frag
FROM (
  SELECT ceil( reltuples / ( (bs-page_hdr)/tpl_size ) ) + ceil( toasttuples / 4 ) AS est_tblpages,
    ceil( reltuples / ( (bs-page_hdr)*fillfactor/(tpl_size*100) ) ) + ceil( toasttuples / 4 ) AS est_tblpages_ff,
    tblpages, fillfactor, bs, tblid, schemaname, tblname, heappages, toastpages, is_na
    -- , stattuple.pgstattuple(tblid) AS pst
  FROM (
    SELECT
      ( 4 + tpl_hdr_size + tpl_data_size + (2*ma)
        - CASE WHEN tpl_hdr_size%ma = 0 THEN ma ELSE tpl_hdr_size%ma END
        - CASE WHEN ceil(tpl_data_size)::int%ma = 0 THEN ma ELSE ceil(tpl_data_size)::int%ma END
      ) AS tpl_size, bs - page_hdr AS size_per_block, (heappages + toastpages) AS tblpages, heappages,
      toastpages, reltuples, toasttuples, bs, page_hdr, tblid, schemaname, tblname, fillfactor, is_na
    FROM (
      SELECT
        tbl.oid AS tblid, ns.nspname AS schemaname, tbl.relname AS tblname, tbl.reltuples,
        tbl.relpages AS heappages, coalesce(toast.relpages, 0) AS toastpages,
        coalesce(toast.reltuples, 0) AS toasttuples,
        coalesce(substring(
          array_to_string(tbl.reloptions, ' ')
          FROM 'fillfactor=([0-9]+)')::smallint, 100) AS fillfactor,
        current_setting('block_size')::numeric AS bs,
        CASE WHEN version()~'mingw32' OR version()~'64-bit|x86_64|ppc64|ia64|amd64' THEN 8 ELSE 4 END AS ma,
        24 AS page_hdr,
        23 + CASE WHEN MAX(coalesce(null_frac,0)) > 0 THEN ( 7 + count(*) ) / 8 ELSE 0::int END
          + CASE WHEN tbl.relhasoids THEN 4 ELSE 0 END AS tpl_hdr_size,
        sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024) ) AS tpl_data_size,
        bool_or(att.atttypid = 'pg_catalog.name'::regtype)
          OR count(att.attname) <> count(s.attname) AS is_na
      FROM pg_attribute AS att
      JOIN pg_class AS tbl ON att.attrelid = tbl.oid
      JOIN pg_namespace AS ns ON ns.oid = tbl.relnamespace
      LEFT JOIN pg_stats AS s ON s.schemaname=ns.nspname
        AND s.tablename = tbl.relname AND s.inherited=false AND s.attname=att.attname
      LEFT JOIN pg_class AS toast ON tbl.reltoastrelid = toast.oid
      WHERE att.attnum > 0 AND NOT att.attisdropped
        AND tbl.relkind = 'r'
      GROUP BY 1,2,3,4,5,6,7,8,9,10, tbl.relhasoids
      ORDER BY 2,3
    ) AS s
  ) AS s2
) AS s3
WHERE NOT is_na
  -- AND tblpages*((pst).free_percent + (pst).dead_tuple_percent)::float4/100 >= 1
AND schemaname= 'public';
@@ -0,0 +1,527 @@
1
module GitLab
  module Monitor
    module Database
      # A helper class to collect CI builds metrics.
      #
      # The EE query variants additionally join namespaces /
      # namespace_statistics / application_settings to compute a
      # `has_minutes` flag; they are used when the `projects.mirror`
      # column exists (see #mirror_column?), the CE variants otherwise.
      class CiBuildsCollector < Base # rubocop:disable Metrics/ClassLength
        # Applied with SET LOCAL inside each query transaction so the
        # planner favours index scans for these large aggregations.
        SET_RANDOM_PAGE_COST = "SET LOCAL random_page_cost TO 1".freeze

        # Builds per namespace/status (EE). '%s' is the status filter.
        BUILDS_QUERY_EE =
          <<~SQL.freeze
            SELECT
              projects.namespace_id,
              ci_builds.status,
              projects.shared_runners_enabled,
              (COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) = 0 OR
               COALESCE(namespace_statistics.shared_runners_seconds, 0) < COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) * 60) as has_minutes,
              COUNT(*) AS count
            FROM ci_builds
            JOIN projects
              ON projects.id = ci_builds.project_id
            JOIN namespaces
              ON namespaces.id = projects.namespace_id
            LEFT JOIN namespace_statistics
              ON namespace_statistics.namespace_id = namespaces.id
            JOIN application_settings
              ON application_settings.id = 1
            WHERE ci_builds.type = 'Ci::Build'
              AND ci_builds.status = '%s'
              AND projects.pending_delete = 'f'
            GROUP BY
              projects.namespace_id,
              ci_builds.status,
              projects.shared_runners_enabled,
              namespaces.shared_runners_minutes_limit,
              namespace_statistics.shared_runners_seconds,
              application_settings.shared_runners_minutes
          SQL

        # Builds per namespace/status (CE). '%s' is the status filter.
        BUILDS_QUERY_CE =
          <<~SQL.freeze
            SELECT
              projects.namespace_id,
              ci_builds.status,
              projects.shared_runners_enabled,
              COUNT(*) AS count
            FROM ci_builds
            JOIN projects
              ON projects.id = ci_builds.project_id
            WHERE ci_builds.type = 'Ci::Build'
              AND ci_builds.status = '%s'
              AND projects.pending_delete = 'f'
            GROUP BY
              projects.namespace_id,
              ci_builds.status,
              projects.shared_runners_enabled
          SQL

        # Count of builds stuck in 'running' for more than an hour.
        STALE_BUILDS_QUERY =
          <<~SQL.freeze
            SELECT
              COUNT(*) AS count
            FROM ci_builds
            JOIN projects
              ON projects.id = ci_builds.project_id
            WHERE ci_builds.type = 'Ci::Build'
              AND ci_builds.status = 'running'
              AND ci_builds.updated_at < NOW() - INTERVAL '1 hour'
              AND projects.pending_delete = 'f'
          SQL

        # Running builds broken down per runner (EE).
        PER_RUNNER_QUERY_EE =
          <<~SQL.freeze
            SELECT
              ci_builds.runner_id,
              ci_runners.is_shared,
              projects.namespace_id,
              projects.mirror,
              projects.mirror_trigger_builds,
              ci_pipelines.pipeline_schedule_id,
              ci_builds.trigger_request_id,
              (COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) = 0 OR
               COALESCE(namespace_statistics.shared_runners_seconds, 0) < COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) * 60) as has_minutes,
              COUNT(*) AS count
            FROM ci_builds
            JOIN ci_runners
              ON ci_runners.id = ci_builds.runner_id
            JOIN projects
              ON projects.id = ci_builds.project_id
            JOIN ci_pipelines
              ON ci_pipelines.id = ci_builds.commit_id
            JOIN namespaces
              ON namespaces.id = projects.namespace_id
            LEFT JOIN namespace_statistics
              ON namespace_statistics.namespace_id = namespaces.id
            JOIN application_settings
              ON application_settings.id = 1
            WHERE ci_builds.type = 'Ci::Build'
              AND ci_builds.status = 'running'
              AND projects.pending_delete = 'f'
            GROUP BY
              ci_builds.runner_id,
              ci_runners.is_shared,
              projects.namespace_id,
              projects.mirror,
              projects.mirror_trigger_builds,
              ci_pipelines.pipeline_schedule_id,
              ci_builds.trigger_request_id,
              namespaces.shared_runners_minutes_limit,
              namespace_statistics.shared_runners_seconds,
              application_settings.shared_runners_minutes
          SQL

        # Running builds broken down per runner (CE).
        PER_RUNNER_QUERY_CE =
          <<~SQL.freeze
            SELECT
              ci_builds.runner_id,
              ci_runners.is_shared,
              projects.namespace_id,
              ci_pipelines.pipeline_schedule_id,
              ci_builds.trigger_request_id,
              COUNT(*) AS count
            FROM ci_builds
            JOIN ci_runners
              ON ci_runners.id = ci_builds.runner_id
            JOIN projects
              ON projects.id = ci_builds.project_id
            JOIN ci_pipelines
              ON ci_pipelines.id = ci_builds.commit_id
            WHERE ci_builds.type = 'Ci::Build'
              AND ci_builds.status = 'running'
              AND projects.pending_delete = 'f'
            GROUP BY
              ci_builds.runner_id,
              ci_runners.is_shared,
              projects.namespace_id,
              ci_pipelines.pipeline_schedule_id,
              ci_builds.trigger_request_id
          SQL

        # Schema probe: does projects.mirror exist? (EE vs CE detection)
        MIRROR_COLUMN_QUERY =
          <<~SQL.freeze
            SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='projects' AND column_name='mirror')
          SQL

        # Active builds sharing identical commands within a pipeline (EE).
        # '%d' is the repetition threshold.
        REPEATED_COMMANDS_QUERY_EE =
          <<~SQL.freeze
            SELECT
              subquery.namespace_id,
              subquery.shared_runners_enabled,
              subquery.project_id,
              subquery.status,
              subquery.has_minutes,
              MAX(subquery.count) as count
            FROM (
              SELECT
                projects.namespace_id,
                projects.shared_runners_enabled,
                ci_builds.project_id,
                ci_builds.commit_id,
                ci_builds.status,
                (COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) = 0 OR
                 COALESCE(namespace_statistics.shared_runners_seconds, 0) < COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) * 60) as has_minutes,
                COUNT(*) AS count
              FROM ci_builds
              JOIN projects
                ON projects.id = ci_builds.project_id
              JOIN namespaces
                ON namespaces.id = projects.namespace_id
              LEFT JOIN namespace_statistics
                ON namespace_statistics.namespace_id = namespaces.id
              JOIN application_settings
                ON application_settings.id = 1
              WHERE ci_builds.type = 'Ci::Build'
                AND ci_builds.status IN ('running', 'pending')
              GROUP BY
                projects.namespace_id,
                projects.shared_runners_enabled,
                ci_builds.project_id,
                ci_builds.commit_id,
                ci_builds.status,
                ci_builds.commands,
                namespaces.shared_runners_minutes_limit,
                namespace_statistics.shared_runners_seconds,
                application_settings.shared_runners_minutes
              HAVING COUNT(*) > %d
            ) AS subquery
            GROUP BY
              subquery.namespace_id,
              subquery.shared_runners_enabled,
              subquery.project_id,
              subquery.commit_id,
              subquery.status,
              subquery.has_minutes
          SQL

        # Active builds sharing identical commands within a pipeline (CE).
        # '%d' is the repetition threshold.
        REPEATED_COMMANDS_QUERY_CE =
          <<~SQL.freeze
            SELECT
              subquery.namespace_id,
              subquery.shared_runners_enabled,
              subquery.project_id,
              subquery.status,
              MAX(subquery.count) as count
            FROM (
              SELECT
                projects.namespace_id,
                projects.shared_runners_enabled,
                ci_builds.project_id,
                ci_builds.commit_id,
                ci_builds.status,
                COUNT(*) AS count
              FROM ci_builds
              JOIN projects
                ON projects.id = ci_builds.project_id
              JOIN namespaces
                ON namespaces.id = projects.namespace_id
              WHERE ci_builds.type = 'Ci::Build'
                AND ci_builds.status IN ('running', 'pending')
              GROUP BY
                projects.namespace_id,
                projects.shared_runners_enabled,
                ci_builds.project_id,
                ci_builds.commit_id,
                ci_builds.status,
                ci_builds.commands
              HAVING COUNT(*) > %d
            ) AS subquery
            GROUP BY
              subquery.namespace_id,
              subquery.shared_runners_enabled,
              subquery.project_id,
              subquery.commit_id,
              subquery.status
          SQL

        # Finished builds older than '%s' that still have live trace
        # chunks but no archived trace artifact (file_type = 3).
        UNARCHIVED_TRACES_QUERY =
          <<~SQL.freeze
            SELECT
              COUNT(*) as count
            FROM ci_builds
            JOIN ci_build_trace_chunks
              ON ci_build_trace_chunks.build_id = ci_builds.id
            LEFT JOIN ci_job_artifacts
              ON ci_job_artifacts.job_id = ci_builds.id
              AND ci_job_artifacts.file_type = 3
            WHERE ci_builds.type = 'Ci::Build'
              AND ci_builds.status IN ('success', 'failed', 'canceled')
              AND ci_builds.finished_at < '%s'
              AND ci_job_artifacts.job_id IS NULL
          SQL

        STATUS_CREATED = "created".freeze
        STATUS_PENDING = "pending".freeze

        # Default lookback window for unarchived traces: 24 hours.
        DEFAULT_UNARCHIVED_TRACES_OFFSET_MINUTES = 1440

        # @param opts [Hash] :connection_string plus optional
        #   :allowed_repeated_commands_count,
        #   :created_builds_counting_disabled,
        #   :unarchived_traces_offset_minutes
        def initialize(opts)
          super(opts)

          @allowed_repeated_commands_count = opts[:allowed_repeated_commands_count]
          @created_builds_counting_disabled = opts[:created_builds_counting_disabled]
          @unarchived_traces_offset_minutes = opts[:unarchived_traces_offset_minutes]
        end

        # Runs every collection and returns a hash of result sets.
        def run
          results = {}
          results[:created_builds] = builds(STATUS_CREATED) unless @created_builds_counting_disabled
          results[:pending_builds] = builds(STATUS_PENDING)
          results[:stale_builds] = stale_builds
          results[:per_runner] = per_runner_builds
          results[:repeated_commands] = repeated_commands
          results[:unarchived_traces] = unarchived_traces
          results
        end

        private

        # Counts builds in +status+ per namespace. Missing tables/columns
        # (schema drift) degrade to whatever was collected so far.
        def builds(status)
          results = []

          query = mirror_column? ? BUILDS_QUERY_EE : BUILDS_QUERY_CE
          query = query % [status] # rubocop:disable Style/FormatString
          exec_query_with_custom_random_page_cost(query).each do |row|
            results << transform_builds_row_to_values(row)
          end

          results
        rescue PG::UndefinedTable, PG::UndefinedColumn
          results
        end

        # Maps a builds row to a labels+value hash, adding EE fields
        # when present.
        def transform_builds_row_to_values(row)
          values = { namespace: row["namespace_id"].to_s,
                     shared_runners: row["shared_runners_enabled"] == "t" ? "yes" : "no",
                     value: row["count"].to_i }
          include_ee_fields(values, row)
        end

        # Count of builds running for over an hour; 0 on schema drift.
        def stale_builds
          with_connection_pool do |conn|
            conn.exec(STALE_BUILDS_QUERY)[0]["count"].to_i
          end
        rescue PG::UndefinedTable, PG::UndefinedColumn
          0
        end

        # Running builds per runner; [] on schema drift.
        def per_runner_builds
          results = []

          query = mirror_column? ? PER_RUNNER_QUERY_EE : PER_RUNNER_QUERY_CE
          exec_query_with_custom_random_page_cost(query).each do |row|
            results << transform_per_runners_builds_row_to_values(row)
          end

          results
        rescue PG::UndefinedTable, PG::UndefinedColumn
          []
        end

        # Maps a per-runner row to a labels+value hash.
        def transform_per_runners_builds_row_to_values(row)
          values = { runner: row["runner_id"].to_s,
                     shared_runner: row["is_shared"] == "t" ? "yes" : "no",
                     namespace: row["namespace_id"].to_s,
                     scheduled: row["pipeline_schedule_id"] ? "yes" : "no",
                     triggered: row["trigger_request_id"] ? "yes" : "no",
                     value: row["count"].to_i }
          include_ee_fields(values, row)
        end

        # Active builds with repeated identical commands; [] on schema drift.
        def repeated_commands
          results = []

          query = mirror_column? ? REPEATED_COMMANDS_QUERY_EE : REPEATED_COMMANDS_QUERY_CE
          query = query % [allowed_repeated_commands_count] # rubocop:disable Style/FormatString
          exec_query_with_custom_random_page_cost(query).each do |row|
            results << transform_repeated_commands_row_to_values(row)
          end

          results
        rescue PG::UndefinedTable, PG::UndefinedColumn
          []
        end

        # Threshold for "repeated commands"; defaults to 2 when the
        # option was not provided (nil).
        def allowed_repeated_commands_count
          @allowed_repeated_commands_count ||= 2
        end

        # Maps a repeated-commands row to a labels+value hash.
        def transform_repeated_commands_row_to_values(row)
          values = { namespace: row["namespace_id"].to_s,
                     project: row["project_id"].to_s,
                     shared_runners: row["shared_runners_enabled"] == "t" ? "yes" : "no",
                     status: row["status"].to_s,
                     value: row["count"].to_i }

          include_has_minutes_field(values, row)
        end

        # Count of finished builds with unarchived traces older than the
        # configured offset; 0 on schema drift.
        def unarchived_traces
          time = Time.now - (unarchived_traces_offset_minutes * 60)
          query = UNARCHIVED_TRACES_QUERY % [time.strftime("%F %T")] # rubocop:disable Style/FormatString

          with_connection_pool do |conn|
            conn.exec(query)[0]["count"].to_i
          end
        rescue PG::UndefinedTable, PG::UndefinedColumn
          0
        end

        # Lookback window in minutes; defaults when the option was nil.
        def unarchived_traces_offset_minutes
          @unarchived_traces_offset_minutes ||= DEFAULT_UNARCHIVED_TRACES_OFFSET_MINUTES
        end

        # Adds EE-only boolean labels (mirror, mirror_trigger_builds,
        # has_minutes) when the row carries them. Returns +values+.
        def include_ee_fields(values, row)
          values.merge!(include_bool_if_row_defined(row, :mirror))
          values.merge!(include_bool_if_row_defined(row, :mirror_trigger_builds))
          include_has_minutes_field(values, row)
        end

        # Adds the has_minutes label when present. Returns +values+.
        def include_has_minutes_field(values, row)
          values.merge!(include_bool_if_row_defined(row, :has_minutes))
          values
        end

        # {field => "yes"/"no"} when the row has a value for +field+,
        # {} otherwise ("t" is PG's boolean true literal).
        def include_bool_if_row_defined(row, field)
          return {} unless row[field.to_s]
          { field => row[field.to_s] == "t" ? "yes" : "no" }
        end

        # Runs +query+ inside a transaction with a lowered
        # random_page_cost (SET LOCAL reverts on commit/rollback).
        def exec_query_with_custom_random_page_cost(query)
          with_connection_pool do |conn|
            conn.transaction do |trans|
              trans.exec(SET_RANDOM_PAGE_COST)
              trans.exec(query)
            end
          end
        end

        # Whether the projects.mirror column exists (EE schema).
        #
        # Fixed: memoized with `defined?` so a `false` result is cached
        # too — the previous `@mirror_column ||=` re-ran the schema query
        # on every call for CE installations.
        def mirror_column?
          return @mirror_column if defined?(@mirror_column)

          @mirror_column =
            begin
              with_connection_pool do |conn|
                conn.exec(MIRROR_COLUMN_QUERY)[0]["exists"] == "t"
              end
            rescue PG::UndefinedColumn
              false
            end
        end
      end

      # The prober which is called when gathering metrics
      class CiBuildsProber
        def initialize(opts, metrics: PrometheusMetrics.new)
          @metrics = metrics

          collector_opts = { connection_string: opts[:connection_string],
                             allowed_repeated_commands_count: opts[:allowed_repeated_commands_count],
                             created_builds_counting_disabled: opts[:created_builds_counting_disabled],
                             unarchived_traces_offset_minutes: opts[:unarchived_traces_offset_minutes] }
          @collector = CiBuildsCollector.new(collector_opts)
        end

        # Collects all result sets and converts them to metrics. A dead
        # database connection is tolerated silently.
        def probe_db
          @results = @collector.run

          ci_builds_metrics(@results[:created_builds], "ci_created_builds") if @results[:created_builds]
          ci_builds_metrics(@results[:pending_builds], "ci_pending_builds")
          ci_stale_builds_metrics
          metrics_per_runner
          repeated_commands_metrics
          unarchived_traces_metrics

          self
        rescue PG::ConnectionBad
          self
        end

        # Writes the accumulated metrics text to +target+.
        def write_to(target)
          target.write(@metrics.to_s)
        end

        private

        # Emits one metric per namespace, aggregating namespaces with
        # small counts (< 10) into an unlabelled "other" bucket to keep
        # cardinality down.
        def ci_builds_metrics(results_list, metric_name)
          other_values = {}

          results_list.each do |metric|
            # If we have a low value, put the value into an "other" bucket.
            if metric[:value] < 10
              key = { shared_runners: metric[:shared_runners] }
              key[:has_minutes] = metric[:has_minutes] if metric[:has_minutes]

              other_values[key] ||= 0
              other_values[key] += metric[:value]
            else
              add_ci_created_pending_builds(metric_name, metric[:value], metric)
            end
          end

          # Add metrics for the "other" bucket.
          other_values.each { |key, value| add_ci_created_pending_builds(metric_name, value, key) }
        end

        def add_ci_created_pending_builds(metric_name, value, labels)
          add_metric_with_namespace_label(metric_name,
                                          [:namespace, :shared_runners, :has_minutes],
                                          value,
                                          labels)
        end

        def ci_stale_builds_metrics
          @metrics.add("ci_stale_builds", @results[:stale_builds])
        end

        # Same small-value "other" bucketing as ci_builds_metrics, keyed
        # by the full per-runner label set.
        def metrics_per_runner
          other_values = {}

          @results[:per_runner].each do |metric|
            # If we have a low value, put the value into an "other" bucket.
            if metric[:value] < 10
              key = { runner: metric[:runner], shared_runner: metric[:shared_runner],
                      scheduled: metric[:scheduled], triggered: metric[:triggered] }
              key[:mirror] = metric[:mirror] if metric[:mirror]
              key[:mirror_trigger_builds] = metric[:mirror_trigger_builds] if metric[:mirror_trigger_builds]
              key[:has_minutes] = metric[:has_minutes] if metric[:has_minutes]

              other_values[key] ||= 0
              other_values[key] += metric[:value]
            else
              add_ci_running_builds(metric[:value], metric)
            end
          end

          # Add metrics for the "other" bucket.
          other_values.each { |key, value| add_ci_running_builds(value, key) }
        end

        def add_ci_running_builds(value, labels)
          add_metric_with_namespace_label(
            "ci_running_builds",
            [:runner, :namespace, :shared_runner, :scheduled,
             :triggered, :mirror, :mirror_trigger_builds, :has_minutes],
            value,
            labels
          )
        end

        # Filters +labels+ down to +allowed_labels+ (sorted for a stable
        # label order) and records the metric; namespace defaults to "".
        def add_metric_with_namespace_label(metric_name, allowed_labels, value, labels)
          labels[:namespace] = "" unless labels[:namespace]

          selected_labels = labels.select { |k, _| allowed_labels.include?(k) }.sort.to_h
          @metrics.add(metric_name, value, selected_labels)
        end

        def repeated_commands_metrics
          @results[:repeated_commands].each do |metric|
            value = metric.delete(:value)

            @metrics.add("ci_repeated_commands_builds", value, metric)
          end
        end

        def unarchived_traces_metrics
          @metrics.add("ci_unarchived_traces", @results[:unarchived_traces])
        end
      end
    end
  end
end