gitlab-monitor 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +2 -0
  3. data/.gitlab-ci.yml +18 -0
  4. data/.rubocop.yml +34 -0
  5. data/CONTRIBUTING.md +651 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +75 -0
  8. data/LICENSE +25 -0
  9. data/README.md +110 -0
  10. data/bin/gitlab-mon +17 -0
  11. data/config/gitlab-monitor.yml.example +112 -0
  12. data/gitlab-monitor.gemspec +33 -0
  13. data/lib/gitlab_monitor.rb +18 -0
  14. data/lib/gitlab_monitor/cli.rb +341 -0
  15. data/lib/gitlab_monitor/database.rb +13 -0
  16. data/lib/gitlab_monitor/database/base.rb +44 -0
  17. data/lib/gitlab_monitor/database/bloat.rb +74 -0
  18. data/lib/gitlab_monitor/database/bloat_btree.sql +84 -0
  19. data/lib/gitlab_monitor/database/bloat_table.sql +63 -0
  20. data/lib/gitlab_monitor/database/ci_builds.rb +527 -0
  21. data/lib/gitlab_monitor/database/remote_mirrors.rb +74 -0
  22. data/lib/gitlab_monitor/database/row_count.rb +164 -0
  23. data/lib/gitlab_monitor/database/tuple_stats.rb +53 -0
  24. data/lib/gitlab_monitor/git.rb +144 -0
  25. data/lib/gitlab_monitor/memstats.rb +98 -0
  26. data/lib/gitlab_monitor/memstats/mapping.rb +91 -0
  27. data/lib/gitlab_monitor/prober.rb +40 -0
  28. data/lib/gitlab_monitor/process.rb +122 -0
  29. data/lib/gitlab_monitor/prometheus.rb +64 -0
  30. data/lib/gitlab_monitor/sidekiq.rb +149 -0
  31. data/lib/gitlab_monitor/sidekiq_queue_job_stats.lua +42 -0
  32. data/lib/gitlab_monitor/util.rb +83 -0
  33. data/lib/gitlab_monitor/version.rb +5 -0
  34. data/lib/gitlab_monitor/web_exporter.rb +77 -0
  35. data/spec/cli_spec.rb +31 -0
  36. data/spec/database/bloat_spec.rb +99 -0
  37. data/spec/database/ci_builds_spec.rb +421 -0
  38. data/spec/database/row_count_spec.rb +37 -0
  39. data/spec/fixtures/smaps/sample.txt +10108 -0
  40. data/spec/git_process_proper_spec.rb +27 -0
  41. data/spec/git_spec.rb +52 -0
  42. data/spec/memstats_spec.rb +28 -0
  43. data/spec/prometheus_metrics_spec.rb +17 -0
  44. data/spec/spec_helper.rb +63 -0
  45. data/spec/util_spec.rb +15 -0
  46. metadata +225 -0
@@ -0,0 +1,13 @@
1
module GitLab
  module Monitor
    # Database-related classes
    #
    # Each prober/collector is autoloaded on first reference so that simply
    # requiring this file stays cheap.
    module Database
      {
        Base: "gitlab_monitor/database/base",
        CiBuildsProber: "gitlab_monitor/database/ci_builds",
        TuplesProber: "gitlab_monitor/database/tuple_stats",
        RowCountProber: "gitlab_monitor/database/row_count",
        BloatProber: "gitlab_monitor/database/bloat",
        RemoteMirrorsProber: "gitlab_monitor/database/remote_mirrors"
      }.each do |const_name, path|
        autoload const_name, path
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require "pg"
2
+ require "connection_pool"
3
+
4
+ module GitLab
5
+ module Monitor
6
+ module Database
7
+ # An abstract class for interacting with DB
8
+ #
9
+ # It takes a connection string (e.g. "dbname=test port=5432")
10
+ class Base
11
+ def self.connection_pool
12
+ @connection_pool ||= Hash.new do |h, connection_string|
13
+ h[connection_string] = ConnectionPool.new(size: 3, timeout: 5) do
14
+ PG.connect(connection_string)
15
+ end
16
+ end
17
+ end
18
+
19
+ def initialize(args)
20
+ @connection_string = args[:connection_string]
21
+ end
22
+
23
+ def run
24
+ fail NotImplemented
25
+ end
26
+
27
+ def connection_pool
28
+ self.class.connection_pool[@connection_string]
29
+ end
30
+
31
+ def with_connection_pool
32
+ connection_pool.with do |conn|
33
+ begin
34
+ yield conn
35
+ rescue PG::UnableToSend => e
36
+ conn.reset
37
+ raise e
38
+ end
39
+ end
40
+ end
41
+ end
42
+ end
43
+ end
44
+ end
@@ -0,0 +1,74 @@
1
+ module GitLab
2
+ module Monitor
3
+ module Database
4
+ # Helper to collect bloat metrics.
5
+ class BloatCollector < Base
6
+ def run(type = :btree)
7
+ execute(self.class.query_for(type)).each_with_object({}) do |row, h|
8
+ h[row["object_name"]] = row
9
+ end
10
+ end
11
+
12
+ private
13
+
14
+ def execute(query)
15
+ with_connection_pool do |conn|
16
+ conn.exec(query)
17
+ end
18
+ end
19
+
20
+ class << self
21
+ def query_for(type)
22
+ @queries ||= {}
23
+
24
+ return @queries[type] if @queries[type]
25
+
26
+ file = File.join(__dir__, "bloat_#{type}.sql")
27
+ fail "Unknown bloat query file: #{file}" unless File.exist?(file)
28
+
29
+ @queries[type] = File.read(file)
30
+ end
31
+ end
32
+ end
33
+
34
+ # Prober class to gather bloat metrics
35
+ class BloatProber
36
+ METRIC_KEYS = %w(bloat_ratio bloat_size extra_size real_size).freeze
37
+
38
+ attr_reader :metrics, :collector, :bloat_types
39
+
40
+ def initialize(opts,
41
+ metrics: PrometheusMetrics.new,
42
+ collector: BloatCollector.new(connection_string: opts[:connection_string]))
43
+ @metrics = metrics
44
+ @collector = collector
45
+ @bloat_types = opts[:bloat_types] || %i(btree table)
46
+ end
47
+
48
+ def probe_db
49
+ bloat_types.each do |type|
50
+ probe_for_type(type)
51
+ end
52
+ end
53
+
54
+ def write_to(target)
55
+ target.write(metrics.to_s)
56
+ end
57
+
58
+ private
59
+
60
+ def probe_for_type(type)
61
+ collector.run(type).each do |query_name, data|
62
+ METRIC_KEYS.each do |key|
63
+ metrics.add("gitlab_database_bloat_#{type}_#{key}", data[key], query_name: query_name)
64
+ end
65
+ end
66
+
67
+ self
68
+ rescue PG::ConnectionBad
69
+ self
70
+ end
71
+ end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,84 @@
1
+ -- Originally from: https://github.com/ioguix/pgsql-bloat-estimation/blob/master/btree/btree_bloat.sql
2
+ -- WARNING: executed with a non-superuser role, the query inspect only index on tables you are granted to read.
3
+ -- WARNING: rows with is_na = 't' are known to have bad statistics ("name" type is not supported).
4
+ -- This query is compatible with PostgreSQL 8.2 and after
5
+ SELECT current_database(), nspname AS schemaname, tblname, idxname AS object_name, bs*(relpages)::bigint AS real_size,
6
+ bs*(relpages-est_pages)::bigint AS extra_size,
7
+ 100 * (relpages-est_pages)::float / relpages AS extra_ratio,
8
+ fillfactor,
9
+ CASE WHEN relpages > est_pages_ff
10
+ THEN bs*(relpages-est_pages_ff)
11
+ ELSE 0
12
+ END AS bloat_size,
13
+ 100 * (relpages-est_pages_ff)::float / relpages AS bloat_ratio,
14
+ is_na
15
+ -- , 100-(sub.pst).avg_leaf_density, est_pages, index_tuple_hdr_bm, maxalign, pagehdr, nulldatawidth, nulldatahdrwidth, sub.reltuples, sub.relpages -- (DEBUG INFO)
16
+ FROM (
17
+ SELECT coalesce(1 +
18
+ ceil(reltuples/floor((bs-pageopqdata-pagehdr)/(4+nulldatahdrwidth)::float)), 0 -- ItemIdData size + computed avg size of a tuple (nulldatahdrwidth)
19
+ ) AS est_pages,
20
+ coalesce(1 +
21
+ ceil(reltuples/floor((bs-pageopqdata-pagehdr)*fillfactor/(100*(4+nulldatahdrwidth)::float))), 0
22
+ ) AS est_pages_ff,
23
+ bs, nspname, table_oid, tblname, idxname, relpages, fillfactor, is_na
24
+ -- , stattuple.pgstatindex(quote_ident(nspname)||'.'||quote_ident(idxname)) AS pst, index_tuple_hdr_bm, maxalign, pagehdr, nulldatawidth, nulldatahdrwidth, reltuples -- (DEBUG INFO)
25
+ FROM (
26
+ SELECT maxalign, bs, nspname, tblname, idxname, reltuples, relpages, relam, table_oid, fillfactor,
27
+ ( index_tuple_hdr_bm +
28
+ maxalign - CASE -- Add padding to the index tuple header to align on MAXALIGN
29
+ WHEN index_tuple_hdr_bm%maxalign = 0 THEN maxalign
30
+ ELSE index_tuple_hdr_bm%maxalign
31
+ END
32
+ + nulldatawidth + maxalign - CASE -- Add padding to the data to align on MAXALIGN
33
+ WHEN nulldatawidth = 0 THEN 0
34
+ WHEN nulldatawidth::integer%maxalign = 0 THEN maxalign
35
+ ELSE nulldatawidth::integer%maxalign
36
+ END
37
+ )::numeric AS nulldatahdrwidth, pagehdr, pageopqdata, is_na
38
+ -- , index_tuple_hdr_bm, nulldatawidth -- (DEBUG INFO)
39
+ FROM (
40
+ SELECT
41
+ i.nspname, i.tblname, i.idxname, i.reltuples, i.relpages, i.relam, a.attrelid AS table_oid,
42
+ current_setting('block_size')::numeric AS bs, fillfactor,
43
+ CASE -- MAXALIGN: 4 on 32bits, 8 on 64bits (and mingw32 ?)
44
+ WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' THEN 8
45
+ ELSE 4
46
+ END AS maxalign,
47
+ /* per page header, fixed size: 20 for 7.X, 24 for others */
48
+ 24 AS pagehdr,
49
+ /* per page btree opaque data */
50
+ 16 AS pageopqdata,
51
+ /* per tuple header: add IndexAttributeBitMapData if some cols are null-able */
52
+ CASE WHEN max(coalesce(s.null_frac,0)) = 0
53
+ THEN 2 -- IndexTupleData size
54
+ ELSE 2 + (( 32 + 8 - 1 ) / 8) -- IndexTupleData size + IndexAttributeBitMapData size ( max num filed per index + 8 - 1 /8)
55
+ END AS index_tuple_hdr_bm,
56
+ /* data len: we remove null values save space using it fractionnal part from stats */
57
+ sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024)) AS nulldatawidth,
58
+ max( CASE WHEN a.atttypid = 'pg_catalog.name'::regtype THEN 1 ELSE 0 END ) > 0 AS is_na
59
+ FROM pg_attribute AS a
60
+ JOIN (
61
+ SELECT nspname, tbl.relname AS tblname, idx.relname AS idxname, idx.reltuples, idx.relpages, idx.relam,
62
+ indrelid, indexrelid, indkey::smallint[] AS attnum,
63
+ coalesce(substring(
64
+ array_to_string(idx.reloptions, ' ')
65
+ from 'fillfactor=([0-9]+)')::smallint, 90) AS fillfactor
66
+ FROM pg_index
67
+ JOIN pg_class idx ON idx.oid=pg_index.indexrelid
68
+ JOIN pg_class tbl ON tbl.oid=pg_index.indrelid
69
+ JOIN pg_namespace ON pg_namespace.oid = idx.relnamespace
70
+ WHERE pg_index.indisvalid AND tbl.relkind = 'r' AND idx.relpages > 0
71
+ ) AS i ON a.attrelid = i.indexrelid
72
+ JOIN pg_stats AS s ON s.schemaname = i.nspname
73
+ AND ((s.tablename = i.tblname AND s.attname = pg_catalog.pg_get_indexdef(a.attrelid, a.attnum, TRUE)) -- stats from tbl
74
+ OR (s.tablename = i.idxname AND s.attname = a.attname)) -- stats from functional cols
75
+ JOIN pg_type AS t ON a.atttypid = t.oid
76
+ WHERE a.attnum > 0
77
+ GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9
78
+ ) AS s1
79
+ ) AS s2
80
+ JOIN pg_am am ON s2.relam = am.oid WHERE am.amname = 'btree'
81
+ ) AS sub
82
+ WHERE NOT is_na
83
+ AND nspname = 'public'
84
+ ORDER BY 2,3,4;
@@ -0,0 +1,63 @@
1
+ -- Originally from: https://github.com/ioguix/pgsql-bloat-estimation/blob/master/table/table_bloat.sql
2
+ /* WARNING: executed with a non-superuser role, the query inspect only tables you are granted to read.
3
+ * This query is compatible with PostgreSQL 9.0 and more
4
+ */
5
+ SELECT current_database(), schemaname, tblname AS object_name, bs*tblpages AS real_size,
6
+ (tblpages-est_tblpages)*bs AS extra_size,
7
+ CASE WHEN tblpages - est_tblpages > 0
8
+ THEN 100 * (tblpages - est_tblpages)/tblpages::float
9
+ ELSE 0
10
+ END AS extra_ratio, fillfactor,
11
+ CASE WHEN tblpages - est_tblpages_ff > 0
12
+ THEN (tblpages-est_tblpages_ff)*bs
13
+ ELSE 0
14
+ END AS bloat_size,
15
+ CASE WHEN tblpages - est_tblpages_ff > 0
16
+ THEN 100 * (tblpages - est_tblpages_ff)/tblpages::float
17
+ ELSE 0
18
+ END AS bloat_ratio, is_na
19
+ -- , (pst).free_percent + (pst).dead_tuple_percent AS real_frag
20
+ FROM (
21
+ SELECT ceil( reltuples / ( (bs-page_hdr)/tpl_size ) ) + ceil( toasttuples / 4 ) AS est_tblpages,
22
+ ceil( reltuples / ( (bs-page_hdr)*fillfactor/(tpl_size*100) ) ) + ceil( toasttuples / 4 ) AS est_tblpages_ff,
23
+ tblpages, fillfactor, bs, tblid, schemaname, tblname, heappages, toastpages, is_na
24
+ -- , stattuple.pgstattuple(tblid) AS pst
25
+ FROM (
26
+ SELECT
27
+ ( 4 + tpl_hdr_size + tpl_data_size + (2*ma)
28
+ - CASE WHEN tpl_hdr_size%ma = 0 THEN ma ELSE tpl_hdr_size%ma END
29
+ - CASE WHEN ceil(tpl_data_size)::int%ma = 0 THEN ma ELSE ceil(tpl_data_size)::int%ma END
30
+ ) AS tpl_size, bs - page_hdr AS size_per_block, (heappages + toastpages) AS tblpages, heappages,
31
+ toastpages, reltuples, toasttuples, bs, page_hdr, tblid, schemaname, tblname, fillfactor, is_na
32
+ FROM (
33
+ SELECT
34
+ tbl.oid AS tblid, ns.nspname AS schemaname, tbl.relname AS tblname, tbl.reltuples,
35
+ tbl.relpages AS heappages, coalesce(toast.relpages, 0) AS toastpages,
36
+ coalesce(toast.reltuples, 0) AS toasttuples,
37
+ coalesce(substring(
38
+ array_to_string(tbl.reloptions, ' ')
39
+ FROM 'fillfactor=([0-9]+)')::smallint, 100) AS fillfactor,
40
+ current_setting('block_size')::numeric AS bs,
41
+ CASE WHEN version()~'mingw32' OR version()~'64-bit|x86_64|ppc64|ia64|amd64' THEN 8 ELSE 4 END AS ma,
42
+ 24 AS page_hdr,
43
+ 23 + CASE WHEN MAX(coalesce(null_frac,0)) > 0 THEN ( 7 + count(*) ) / 8 ELSE 0::int END
44
+ + CASE WHEN tbl.relhasoids THEN 4 ELSE 0 END AS tpl_hdr_size,
45
+ sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024) ) AS tpl_data_size,
46
+ bool_or(att.atttypid = 'pg_catalog.name'::regtype)
47
+ OR count(att.attname) <> count(s.attname) AS is_na
48
+ FROM pg_attribute AS att
49
+ JOIN pg_class AS tbl ON att.attrelid = tbl.oid
50
+ JOIN pg_namespace AS ns ON ns.oid = tbl.relnamespace
51
+ LEFT JOIN pg_stats AS s ON s.schemaname=ns.nspname
52
+ AND s.tablename = tbl.relname AND s.inherited=false AND s.attname=att.attname
53
+ LEFT JOIN pg_class AS toast ON tbl.reltoastrelid = toast.oid
54
+ WHERE att.attnum > 0 AND NOT att.attisdropped
55
+ AND tbl.relkind = 'r'
56
+ GROUP BY 1,2,3,4,5,6,7,8,9,10, tbl.relhasoids
57
+ ORDER BY 2,3
58
+ ) AS s
59
+ ) AS s2
60
+ ) AS s3
61
+ WHERE NOT is_na
62
+ -- AND tblpages*((pst).free_percent + (pst).dead_tuple_percent)::float4/100 >= 1
63
+ AND schemaname= 'public';
@@ -0,0 +1,527 @@
1
+ module GitLab
2
+ module Monitor
3
+ module Database
4
+ # A helper class to collect CI builds metrics.
5
+ class CiBuildsCollector < Base # rubocop:disable Metrics/ClassLength
6
+ SET_RANDOM_PAGE_COST = "SET LOCAL random_page_cost TO 1".freeze
7
+
8
+ BUILDS_QUERY_EE =
9
+ <<~SQL.freeze
10
+ SELECT
11
+ projects.namespace_id,
12
+ ci_builds.status,
13
+ projects.shared_runners_enabled,
14
+ (COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) = 0 OR
15
+ COALESCE(namespace_statistics.shared_runners_seconds, 0) < COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) * 60) as has_minutes,
16
+ COUNT(*) AS count
17
+ FROM ci_builds
18
+ JOIN projects
19
+ ON projects.id = ci_builds.project_id
20
+ JOIN namespaces
21
+ ON namespaces.id = projects.namespace_id
22
+ LEFT JOIN namespace_statistics
23
+ ON namespace_statistics.namespace_id = namespaces.id
24
+ JOIN application_settings
25
+ ON application_settings.id = 1
26
+ WHERE ci_builds.type = 'Ci::Build'
27
+ AND ci_builds.status = '%s'
28
+ AND projects.pending_delete = 'f'
29
+ GROUP BY
30
+ projects.namespace_id,
31
+ ci_builds.status,
32
+ projects.shared_runners_enabled,
33
+ namespaces.shared_runners_minutes_limit,
34
+ namespace_statistics.shared_runners_seconds,
35
+ application_settings.shared_runners_minutes
36
+ SQL
37
+
38
+ BUILDS_QUERY_CE =
39
+ <<~SQL.freeze
40
+ SELECT
41
+ projects.namespace_id,
42
+ ci_builds.status,
43
+ projects.shared_runners_enabled,
44
+ COUNT(*) AS count
45
+ FROM ci_builds
46
+ JOIN projects
47
+ ON projects.id = ci_builds.project_id
48
+ WHERE ci_builds.type = 'Ci::Build'
49
+ AND ci_builds.status = '%s'
50
+ AND projects.pending_delete = 'f'
51
+ GROUP BY
52
+ projects.namespace_id,
53
+ ci_builds.status,
54
+ projects.shared_runners_enabled
55
+ SQL
56
+
57
+ STALE_BUILDS_QUERY =
58
+ <<~SQL.freeze
59
+ SELECT
60
+ COUNT(*) AS count
61
+ FROM ci_builds
62
+ JOIN projects
63
+ ON projects.id = ci_builds.project_id
64
+ WHERE ci_builds.type = 'Ci::Build'
65
+ AND ci_builds.status = 'running'
66
+ AND ci_builds.updated_at < NOW() - INTERVAL '1 hour'
67
+ AND projects.pending_delete = 'f'
68
+ SQL
69
+
70
+ PER_RUNNER_QUERY_EE =
71
+ <<~SQL.freeze
72
+ SELECT
73
+ ci_builds.runner_id,
74
+ ci_runners.is_shared,
75
+ projects.namespace_id,
76
+ projects.mirror,
77
+ projects.mirror_trigger_builds,
78
+ ci_pipelines.pipeline_schedule_id,
79
+ ci_builds.trigger_request_id,
80
+ (COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) = 0 OR
81
+ COALESCE(namespace_statistics.shared_runners_seconds, 0) < COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) * 60) as has_minutes,
82
+ COUNT(*) AS count
83
+ FROM ci_builds
84
+ JOIN ci_runners
85
+ ON ci_runners.id = ci_builds.runner_id
86
+ JOIN projects
87
+ ON projects.id = ci_builds.project_id
88
+ JOIN ci_pipelines
89
+ ON ci_pipelines.id = ci_builds.commit_id
90
+ JOIN namespaces
91
+ ON namespaces.id = projects.namespace_id
92
+ LEFT JOIN namespace_statistics
93
+ ON namespace_statistics.namespace_id = namespaces.id
94
+ JOIN application_settings
95
+ ON application_settings.id = 1
96
+ WHERE ci_builds.type = 'Ci::Build'
97
+ AND ci_builds.status = 'running'
98
+ AND projects.pending_delete = 'f'
99
+ GROUP BY
100
+ ci_builds.runner_id,
101
+ ci_runners.is_shared,
102
+ projects.namespace_id,
103
+ projects.mirror,
104
+ projects.mirror_trigger_builds,
105
+ ci_pipelines.pipeline_schedule_id,
106
+ ci_builds.trigger_request_id,
107
+ namespaces.shared_runners_minutes_limit,
108
+ namespace_statistics.shared_runners_seconds,
109
+ application_settings.shared_runners_minutes
110
+ SQL
111
+
112
+ PER_RUNNER_QUERY_CE =
113
+ <<~SQL.freeze
114
+ SELECT
115
+ ci_builds.runner_id,
116
+ ci_runners.is_shared,
117
+ projects.namespace_id,
118
+ ci_pipelines.pipeline_schedule_id,
119
+ ci_builds.trigger_request_id,
120
+ COUNT(*) AS count
121
+ FROM ci_builds
122
+ JOIN ci_runners
123
+ ON ci_runners.id = ci_builds.runner_id
124
+ JOIN projects
125
+ ON projects.id = ci_builds.project_id
126
+ JOIN ci_pipelines
127
+ ON ci_pipelines.id = ci_builds.commit_id
128
+ WHERE ci_builds.type = 'Ci::Build'
129
+ AND ci_builds.status = 'running'
130
+ AND projects.pending_delete = 'f'
131
+ GROUP BY
132
+ ci_builds.runner_id,
133
+ ci_runners.is_shared,
134
+ projects.namespace_id,
135
+ ci_pipelines.pipeline_schedule_id,
136
+ ci_builds.trigger_request_id
137
+ SQL
138
+
139
+ MIRROR_COLUMN_QUERY =
140
+ <<~SQL.freeze
141
+ SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='projects' AND column_name='mirror')
142
+ SQL
143
+
144
+ REPEATED_COMMANDS_QUERY_EE =
145
+ <<~SQL.freeze
146
+ SELECT
147
+ subquery.namespace_id,
148
+ subquery.shared_runners_enabled,
149
+ subquery.project_id,
150
+ subquery.status,
151
+ subquery.has_minutes,
152
+ MAX(subquery.count) as count
153
+ FROM (
154
+ SELECT
155
+ projects.namespace_id,
156
+ projects.shared_runners_enabled,
157
+ ci_builds.project_id,
158
+ ci_builds.commit_id,
159
+ ci_builds.status,
160
+ (COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) = 0 OR
161
+ COALESCE(namespace_statistics.shared_runners_seconds, 0) < COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) * 60) as has_minutes,
162
+ COUNT(*) AS count
163
+ FROM ci_builds
164
+ JOIN projects
165
+ ON projects.id = ci_builds.project_id
166
+ JOIN namespaces
167
+ ON namespaces.id = projects.namespace_id
168
+ LEFT JOIN namespace_statistics
169
+ ON namespace_statistics.namespace_id = namespaces.id
170
+ JOIN application_settings
171
+ ON application_settings.id = 1
172
+ WHERE ci_builds.type = 'Ci::Build'
173
+ AND ci_builds.status IN ('running', 'pending')
174
+ GROUP BY
175
+ projects.namespace_id,
176
+ projects.shared_runners_enabled,
177
+ ci_builds.project_id,
178
+ ci_builds.commit_id,
179
+ ci_builds.status,
180
+ ci_builds.commands,
181
+ namespaces.shared_runners_minutes_limit,
182
+ namespace_statistics.shared_runners_seconds,
183
+ application_settings.shared_runners_minutes
184
+ HAVING COUNT(*) > %d
185
+ ) AS subquery
186
+ GROUP BY
187
+ subquery.namespace_id,
188
+ subquery.shared_runners_enabled,
189
+ subquery.project_id,
190
+ subquery.commit_id,
191
+ subquery.status,
192
+ subquery.has_minutes
193
+ SQL
194
+
195
+ REPEATED_COMMANDS_QUERY_CE =
196
+ <<~SQL.freeze
197
+ SELECT
198
+ subquery.namespace_id,
199
+ subquery.shared_runners_enabled,
200
+ subquery.project_id,
201
+ subquery.status,
202
+ MAX(subquery.count) as count
203
+ FROM (
204
+ SELECT
205
+ projects.namespace_id,
206
+ projects.shared_runners_enabled,
207
+ ci_builds.project_id,
208
+ ci_builds.commit_id,
209
+ ci_builds.status,
210
+ COUNT(*) AS count
211
+ FROM ci_builds
212
+ JOIN projects
213
+ ON projects.id = ci_builds.project_id
214
+ JOIN namespaces
215
+ ON namespaces.id = projects.namespace_id
216
+ WHERE ci_builds.type = 'Ci::Build'
217
+ AND ci_builds.status IN ('running', 'pending')
218
+ GROUP BY
219
+ projects.namespace_id,
220
+ projects.shared_runners_enabled,
221
+ ci_builds.project_id,
222
+ ci_builds.commit_id,
223
+ ci_builds.status,
224
+ ci_builds.commands
225
+ HAVING COUNT(*) > %d
226
+ ) AS subquery
227
+ GROUP BY
228
+ subquery.namespace_id,
229
+ subquery.shared_runners_enabled,
230
+ subquery.project_id,
231
+ subquery.commit_id,
232
+ subquery.status
233
+ SQL
234
+
235
+ UNARCHIVED_TRACES_QUERY =
236
+ <<~SQL.freeze
237
+ SELECT
238
+ COUNT(*) as count
239
+ FROM ci_builds
240
+ JOIN ci_build_trace_chunks
241
+ ON ci_build_trace_chunks.build_id = ci_builds.id
242
+ LEFT JOIN ci_job_artifacts
243
+ ON ci_job_artifacts.job_id = ci_builds.id
244
+ AND ci_job_artifacts.file_type = 3
245
+ WHERE ci_builds.type = 'Ci::Build'
246
+ AND ci_builds.status IN ('success', 'failed', 'canceled')
247
+ AND ci_builds.finished_at < '%s'
248
+ AND ci_job_artifacts.job_id IS NULL
249
+ SQL
250
+
251
+ STATUS_CREATED = "created".freeze
252
+ STATUS_PENDING = "pending".freeze
253
+
254
+ DEFAULT_UNARCHIVED_TRACES_OFFSET_MINUTES = 1440
255
+
256
+ def initialize(opts)
257
+ super(opts)
258
+
259
+ @allowed_repeated_commands_count = opts[:allowed_repeated_commands_count]
260
+ @created_builds_counting_disabled = opts[:created_builds_counting_disabled]
261
+ @unarchived_traces_offset_minutes = opts[:unarchived_traces_offset_minutes]
262
+ end
263
+
264
+ def run
265
+ results = {}
266
+ results[:created_builds] = builds(STATUS_CREATED) unless @created_builds_counting_disabled
267
+ results[:pending_builds] = builds(STATUS_PENDING)
268
+ results[:stale_builds] = stale_builds
269
+ results[:per_runner] = per_runner_builds
270
+ results[:repeated_commands] = repeated_commands
271
+ results[:unarchived_traces] = unarchived_traces
272
+ results
273
+ end
274
+
275
+ private
276
+
277
+ def builds(status)
278
+ results = []
279
+
280
+ query = mirror_column? ? BUILDS_QUERY_EE : BUILDS_QUERY_CE
281
+ query = query % [status] # rubocop:disable Style/FormatString
282
+ exec_query_with_custom_random_page_cost(query).each do |row|
283
+ results << transform_builds_row_to_values(row)
284
+ end
285
+
286
+ results
287
+ rescue PG::UndefinedTable, PG::UndefinedColumn
288
+ results
289
+ end
290
+
291
+ def transform_builds_row_to_values(row)
292
+ values = { namespace: row["namespace_id"].to_s,
293
+ shared_runners: row["shared_runners_enabled"] == "t" ? "yes" : "no",
294
+ value: row["count"].to_i }
295
+ include_ee_fields(values, row)
296
+ end
297
+
298
+ def stale_builds
299
+ with_connection_pool do |conn|
300
+ conn.exec(STALE_BUILDS_QUERY)[0]["count"].to_i
301
+ end
302
+ rescue PG::UndefinedTable, PG::UndefinedColumn
303
+ 0
304
+ end
305
+
306
+ def per_runner_builds
307
+ results = []
308
+
309
+ query = mirror_column? ? PER_RUNNER_QUERY_EE : PER_RUNNER_QUERY_CE
310
+ exec_query_with_custom_random_page_cost(query).each do |row|
311
+ results << transform_per_runners_builds_row_to_values(row)
312
+ end
313
+
314
+ results
315
+ rescue PG::UndefinedTable, PG::UndefinedColumn
316
+ []
317
+ end
318
+
319
+ def transform_per_runners_builds_row_to_values(row)
320
+ values = { runner: row["runner_id"].to_s,
321
+ shared_runner: row["is_shared"] == "t" ? "yes" : "no",
322
+ namespace: row["namespace_id"].to_s,
323
+ scheduled: row["pipeline_schedule_id"] ? "yes" : "no",
324
+ triggered: row["trigger_request_id"] ? "yes" : "no",
325
+ value: row["count"].to_i }
326
+ include_ee_fields(values, row)
327
+ end
328
+
329
+ def repeated_commands
330
+ results = []
331
+
332
+ query = mirror_column? ? REPEATED_COMMANDS_QUERY_EE : REPEATED_COMMANDS_QUERY_CE
333
+ query = query % [allowed_repeated_commands_count] # rubocop:disable Style/FormatString
334
+ exec_query_with_custom_random_page_cost(query).each do |row|
335
+ results << transform_repeated_commands_row_to_values(row)
336
+ end
337
+
338
+ results
339
+ rescue PG::UndefinedTable, PG::UndefinedColumn
340
+ []
341
+ end
342
+
343
+ def allowed_repeated_commands_count
344
+ @allowed_repeated_commands_count ||= 2
345
+ end
346
+
347
+ def transform_repeated_commands_row_to_values(row)
348
+ values = { namespace: row["namespace_id"].to_s,
349
+ project: row["project_id"].to_s,
350
+ shared_runners: row["shared_runners_enabled"] == "t" ? "yes" : "no",
351
+ status: row["status"].to_s,
352
+ value: row["count"].to_i }
353
+
354
+ include_has_minutes_field(values, row)
355
+ end
356
+
357
+ def unarchived_traces
358
+ time = Time.now - (unarchived_traces_offset_minutes * 60)
359
+ query = UNARCHIVED_TRACES_QUERY % [time.strftime("%F %T")] # rubocop:disable Style/FormatString
360
+
361
+ with_connection_pool do |conn|
362
+ conn.exec(query)[0]["count"].to_i
363
+ end
364
+ rescue PG::UndefinedTable, PG::UndefinedColumn
365
+ 0
366
+ end
367
+
368
+ def unarchived_traces_offset_minutes
369
+ @unarchived_traces_offset_minutes ||= DEFAULT_UNARCHIVED_TRACES_OFFSET_MINUTES
370
+ end
371
+
372
+ def include_ee_fields(values, row)
373
+ values.merge!(include_bool_if_row_defined(row, :mirror))
374
+ values.merge!(include_bool_if_row_defined(row, :mirror_trigger_builds))
375
+ include_has_minutes_field(values, row)
376
+ end
377
+
378
+ def include_has_minutes_field(values, row)
379
+ values.merge!(include_bool_if_row_defined(row, :has_minutes))
380
+ values
381
+ end
382
+
383
+ def include_bool_if_row_defined(row, field)
384
+ return {} unless row[field.to_s]
385
+ { field => row[field.to_s] == "t" ? "yes" : "no" }
386
+ end
387
+
388
+ def exec_query_with_custom_random_page_cost(query)
389
+ with_connection_pool do |conn|
390
+ conn.transaction do |trans|
391
+ trans.exec(SET_RANDOM_PAGE_COST)
392
+ trans.exec(query)
393
+ end
394
+ end
395
+ end
396
+
397
+ def mirror_column?
398
+ @mirror_column ||=
399
+ begin
400
+ with_connection_pool do |conn|
401
+ conn.exec(MIRROR_COLUMN_QUERY)[0]["exists"] == "t"
402
+ end
403
+ rescue PG::UndefinedColumn
404
+ false
405
+ end
406
+ end
407
+ end
408
+
409
+ # The prober which is called when gathering metrics
410
+ class CiBuildsProber
411
+ def initialize(opts, metrics: PrometheusMetrics.new)
412
+ @metrics = metrics
413
+
414
+ collector_opts = { connection_string: opts[:connection_string],
415
+ allowed_repeated_commands_count: opts[:allowed_repeated_commands_count],
416
+ created_builds_counting_disabled: opts[:created_builds_counting_disabled],
417
+ unarchived_traces_offset_minutes: opts[:unarchived_traces_offset_minutes] }
418
+ @collector = CiBuildsCollector.new(collector_opts)
419
+ end
420
+
421
+ def probe_db
422
+ @results = @collector.run
423
+
424
+ ci_builds_metrics(@results[:created_builds], "ci_created_builds") if @results[:created_builds]
425
+ ci_builds_metrics(@results[:pending_builds], "ci_pending_builds")
426
+ ci_stale_builds_metrics
427
+ metrics_per_runner
428
+ repeated_commands_metrics
429
+ unarchived_traces_metrics
430
+
431
+ self
432
+ rescue PG::ConnectionBad
433
+ self
434
+ end
435
+
436
+ def write_to(target)
437
+ target.write(@metrics.to_s)
438
+ end
439
+
440
+ private
441
+
442
+ def ci_builds_metrics(results_list, metric_name)
443
+ other_values = {}
444
+
445
+ results_list.each do |metric|
446
+ # If we have a low value, put the value into an "other" bucket.
447
+ if metric[:value] < 10
448
+ key = { shared_runners: metric[:shared_runners] }
449
+ key[:has_minutes] = metric[:has_minutes] if metric[:has_minutes]
450
+
451
+ other_values[key] ||= 0
452
+ other_values[key] += metric[:value]
453
+ else
454
+ add_ci_created_pending_builds(metric_name, metric[:value], metric)
455
+ end
456
+ end
457
+
458
+ # Add metrics for the "other" bucket.
459
+ other_values.each { |key, value| add_ci_created_pending_builds(metric_name, value, key) }
460
+ end
461
+
462
+ def add_ci_created_pending_builds(metric_name, value, labels)
463
+ add_metric_with_namespace_label(metric_name,
464
+ [:namespace, :shared_runners, :has_minutes],
465
+ value,
466
+ labels)
467
+ end
468
+
469
+ def ci_stale_builds_metrics
470
+ @metrics.add("ci_stale_builds", @results[:stale_builds])
471
+ end
472
+
473
+ def metrics_per_runner
474
+ other_values = {}
475
+
476
+ @results[:per_runner].each do |metric|
477
+ # If we have a low value, put the value into an "other" bucket.
478
+ if metric[:value] < 10
479
+ key = { runner: metric[:runner], shared_runner: metric[:shared_runner],
480
+ scheduled: metric[:scheduled], triggered: metric[:triggered] }
481
+ key[:mirror] = metric[:mirror] if metric[:mirror]
482
+ key[:mirror_trigger_builds] = metric[:mirror_trigger_builds] if metric[:mirror_trigger_builds]
483
+ key[:has_minutes] = metric[:has_minutes] if metric[:has_minutes]
484
+
485
+ other_values[key] ||= 0
486
+ other_values[key] += metric[:value]
487
+ else
488
+ add_ci_running_builds(metric[:value], metric)
489
+ end
490
+ end
491
+
492
+ # Add metrics for the "other" bucket.
493
+ other_values.each { |key, value| add_ci_running_builds(value, key) }
494
+ end
495
+
496
+ def add_ci_running_builds(value, labels)
497
+ add_metric_with_namespace_label(
498
+ "ci_running_builds",
499
+ [:runner, :namespace, :shared_runner, :scheduled,
500
+ :triggered, :mirror, :mirror_trigger_builds, :has_minutes],
501
+ value,
502
+ labels
503
+ )
504
+ end
505
+
506
+ def add_metric_with_namespace_label(metric_name, allowed_labels, value, labels)
507
+ labels[:namespace] = "" unless labels[:namespace]
508
+
509
+ selected_labels = labels.select { |k, _| allowed_labels.include?(k) }.sort.to_h
510
+ @metrics.add(metric_name, value, selected_labels)
511
+ end
512
+
513
+ def repeated_commands_metrics
514
+ @results[:repeated_commands].each do |metric|
515
+ value = metric.delete(:value)
516
+
517
+ @metrics.add("ci_repeated_commands_builds", value, metric)
518
+ end
519
+ end
520
+
521
+ def unarchived_traces_metrics
522
+ @metrics.add("ci_unarchived_traces", @results[:unarchived_traces])
523
+ end
524
+ end
525
+ end
526
+ end
527
+ end