gitlab-monitor 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +2 -0
- data/.gitlab-ci.yml +18 -0
- data/.rubocop.yml +34 -0
- data/CONTRIBUTING.md +651 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +75 -0
- data/LICENSE +25 -0
- data/README.md +110 -0
- data/bin/gitlab-mon +17 -0
- data/config/gitlab-monitor.yml.example +112 -0
- data/gitlab-monitor.gemspec +33 -0
- data/lib/gitlab_monitor.rb +18 -0
- data/lib/gitlab_monitor/cli.rb +341 -0
- data/lib/gitlab_monitor/database.rb +13 -0
- data/lib/gitlab_monitor/database/base.rb +44 -0
- data/lib/gitlab_monitor/database/bloat.rb +74 -0
- data/lib/gitlab_monitor/database/bloat_btree.sql +84 -0
- data/lib/gitlab_monitor/database/bloat_table.sql +63 -0
- data/lib/gitlab_monitor/database/ci_builds.rb +527 -0
- data/lib/gitlab_monitor/database/remote_mirrors.rb +74 -0
- data/lib/gitlab_monitor/database/row_count.rb +164 -0
- data/lib/gitlab_monitor/database/tuple_stats.rb +53 -0
- data/lib/gitlab_monitor/git.rb +144 -0
- data/lib/gitlab_monitor/memstats.rb +98 -0
- data/lib/gitlab_monitor/memstats/mapping.rb +91 -0
- data/lib/gitlab_monitor/prober.rb +40 -0
- data/lib/gitlab_monitor/process.rb +122 -0
- data/lib/gitlab_monitor/prometheus.rb +64 -0
- data/lib/gitlab_monitor/sidekiq.rb +149 -0
- data/lib/gitlab_monitor/sidekiq_queue_job_stats.lua +42 -0
- data/lib/gitlab_monitor/util.rb +83 -0
- data/lib/gitlab_monitor/version.rb +5 -0
- data/lib/gitlab_monitor/web_exporter.rb +77 -0
- data/spec/cli_spec.rb +31 -0
- data/spec/database/bloat_spec.rb +99 -0
- data/spec/database/ci_builds_spec.rb +421 -0
- data/spec/database/row_count_spec.rb +37 -0
- data/spec/fixtures/smaps/sample.txt +10108 -0
- data/spec/git_process_proper_spec.rb +27 -0
- data/spec/git_spec.rb +52 -0
- data/spec/memstats_spec.rb +28 -0
- data/spec/prometheus_metrics_spec.rb +17 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/util_spec.rb +15 -0
- metadata +225 -0
data/lib/gitlab_monitor/database.rb
@@ -0,0 +1,13 @@
+module GitLab
+  module Monitor
+    # Database-related classes
+    module Database
+      autoload :Base, "gitlab_monitor/database/base"
+      autoload :CiBuildsProber, "gitlab_monitor/database/ci_builds"
+      autoload :TuplesProber, "gitlab_monitor/database/tuple_stats"
+      autoload :RowCountProber, "gitlab_monitor/database/row_count"
+      autoload :BloatProber, "gitlab_monitor/database/bloat"
+      autoload :RemoteMirrorsProber, "gitlab_monitor/database/remote_mirrors"
+    end
+  end
+end
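The database module wires each prober up with `autoload`, so requiring `gitlab_monitor/database` stays cheap and a prober's file is only read the first time its constant is referenced. A minimal sketch of that behaviour, using only the constants and paths declared in the hunk above:

```ruby
# Sketch: autoload defers the require until the constant is used.
require "gitlab_monitor/database"

# None of the prober files have been loaded yet.
prober_class = GitLab::Monitor::Database::BloatProber
# Referencing the constant just triggered require "gitlab_monitor/database/bloat".
```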
data/lib/gitlab_monitor/database/base.rb
@@ -0,0 +1,44 @@
+require "pg"
+require "connection_pool"
+
+module GitLab
+  module Monitor
+    module Database
+      # An abstract class for interacting with DB
+      #
+      # It takes a connection string (e.g. "dbname=test port=5432")
+      class Base
+        def self.connection_pool
+          @connection_pool ||= Hash.new do |h, connection_string|
+            h[connection_string] = ConnectionPool.new(size: 3, timeout: 5) do
+              PG.connect(connection_string)
+            end
+          end
+        end
+
+        def initialize(args)
+          @connection_string = args[:connection_string]
+        end
+
+        def run
+          fail NotImplemented
+        end
+
+        def connection_pool
+          self.class.connection_pool[@connection_string]
+        end
+
+        def with_connection_pool
+          connection_pool.with do |conn|
+            begin
+              yield conn
+            rescue PG::UnableToSend => e
+              conn.reset
+              raise e
+            end
+          end
+        end
+      end
+    end
+  end
+end
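`Base` gives every collector a process-wide, per-connection-string pool of three `PG` connections, plus a `with_connection_pool` helper that resets a connection before re-raising on `PG::UnableToSend`. A hedged sketch of how a subclass would use it; the class name and query below are illustrative, not part of the gem:

```ruby
# Hypothetical collector, for illustration only.
class UptimeCollector < GitLab::Monitor::Database::Base
  def run
    with_connection_pool do |conn|
      # conn is a PG::Connection checked out from the shared pool
      conn.exec("SELECT extract(epoch FROM now() - pg_postmaster_start_time()) AS uptime")[0]["uptime"].to_f
    end
  end
end

UptimeCollector.new(connection_string: "dbname=gitlabhq_production").run
```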
data/lib/gitlab_monitor/database/bloat.rb
@@ -0,0 +1,74 @@
+module GitLab
+  module Monitor
+    module Database
+      # Helper to collect bloat metrics.
+      class BloatCollector < Base
+        def run(type = :btree)
+          execute(self.class.query_for(type)).each_with_object({}) do |row, h|
+            h[row["object_name"]] = row
+          end
+        end
+
+        private
+
+        def execute(query)
+          with_connection_pool do |conn|
+            conn.exec(query)
+          end
+        end
+
+        class << self
+          def query_for(type)
+            @queries ||= {}
+
+            return @queries[type] if @queries[type]
+
+            file = File.join(__dir__, "bloat_#{type}.sql")
+            fail "Unknown bloat query file: #{file}" unless File.exist?(file)
+
+            @queries[type] = File.read(file)
+          end
+        end
+      end
+
+      # Prober class to gather bloat metrics
+      class BloatProber
+        METRIC_KEYS = %w(bloat_ratio bloat_size extra_size real_size).freeze
+
+        attr_reader :metrics, :collector, :bloat_types
+
+        def initialize(opts,
+                       metrics: PrometheusMetrics.new,
+                       collector: BloatCollector.new(connection_string: opts[:connection_string]))
+          @metrics = metrics
+          @collector = collector
+          @bloat_types = opts[:bloat_types] || %i(btree table)
+        end
+
+        def probe_db
+          bloat_types.each do |type|
+            probe_for_type(type)
+          end
+        end
+
+        def write_to(target)
+          target.write(metrics.to_s)
+        end
+
+        private
+
+        def probe_for_type(type)
+          collector.run(type).each do |query_name, data|
+            METRIC_KEYS.each do |key|
+              metrics.add("gitlab_database_bloat_#{type}_#{key}", data[key], query_name: query_name)
+            end
+          end
+
+          self
+        rescue PG::ConnectionBad
+          self
+        end
+      end
+    end
+  end
+end
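A usage sketch for the bloat prober, assuming the `PrometheusMetrics` helper defined elsewhere in the gem (data/lib/gitlab_monitor/prometheus.rb) and a local database; the connection string and output target are assumptions:

```ruby
# Illustrative only.
require "gitlab_monitor"

prober = GitLab::Monitor::Database::BloatProber.new(
  { connection_string: "dbname=gitlabhq_production",
    bloat_types: %i(btree table) }
)

prober.probe_db          # one gauge per METRIC_KEYS entry, labelled by query_name
prober.write_to($stdout) # Prometheus text exposition via PrometheusMetrics#to_s
```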
data/lib/gitlab_monitor/database/bloat_btree.sql
@@ -0,0 +1,84 @@
+-- Originally from: https://github.com/ioguix/pgsql-bloat-estimation/blob/master/btree/btree_bloat.sql
+-- WARNING: executed with a non-superuser role, the query inspect only index on tables you are granted to read.
+-- WARNING: rows with is_na = 't' are known to have bad statistics ("name" type is not supported).
+-- This query is compatible with PostgreSQL 8.2 and after
+SELECT current_database(), nspname AS schemaname, tblname, idxname AS object_name, bs*(relpages)::bigint AS real_size,
+  bs*(relpages-est_pages)::bigint AS extra_size,
+  100 * (relpages-est_pages)::float / relpages AS extra_ratio,
+  fillfactor,
+  CASE WHEN relpages > est_pages_ff
+    THEN bs*(relpages-est_pages_ff)
+    ELSE 0
+  END AS bloat_size,
+  100 * (relpages-est_pages_ff)::float / relpages AS bloat_ratio,
+  is_na
+  -- , 100-(sub.pst).avg_leaf_density, est_pages, index_tuple_hdr_bm, maxalign, pagehdr, nulldatawidth, nulldatahdrwidth, sub.reltuples, sub.relpages -- (DEBUG INFO)
+FROM (
+  SELECT coalesce(1 +
+      ceil(reltuples/floor((bs-pageopqdata-pagehdr)/(4+nulldatahdrwidth)::float)), 0 -- ItemIdData size + computed avg size of a tuple (nulldatahdrwidth)
+    ) AS est_pages,
+    coalesce(1 +
+      ceil(reltuples/floor((bs-pageopqdata-pagehdr)*fillfactor/(100*(4+nulldatahdrwidth)::float))), 0
+    ) AS est_pages_ff,
+    bs, nspname, table_oid, tblname, idxname, relpages, fillfactor, is_na
+    -- , stattuple.pgstatindex(quote_ident(nspname)||'.'||quote_ident(idxname)) AS pst, index_tuple_hdr_bm, maxalign, pagehdr, nulldatawidth, nulldatahdrwidth, reltuples -- (DEBUG INFO)
+  FROM (
+    SELECT maxalign, bs, nspname, tblname, idxname, reltuples, relpages, relam, table_oid, fillfactor,
+      ( index_tuple_hdr_bm +
+          maxalign - CASE -- Add padding to the index tuple header to align on MAXALIGN
+            WHEN index_tuple_hdr_bm%maxalign = 0 THEN maxalign
+            ELSE index_tuple_hdr_bm%maxalign
+          END
+        + nulldatawidth + maxalign - CASE -- Add padding to the data to align on MAXALIGN
+            WHEN nulldatawidth = 0 THEN 0
+            WHEN nulldatawidth::integer%maxalign = 0 THEN maxalign
+            ELSE nulldatawidth::integer%maxalign
+          END
+      )::numeric AS nulldatahdrwidth, pagehdr, pageopqdata, is_na
+      -- , index_tuple_hdr_bm, nulldatawidth -- (DEBUG INFO)
+    FROM (
+      SELECT
+        i.nspname, i.tblname, i.idxname, i.reltuples, i.relpages, i.relam, a.attrelid AS table_oid,
+        current_setting('block_size')::numeric AS bs, fillfactor,
+        CASE -- MAXALIGN: 4 on 32bits, 8 on 64bits (and mingw32 ?)
+          WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' THEN 8
+          ELSE 4
+        END AS maxalign,
+        /* per page header, fixed size: 20 for 7.X, 24 for others */
+        24 AS pagehdr,
+        /* per page btree opaque data */
+        16 AS pageopqdata,
+        /* per tuple header: add IndexAttributeBitMapData if some cols are null-able */
+        CASE WHEN max(coalesce(s.null_frac,0)) = 0
+          THEN 2 -- IndexTupleData size
+          ELSE 2 + (( 32 + 8 - 1 ) / 8) -- IndexTupleData size + IndexAttributeBitMapData size ( max num filed per index + 8 - 1 /8)
+        END AS index_tuple_hdr_bm,
+        /* data len: we remove null values save space using it fractionnal part from stats */
+        sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024)) AS nulldatawidth,
+        max( CASE WHEN a.atttypid = 'pg_catalog.name'::regtype THEN 1 ELSE 0 END ) > 0 AS is_na
+      FROM pg_attribute AS a
+        JOIN (
+          SELECT nspname, tbl.relname AS tblname, idx.relname AS idxname, idx.reltuples, idx.relpages, idx.relam,
+            indrelid, indexrelid, indkey::smallint[] AS attnum,
+            coalesce(substring(
+              array_to_string(idx.reloptions, ' ')
+              from 'fillfactor=([0-9]+)')::smallint, 90) AS fillfactor
+          FROM pg_index
+            JOIN pg_class idx ON idx.oid=pg_index.indexrelid
+            JOIN pg_class tbl ON tbl.oid=pg_index.indrelid
+            JOIN pg_namespace ON pg_namespace.oid = idx.relnamespace
+          WHERE pg_index.indisvalid AND tbl.relkind = 'r' AND idx.relpages > 0
+        ) AS i ON a.attrelid = i.indexrelid
+        JOIN pg_stats AS s ON s.schemaname = i.nspname
+          AND ((s.tablename = i.tblname AND s.attname = pg_catalog.pg_get_indexdef(a.attrelid, a.attnum, TRUE)) -- stats from tbl
+          OR (s.tablename = i.idxname AND s.attname = a.attname))-- stats from functionnal cols
+        JOIN pg_type AS t ON a.atttypid = t.oid
+      WHERE a.attnum > 0
+      GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9
+    ) AS s1
+  ) AS s2
+  JOIN pg_am am ON s2.relam = am.oid WHERE am.amname = 'btree'
+) AS sub
+WHERE NOT is_na
+  AND nspname = 'public'
+ORDER BY 2,3,4;
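`BloatCollector.query_for` resolves this file by naming convention (`bloat_#{type}.sql` next to bloat.rb), so the btree estimate can also be fetched or executed on its own. A small hedged sketch using only the API shown in the bloat.rb hunk; the database name is an assumption:

```ruby
# Illustrative: load and run the bundled btree bloat query by symbol name.
sql = GitLab::Monitor::Database::BloatCollector.query_for(:btree)

rows = GitLab::Monitor::Database::BloatCollector
  .new(connection_string: "dbname=gitlabhq_production")
  .run(:btree)

rows.keys # index names, taken from the object_name column selected above
```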
data/lib/gitlab_monitor/database/bloat_table.sql
@@ -0,0 +1,63 @@
+-- Originally from: https://github.com/ioguix/pgsql-bloat-estimation/blob/master/table/table_bloat.sql
+/* WARNING: executed with a non-superuser role, the query inspect only tables you are granted to read.
+ * This query is compatible with PostgreSQL 9.0 and more
+ */
+SELECT current_database(), schemaname, tblname AS object_name, bs*tblpages AS real_size,
+  (tblpages-est_tblpages)*bs AS extra_size,
+  CASE WHEN tblpages - est_tblpages > 0
+    THEN 100 * (tblpages - est_tblpages)/tblpages::float
+    ELSE 0
+  END AS extra_ratio, fillfactor,
+  CASE WHEN tblpages - est_tblpages_ff > 0
+    THEN (tblpages-est_tblpages_ff)*bs
+    ELSE 0
+  END AS bloat_size,
+  CASE WHEN tblpages - est_tblpages_ff > 0
+    THEN 100 * (tblpages - est_tblpages_ff)/tblpages::float
+    ELSE 0
+  END AS bloat_ratio, is_na
+  -- , (pst).free_percent + (pst).dead_tuple_percent AS real_frag
+FROM (
+  SELECT ceil( reltuples / ( (bs-page_hdr)/tpl_size ) ) + ceil( toasttuples / 4 ) AS est_tblpages,
+    ceil( reltuples / ( (bs-page_hdr)*fillfactor/(tpl_size*100) ) ) + ceil( toasttuples / 4 ) AS est_tblpages_ff,
+    tblpages, fillfactor, bs, tblid, schemaname, tblname, heappages, toastpages, is_na
+    -- , stattuple.pgstattuple(tblid) AS pst
+  FROM (
+    SELECT
+      ( 4 + tpl_hdr_size + tpl_data_size + (2*ma)
+        - CASE WHEN tpl_hdr_size%ma = 0 THEN ma ELSE tpl_hdr_size%ma END
+        - CASE WHEN ceil(tpl_data_size)::int%ma = 0 THEN ma ELSE ceil(tpl_data_size)::int%ma END
+      ) AS tpl_size, bs - page_hdr AS size_per_block, (heappages + toastpages) AS tblpages, heappages,
+      toastpages, reltuples, toasttuples, bs, page_hdr, tblid, schemaname, tblname, fillfactor, is_na
+    FROM (
+      SELECT
+        tbl.oid AS tblid, ns.nspname AS schemaname, tbl.relname AS tblname, tbl.reltuples,
+        tbl.relpages AS heappages, coalesce(toast.relpages, 0) AS toastpages,
+        coalesce(toast.reltuples, 0) AS toasttuples,
+        coalesce(substring(
+          array_to_string(tbl.reloptions, ' ')
+          FROM 'fillfactor=([0-9]+)')::smallint, 100) AS fillfactor,
+        current_setting('block_size')::numeric AS bs,
+        CASE WHEN version()~'mingw32' OR version()~'64-bit|x86_64|ppc64|ia64|amd64' THEN 8 ELSE 4 END AS ma,
+        24 AS page_hdr,
+        23 + CASE WHEN MAX(coalesce(null_frac,0)) > 0 THEN ( 7 + count(*) ) / 8 ELSE 0::int END
+          + CASE WHEN tbl.relhasoids THEN 4 ELSE 0 END AS tpl_hdr_size,
+        sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 1024) ) AS tpl_data_size,
+        bool_or(att.atttypid = 'pg_catalog.name'::regtype)
+          OR count(att.attname) <> count(s.attname) AS is_na
+      FROM pg_attribute AS att
+        JOIN pg_class AS tbl ON att.attrelid = tbl.oid
+        JOIN pg_namespace AS ns ON ns.oid = tbl.relnamespace
+        LEFT JOIN pg_stats AS s ON s.schemaname=ns.nspname
+          AND s.tablename = tbl.relname AND s.inherited=false AND s.attname=att.attname
+        LEFT JOIN pg_class AS toast ON tbl.reltoastrelid = toast.oid
+      WHERE att.attnum > 0 AND NOT att.attisdropped
+        AND tbl.relkind = 'r'
+      GROUP BY 1,2,3,4,5,6,7,8,9,10, tbl.relhasoids
+      ORDER BY 2,3
+    ) AS s
+  ) AS s2
+) AS s3
+WHERE NOT is_na
+  -- AND tblpages*((pst).free_percent + (pst).dead_tuple_percent)::float4/100 >= 1
+  AND schemaname= 'public';
data/lib/gitlab_monitor/database/ci_builds.rb
@@ -0,0 +1,527 @@
+module GitLab
+  module Monitor
+    module Database
+      # A helper class to collect CI builds metrics.
+      class CiBuildsCollector < Base # rubocop:disable Metrics/ClassLength
+        SET_RANDOM_PAGE_COST = "SET LOCAL random_page_cost TO 1".freeze
+
+        BUILDS_QUERY_EE =
+          <<~SQL.freeze
+            SELECT
+              projects.namespace_id,
+              ci_builds.status,
+              projects.shared_runners_enabled,
+              (COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) = 0 OR
+               COALESCE(namespace_statistics.shared_runners_seconds, 0) < COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) * 60) as has_minutes,
+              COUNT(*) AS count
+            FROM ci_builds
+              JOIN projects
+                ON projects.id = ci_builds.project_id
+              JOIN namespaces
+                ON namespaces.id = projects.namespace_id
+              LEFT JOIN namespace_statistics
+                ON namespace_statistics.namespace_id = namespaces.id
+              JOIN application_settings
+                ON application_settings.id = 1
+            WHERE ci_builds.type = 'Ci::Build'
+              AND ci_builds.status = '%s'
+              AND projects.pending_delete = 'f'
+            GROUP BY
+              projects.namespace_id,
+              ci_builds.status,
+              projects.shared_runners_enabled,
+              namespaces.shared_runners_minutes_limit,
+              namespace_statistics.shared_runners_seconds,
+              application_settings.shared_runners_minutes
+          SQL
+
+        BUILDS_QUERY_CE =
+          <<~SQL.freeze
+            SELECT
+              projects.namespace_id,
+              ci_builds.status,
+              projects.shared_runners_enabled,
+              COUNT(*) AS count
+            FROM ci_builds
+              JOIN projects
+                ON projects.id = ci_builds.project_id
+            WHERE ci_builds.type = 'Ci::Build'
+              AND ci_builds.status = '%s'
+              AND projects.pending_delete = 'f'
+            GROUP BY
+              projects.namespace_id,
+              ci_builds.status,
+              projects.shared_runners_enabled
+          SQL
+
+        STALE_BUILDS_QUERY =
+          <<~SQL.freeze
+            SELECT
+              COUNT(*) AS count
+            FROM ci_builds
+              JOIN projects
+                ON projects.id = ci_builds.project_id
+            WHERE ci_builds.type = 'Ci::Build'
+              AND ci_builds.status = 'running'
+              AND ci_builds.updated_at < NOW() - INTERVAL '1 hour'
+              AND projects.pending_delete = 'f'
+          SQL
+
+        PER_RUNNER_QUERY_EE =
+          <<~SQL.freeze
+            SELECT
+              ci_builds.runner_id,
+              ci_runners.is_shared,
+              projects.namespace_id,
+              projects.mirror,
+              projects.mirror_trigger_builds,
+              ci_pipelines.pipeline_schedule_id,
+              ci_builds.trigger_request_id,
+              (COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) = 0 OR
+               COALESCE(namespace_statistics.shared_runners_seconds, 0) < COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) * 60) as has_minutes,
+              COUNT(*) AS count
+            FROM ci_builds
+              JOIN ci_runners
+                ON ci_runners.id = ci_builds.runner_id
+              JOIN projects
+                ON projects.id = ci_builds.project_id
+              JOIN ci_pipelines
+                ON ci_pipelines.id = ci_builds.commit_id
+              JOIN namespaces
+                ON namespaces.id = projects.namespace_id
+              LEFT JOIN namespace_statistics
+                ON namespace_statistics.namespace_id = namespaces.id
+              JOIN application_settings
+                ON application_settings.id = 1
+            WHERE ci_builds.type = 'Ci::Build'
+              AND ci_builds.status = 'running'
+              AND projects.pending_delete = 'f'
+            GROUP BY
+              ci_builds.runner_id,
+              ci_runners.is_shared,
+              projects.namespace_id,
+              projects.mirror,
+              projects.mirror_trigger_builds,
+              ci_pipelines.pipeline_schedule_id,
+              ci_builds.trigger_request_id,
+              namespaces.shared_runners_minutes_limit,
+              namespace_statistics.shared_runners_seconds,
+              application_settings.shared_runners_minutes
+          SQL
+
+        PER_RUNNER_QUERY_CE =
+          <<~SQL.freeze
+            SELECT
+              ci_builds.runner_id,
+              ci_runners.is_shared,
+              projects.namespace_id,
+              ci_pipelines.pipeline_schedule_id,
+              ci_builds.trigger_request_id,
+              COUNT(*) AS count
+            FROM ci_builds
+              JOIN ci_runners
+                ON ci_runners.id = ci_builds.runner_id
+              JOIN projects
+                ON projects.id = ci_builds.project_id
+              JOIN ci_pipelines
+                ON ci_pipelines.id = ci_builds.commit_id
+            WHERE ci_builds.type = 'Ci::Build'
+              AND ci_builds.status = 'running'
+              AND projects.pending_delete = 'f'
+            GROUP BY
+              ci_builds.runner_id,
+              ci_runners.is_shared,
+              projects.namespace_id,
+              ci_pipelines.pipeline_schedule_id,
+              ci_builds.trigger_request_id
+          SQL
+
+        MIRROR_COLUMN_QUERY =
+          <<~SQL.freeze
+            SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='projects' AND column_name='mirror')
+          SQL
+
+        REPEATED_COMMANDS_QUERY_EE =
+          <<~SQL.freeze
+            SELECT
+              subquery.namespace_id,
+              subquery.shared_runners_enabled,
+              subquery.project_id,
+              subquery.status,
+              subquery.has_minutes,
+              MAX(subquery.count) as count
+            FROM (
+              SELECT
+                projects.namespace_id,
+                projects.shared_runners_enabled,
+                ci_builds.project_id,
+                ci_builds.commit_id,
+                ci_builds.status,
+                (COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) = 0 OR
+                 COALESCE(namespace_statistics.shared_runners_seconds, 0) < COALESCE(namespaces.shared_runners_minutes_limit, application_settings.shared_runners_minutes, 0) * 60) as has_minutes,
+                COUNT(*) AS count
+              FROM ci_builds
+                JOIN projects
+                  ON projects.id = ci_builds.project_id
+                JOIN namespaces
+                  ON namespaces.id = projects.namespace_id
+                LEFT JOIN namespace_statistics
+                  ON namespace_statistics.namespace_id = namespaces.id
+                JOIN application_settings
+                  ON application_settings.id = 1
+              WHERE ci_builds.type = 'Ci::Build'
+                AND ci_builds.status IN ('running', 'pending')
+              GROUP BY
+                projects.namespace_id,
+                projects.shared_runners_enabled,
+                ci_builds.project_id,
+                ci_builds.commit_id,
+                ci_builds.status,
+                ci_builds.commands,
+                namespaces.shared_runners_minutes_limit,
+                namespace_statistics.shared_runners_seconds,
+                application_settings.shared_runners_minutes
+              HAVING COUNT(*) > %d
+            ) AS subquery
+            GROUP BY
+              subquery.namespace_id,
+              subquery.shared_runners_enabled,
+              subquery.project_id,
+              subquery.commit_id,
+              subquery.status,
+              subquery.has_minutes
+          SQL
+
+        REPEATED_COMMANDS_QUERY_CE =
+          <<~SQL.freeze
+            SELECT
+              subquery.namespace_id,
+              subquery.shared_runners_enabled,
+              subquery.project_id,
+              subquery.status,
+              MAX(subquery.count) as count
+            FROM (
+              SELECT
+                projects.namespace_id,
+                projects.shared_runners_enabled,
+                ci_builds.project_id,
+                ci_builds.commit_id,
+                ci_builds.status,
+                COUNT(*) AS count
+              FROM ci_builds
+                JOIN projects
+                  ON projects.id = ci_builds.project_id
+                JOIN namespaces
+                  ON namespaces.id = projects.namespace_id
+              WHERE ci_builds.type = 'Ci::Build'
+                AND ci_builds.status IN ('running', 'pending')
+              GROUP BY
+                projects.namespace_id,
+                projects.shared_runners_enabled,
+                ci_builds.project_id,
+                ci_builds.commit_id,
+                ci_builds.status,
+                ci_builds.commands
+              HAVING COUNT(*) > %d
+            ) AS subquery
+            GROUP BY
+              subquery.namespace_id,
+              subquery.shared_runners_enabled,
+              subquery.project_id,
+              subquery.commit_id,
+              subquery.status
+          SQL
+
+        UNARCHIVED_TRACES_QUERY =
+          <<~SQL.freeze
+            SELECT
+              COUNT(*) as count
+            FROM ci_builds
+              JOIN ci_build_trace_chunks
+                ON ci_build_trace_chunks.build_id = ci_builds.id
+              LEFT JOIN ci_job_artifacts
+                ON ci_job_artifacts.job_id = ci_builds.id
+                AND ci_job_artifacts.file_type = 3
+            WHERE ci_builds.type = 'Ci::Build'
+              AND ci_builds.status IN ('success', 'failed', 'canceled')
+              AND ci_builds.finished_at < '%s'
+              AND ci_job_artifacts.job_id IS NULL
+          SQL
+
+        STATUS_CREATED = "created".freeze
+        STATUS_PENDING = "pending".freeze
+
+        DEFAULT_UNARCHIVED_TRACES_OFFSET_MINUTES = 1440
+
+        def initialize(opts)
+          super(opts)
+
+          @allowed_repeated_commands_count = opts[:allowed_repeated_commands_count]
+          @created_builds_counting_disabled = opts[:created_builds_counting_disabled]
+          @unarchived_traces_offset_minutes = opts[:unarchived_traces_offset_minutes]
+        end
+
+        def run
+          results = {}
+          results[:created_builds] = builds(STATUS_CREATED) unless @created_builds_counting_disabled
+          results[:pending_builds] = builds(STATUS_PENDING)
+          results[:stale_builds] = stale_builds
+          results[:per_runner] = per_runner_builds
+          results[:repeated_commands] = repeated_commands
+          results[:unarchived_traces] = unarchived_traces
+          results
+        end
+
+        private
+
+        def builds(status)
+          results = []
+
+          query = mirror_column? ? BUILDS_QUERY_EE : BUILDS_QUERY_CE
+          query = query % [status] # rubocop:disable Style/FormatString
+          exec_query_with_custom_random_page_cost(query).each do |row|
+            results << transform_builds_row_to_values(row)
+          end
+
+          results
+        rescue PG::UndefinedTable, PG::UndefinedColumn
+          results
+        end
+
+        def transform_builds_row_to_values(row)
+          values = { namespace: row["namespace_id"].to_s,
+                     shared_runners: row["shared_runners_enabled"] == "t" ? "yes" : "no",
+                     value: row["count"].to_i }
+          include_ee_fields(values, row)
+        end
+
+        def stale_builds
+          with_connection_pool do |conn|
+            conn.exec(STALE_BUILDS_QUERY)[0]["count"].to_i
+          end
+        rescue PG::UndefinedTable, PG::UndefinedColumn
+          0
+        end
+
+        def per_runner_builds
+          results = []
+
+          query = mirror_column? ? PER_RUNNER_QUERY_EE : PER_RUNNER_QUERY_CE
+          exec_query_with_custom_random_page_cost(query).each do |row|
+            results << transform_per_runners_builds_row_to_values(row)
+          end
+
+          results
+        rescue PG::UndefinedTable, PG::UndefinedColumn
+          []
+        end
+
+        def transform_per_runners_builds_row_to_values(row)
+          values = { runner: row["runner_id"].to_s,
+                     shared_runner: row["is_shared"] == "t" ? "yes" : "no",
+                     namespace: row["namespace_id"].to_s,
+                     scheduled: row["pipeline_schedule_id"] ? "yes" : "no",
+                     triggered: row["trigger_request_id"] ? "yes" : "no",
+                     value: row["count"].to_i }
+          include_ee_fields(values, row)
+        end
+
+        def repeated_commands
+          results = []
+
+          query = mirror_column? ? REPEATED_COMMANDS_QUERY_EE : REPEATED_COMMANDS_QUERY_CE
+          query = query % [allowed_repeated_commands_count] # rubocop:disable Style/FormatString
+          exec_query_with_custom_random_page_cost(query).each do |row|
+            results << transform_repeated_commands_row_to_values(row)
+          end
+
+          results
+        rescue PG::UndefinedTable, PG::UndefinedColumn
+          []
+        end
+
+        def allowed_repeated_commands_count
+          @allowed_repeated_commands_count ||= 2
+        end
+
+        def transform_repeated_commands_row_to_values(row)
+          values = { namespace: row["namespace_id"].to_s,
+                     project: row["project_id"].to_s,
+                     shared_runners: row["shared_runners_enabled"] == "t" ? "yes" : "no",
+                     status: row["status"].to_s,
+                     value: row["count"].to_i }
+
+          include_has_minutes_field(values, row)
+        end
+
+        def unarchived_traces
+          time = Time.now - (unarchived_traces_offset_minutes * 60)
+          query = UNARCHIVED_TRACES_QUERY % [time.strftime("%F %T")] # rubocop:disable Style/FormatString
+
+          with_connection_pool do |conn|
+            conn.exec(query)[0]["count"].to_i
+          end
+        rescue PG::UndefinedTable, PG::UndefinedColumn
+          0
+        end
+
+        def unarchived_traces_offset_minutes
+          @unarchived_traces_offset_minutes ||= DEFAULT_UNARCHIVED_TRACES_OFFSET_MINUTES
+        end
+
+        def include_ee_fields(values, row)
+          values.merge!(include_bool_if_row_defined(row, :mirror))
+          values.merge!(include_bool_if_row_defined(row, :mirror_trigger_builds))
+          include_has_minutes_field(values, row)
+        end
+
+        def include_has_minutes_field(values, row)
+          values.merge!(include_bool_if_row_defined(row, :has_minutes))
+          values
+        end
+
+        def include_bool_if_row_defined(row, field)
+          return {} unless row[field.to_s]
+          { field => row[field.to_s] == "t" ? "yes" : "no" }
+        end
+
+        def exec_query_with_custom_random_page_cost(query)
+          with_connection_pool do |conn|
+            conn.transaction do |trans|
+              trans.exec(SET_RANDOM_PAGE_COST)
+              trans.exec(query)
+            end
+          end
+        end
+
+        def mirror_column?
+          @mirror_column ||=
+            begin
+              with_connection_pool do |conn|
+                conn.exec(MIRROR_COLUMN_QUERY)[0]["exists"] == "t"
+              end
+            rescue PG::UndefinedColumn
+              false
+            end
+        end
+      end
+
+      # The prober which is called when gathering metrics
+      class CiBuildsProber
+        def initialize(opts, metrics: PrometheusMetrics.new)
+          @metrics = metrics
+
+          collector_opts = { connection_string: opts[:connection_string],
+                             allowed_repeated_commands_count: opts[:allowed_repeated_commands_count],
+                             created_builds_counting_disabled: opts[:created_builds_counting_disabled],
+                             unarchived_traces_offset_minutes: opts[:unarchived_traces_offset_minutes] }
+          @collector = CiBuildsCollector.new(collector_opts)
+        end
+
+        def probe_db
+          @results = @collector.run
+
+          ci_builds_metrics(@results[:created_builds], "ci_created_builds") if @results[:created_builds]
+          ci_builds_metrics(@results[:pending_builds], "ci_pending_builds")
+          ci_stale_builds_metrics
+          metrics_per_runner
+          repeated_commands_metrics
+          unarchived_traces_metrics
+
+          self
+        rescue PG::ConnectionBad
+          self
+        end
+
+        def write_to(target)
+          target.write(@metrics.to_s)
+        end
+
+        private
+
+        def ci_builds_metrics(results_list, metric_name)
+          other_values = {}
+
+          results_list.each do |metric|
+            # If we have a low value, put the value into an "other" bucket.
+            if metric[:value] < 10
+              key = { shared_runners: metric[:shared_runners] }
+              key[:has_minutes] = metric[:has_minutes] if metric[:has_minutes]
+
+              other_values[key] ||= 0
+              other_values[key] += metric[:value]
+            else
+              add_ci_created_pending_builds(metric_name, metric[:value], metric)
+            end
+          end
+
+          # Add metrics for the "other" bucket.
+          other_values.each { |key, value| add_ci_created_pending_builds(metric_name, value, key) }
+        end
+
+        def add_ci_created_pending_builds(metric_name, value, labels)
+          add_metric_with_namespace_label(metric_name,
+                                          [:namespace, :shared_runners, :has_minutes],
+                                          value,
+                                          labels)
+        end
+
+        def ci_stale_builds_metrics
+          @metrics.add("ci_stale_builds", @results[:stale_builds])
+        end
+
+        def metrics_per_runner
+          other_values = {}
+
+          @results[:per_runner].each do |metric|
+            # If we have a low value, put the value into an "other" bucket.
+            if metric[:value] < 10
+              key = { runner: metric[:runner], shared_runner: metric[:shared_runner],
+                      scheduled: metric[:scheduled], triggered: metric[:triggered] }
+              key[:mirror] = metric[:mirror] if metric[:mirror]
+              key[:mirror_trigger_builds] = metric[:mirror_trigger_builds] if metric[:mirror_trigger_builds]
+              key[:has_minutes] = metric[:has_minutes] if metric[:has_minutes]
+
+              other_values[key] ||= 0
+              other_values[key] += metric[:value]
+            else
+              add_ci_running_builds(metric[:value], metric)
+            end
+          end
+
+          # Add metrics for the "other" bucket.
+          other_values.each { |key, value| add_ci_running_builds(value, key) }
+        end
+
+        def add_ci_running_builds(value, labels)
+          add_metric_with_namespace_label(
+            "ci_running_builds",
+            [:runner, :namespace, :shared_runner, :scheduled,
+             :triggered, :mirror, :mirror_trigger_builds, :has_minutes],
+            value,
+            labels
+          )
+        end
+
+        def add_metric_with_namespace_label(metric_name, allowed_labels, value, labels)
+          labels[:namespace] = "" unless labels[:namespace]
+
+          selected_labels = labels.select { |k, _| allowed_labels.include?(k) }.sort.to_h
+          @metrics.add(metric_name, value, selected_labels)
+        end
+
+        def repeated_commands_metrics
+          @results[:repeated_commands].each do |metric|
+            value = metric.delete(:value)
+
+            @metrics.add("ci_repeated_commands_builds", value, metric)
+          end
+        end
+
+        def unarchived_traces_metrics
+          @metrics.add("ci_unarchived_traces", @results[:unarchived_traces])
+        end
+      end
+    end
+  end
+end
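As with the other probers, `CiBuildsProber` is driven through `probe_db`/`write_to`. A hedged sketch of how it would be exercised; the option values and output target below are assumptions (see config/gitlab-monitor.yml.example for the real knobs):

```ruby
# Illustrative only.
require "gitlab_monitor"

prober = GitLab::Monitor::Database::CiBuildsProber.new(
  { connection_string: "dbname=gitlabhq_production",
    allowed_repeated_commands_count: 2,
    created_builds_counting_disabled: false,
    unarchived_traces_offset_minutes: 1440 }
)

prober.probe_db          # ci_created_builds, ci_pending_builds, ci_stale_builds, ci_running_builds, ...
prober.write_to($stdout) # render the collected metrics
```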