pghero 2.4.0 → 2.6.0


@@ -32,13 +32,34 @@ module PgHero

       private

-      def select_all(sql, conn = nil)
+      def select_all(sql, conn: nil, query_columns: [])
         conn ||= connection
         # squish for logs
         retries = 0
         begin
           result = conn.select_all(add_source(squish(sql)))
-          result.map { |row| Hash[row.map { |col, val| [col.to_sym, result.column_types[col].send(:cast_value, val)] }] }
+          result = result.map { |row| Hash[row.map { |col, val| [col.to_sym, result.column_types[col].send(:cast_value, val)] }] }
+          if filter_data
+            query_columns.each do |column|
+              result.each do |row|
+                begin
+                  row[column] = PgQuery.normalize(row[column])
+                rescue PgQuery::ParseError
+                  # try replacing "interval $1" with "$1::interval"
+                  # see https://github.com/lfittl/pg_query/issues/169 for more info
+                  # this is not ideal since it changes the query slightly
+                  # we could skip normalization
+                  # but this has a very small chance of data leakage
+                  begin
+                    row[column] = PgQuery.normalize(row[column].gsub(/\binterval\s+(\$\d+)\b/i, "\\1::interval"))
+                  rescue PgQuery::ParseError
+                    row[column] = "<unable to filter data>"
+                  end
+                end
+              end
+            end
+          end
+          result
         rescue ActiveRecord::StatementInvalid => e
           # fix for random internal errors
           if e.message.include?("PG::InternalError") && retries < 2
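
The new `filter_data` branch scrubs literal values from captured queries before they are shown or stored, using the pg_query gem. A minimal standalone sketch of the same technique (the example query is illustrative):

```ruby
require "pg_query" # gem "pg_query"

sql = "SELECT * FROM users WHERE email = 'test@example.com' LIMIT 10"

begin
  # constants are replaced with $n placeholders
  puts PgQuery.normalize(sql)
rescue PgQuery::ParseError
  # anything unparseable is masked entirely, as in the hunk above
  puts "<unable to filter data>"
end
```

The `interval $1` fallback exists because some inputs are already-normalized queries from pg_stat_statements, and re-parsing those can trip on that construct (the pg_query issue linked in the comment above).
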
@@ -51,8 +72,8 @@ module PgHero
         end
       end

-      def select_all_stats(sql)
-        select_all(sql, stats_connection)
+      def select_all_stats(sql, **options)
+        select_all(sql, **options, conn: stats_connection)
       end

       def select_all_size(sql)
@@ -63,12 +84,12 @@ module PgHero
         result
       end

-      def select_one(sql, conn = nil)
-        select_all(sql, conn).first.values.first
+      def select_one(sql, conn: nil)
+        select_all(sql, conn: conn).first.values.first
       end

       def select_one_stats(sql)
-        select_one(sql, stats_connection)
+        select_one(sql, conn: stats_connection)
       end

       def execute(sql)
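
Switching `conn` from a positional to a keyword argument lets the helpers forward arbitrary options (such as the new `query_columns`) without positional ambiguity. A toy illustration of the forwarding pattern (method names hypothetical):

```ruby
def fetch(sql, conn: nil, query_columns: [])
  [sql, conn, query_columns]
end

def fetch_stats(sql, **options)
  # extra keywords flow through untouched; conn is pinned here
  fetch(sql, **options, conn: :stats)
end

p fetch_stats("SELECT 1", query_columns: [:query])
# => ["SELECT 1", :stats, [:query]]
```
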
@@ -1,6 +1,41 @@
 module PgHero
   module Methods
     module Connections
+      def connections
+        if server_version_num >= 90500
+          select_all <<-SQL
+            SELECT
+              pg_stat_activity.pid,
+              datname AS database,
+              usename AS user,
+              application_name AS source,
+              client_addr AS ip,
+              state,
+              ssl
+            FROM
+              pg_stat_activity
+            LEFT JOIN
+              pg_stat_ssl ON pg_stat_activity.pid = pg_stat_ssl.pid
+            ORDER BY
+              pg_stat_activity.pid
+          SQL
+        else
+          select_all <<-SQL
+            SELECT
+              pid,
+              datname AS database,
+              usename AS user,
+              application_name AS source,
+              client_addr AS ip,
+              state
+            FROM
+              pg_stat_activity
+            ORDER BY
+              pid
+          SQL
+        end
+      end
+
       def total_connections
         select_one("SELECT COUNT(*) FROM pg_stat_activity")
       end
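
`pg_stat_ssl` only exists on Postgres 9.5+, hence the `server_version_num >= 90500` gate. Rows come back from `select_all` as symbol-keyed hashes, so a consumer might look like this (output shape inferred from the SELECT list above):

```ruby
PgHero.primary_database.connections.each do |conn|
  puts [conn[:pid], conn[:database], conn[:user], conn[:source], conn[:state], conn[:ssl]].join(" | ")
end
```
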
@@ -6,7 +6,7 @@ module PgHero
       explanation = nil

       # use transaction for safety
-      with_transaction(statement_timeout: (explain_timeout_sec * 1000), rollback: true) do
+      with_transaction(statement_timeout: (explain_timeout_sec * 1000).round, rollback: true) do
         if (sql.sub(/;\z/, "").include?(";") || sql.upcase.include?("COMMIT")) && !explain_safe?
           raise ActiveRecord::StatementInvalid, "Unsafe statement"
         end
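
`statement_timeout` takes an integer number of milliseconds, so a fractional `explain_timeout_sec` (say, a float parsed from configuration) needs `.round` to avoid handing Postgres a value like `1500.0`:

```ruby
explain_timeout_sec = 1.5           # plausible float config value
explain_timeout_sec * 1000          # => 1500.0, a Float Postgres rejects for this setting
(explain_timeout_sec * 1000).round  # => 1500, a clean Integer
```
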
@@ -57,7 +57,9 @@ module PgHero
           last_vacuum,
           last_autovacuum,
           last_analyze,
-          last_autoanalyze
+          last_autoanalyze,
+          n_dead_tup AS dead_rows,
+          n_live_tup AS live_rows
         FROM
           pg_stat_user_tables
         ORDER BY
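
This hunk reads from `pg_stat_user_tables`, so the new `dead_rows` and `live_rows` columns expose `n_dead_tup`/`n_live_tup` for vacuum triage. A hypothetical consumer, assuming this is PgHero's `maintenance_info` and the row keys match the aliases above:

```ruby
PgHero.primary_database.maintenance_info.each do |table|
  total = table[:live_rows] + table[:dead_rows]
  next if total.zero?
  pct_dead = 100.0 * table[:dead_rows] / total
  puts "#{table[:table]}: #{pct_dead.round(1)}% dead rows" if pct_dead > 10
end
```
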
@@ -2,7 +2,7 @@ module PgHero
   module Methods
     module Queries
       def running_queries(min_duration: nil, all: false)
-        select_all <<-SQL
+        query = <<-SQL
           SELECT
             pid,
             state,
@@ -24,6 +24,8 @@ module PgHero
           ORDER BY
             COALESCE(query_start, xact_start) DESC
         SQL
+
+        select_all(query, query_columns: [:query])
       end

       def long_running_queries
@@ -33,7 +35,7 @@ module PgHero
       # from https://wiki.postgresql.org/wiki/Lock_Monitoring
       # and https://big-elephants.com/2013-09/exploring-query-locks-in-postgres/
       def blocked_queries
-        select_all <<-SQL
+        query = <<-SQL
           SELECT
             COALESCE(blockingl.relation::regclass::text,blockingl.locktype) as locked_item,
             blockeda.pid AS blocked_pid,
@@ -65,6 +67,8 @@ module PgHero
           ORDER BY
             blocked_duration DESC
         SQL
+
+        select_all(query, query_columns: [:blocked_query, :current_or_recent_query_in_blocking_process])
       end
     end
   end
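
The pattern repeated across these hunks: build the SQL into a local, then pass it to `select_all` along with the names of result columns that contain raw SQL, so the `filter_data` path in `select_all` knows what to scrub. With filtering turned on (config key assumed from this release's `filter_data` option), callers see normalized text:

```ruby
# with filter_data enabled for the database (e.g. in config/pghero.yml)
PgHero.primary_database.blocked_queries.each do |row|
  # literals in both SQL-bearing columns are replaced with $1, $2, ...
  puts row[:blocked_query]
end
```
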
@@ -2,7 +2,7 @@ module PgHero
   module Methods
     module QueryStats
       def query_stats(historical: false, start_at: nil, end_at: nil, min_average_time: nil, min_calls: nil, **options)
-        current_query_stats = historical && end_at && end_at < Time.now ? [] : current_query_stats(options)
+        current_query_stats = historical && end_at && end_at < Time.now ? [] : current_query_stats(**options)
         historical_query_stats = historical && historical_query_stats_enabled? ? historical_query_stats(start_at: start_at, end_at: end_at, **options) : []

         query_stats = combine_query_stats((current_query_stats + historical_query_stats).group_by { |q| [q[:query_hash], q[:user]] })
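
Passing an options hash where keyword arguments are expected is deprecated on Ruby 2.7 and an error on Ruby 3, which is what the explicit double splat fixes here (and in the `suggested_indexes` hunk further down). A minimal reproduction:

```ruby
def stats(sort: nil, limit: nil)
  [sort, limit]
end

options = {sort: "calls", limit: 10}
stats(**options)  # => ["calls", 10] on every Ruby version
# stats(options)  # warns on Ruby 2.7, raises ArgumentError on Ruby 3
```
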
@@ -166,14 +166,15 @@ module PgHero
       if query_stats_enabled?
         limit ||= 100
         sort ||= "total_minutes"
-        select_all <<-SQL
+        total_time = server_version_num >= 130000 ? "(total_plan_time + total_exec_time)" : "total_time"
+        query = <<-SQL
           WITH query_stats AS (
             SELECT
               LEFT(query, 10000) AS query,
               #{supports_query_hash? ? "queryid" : "md5(query)"} AS query_hash,
               rolname AS user,
-              (total_time / 1000 / 60) AS total_minutes,
-              (total_time / calls) AS average_time,
+              (#{total_time} / 1000 / 60) AS total_minutes,
+              (#{total_time} / calls) AS average_time,
               calls
             FROM
               pg_stat_statements
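
Postgres 13 split `pg_stat_statements.total_time` into `total_plan_time` and `total_exec_time`, so the column expression is now chosen by server version. `server_version_num` uses Postgres's integer encoding (since version 10, `major * 10000 + minor`):

```ruby
# Postgres 13.2 reports server_version_num 130002
server_version_num = 13 * 10_000 + 2
server_version_num >= 130000  # => true: sum plan and execution time
```
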
@@ -182,6 +183,7 @@ module PgHero
             INNER JOIN
               pg_roles ON pg_roles.oid = pg_stat_statements.userid
             WHERE
+              calls > 0 AND
               pg_database.datname = #{database ? quote(database) : "current_database()"}
               #{query_hash ? "AND queryid = #{quote(query_hash)}" : nil}
           )
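
The `calls > 0` guard presumably protects the `#{total_time} / calls` division above; on Postgres 13 with `pg_stat_statements.track_planning`, a statement can be planned but never executed, leaving `calls` at zero. The same check in Ruby terms:

```ruby
row = {total_time: 4.2, calls: 0}  # a planned-but-never-executed statement
row[:calls] > 0 ? row[:total_time] / row[:calls] : :filtered_out_in_sql
```
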
@@ -200,6 +202,11 @@ module PgHero
             #{quote_table_name(sort)} DESC
           LIMIT #{limit.to_i}
         SQL
+
+        # we may be able to skip query_columns
+        # in more recent versions of Postgres
+        # as pg_stat_statements should be already normalized
+        select_all(query, query_columns: [:query])
       else
         raise NotEnabled, "Query stats not enabled"
       end
@@ -208,7 +215,7 @@ module PgHero
       def historical_query_stats(sort: nil, start_at: nil, end_at: nil, query_hash: nil)
         if historical_query_stats_enabled?
           sort ||= "total_minutes"
-          select_all_stats <<-SQL
+          query = <<-SQL
             WITH query_stats AS (
               SELECT
                 #{supports_query_hash? ? "query_hash" : "md5(query)"} AS query_hash,
@@ -244,6 +251,10 @@ module PgHero
               #{quote_table_name(sort)} DESC
             LIMIT 100
           SQL
+
+          # we can skip query_columns if all stored data is normalized
+          # for now, assume it's not
+          select_all_stats(query, query_columns: [:query, :explainable_query])
         else
           raise NotEnabled, "Historical query stats not enabled"
         end
@@ -48,7 +48,7 @@ module PgHero
       def suggested_indexes(suggested_indexes_by_query: nil, **options)
         indexes = []

-        (suggested_indexes_by_query || self.suggested_indexes_by_query(options)).select { |_s, i| i[:found] && !i[:covering_index] }.group_by { |_s, i| i[:index] }.each do |index, group|
+        (suggested_indexes_by_query || self.suggested_indexes_by_query(**options)).select { |_s, i| i[:found] && !i[:covering_index] }.group_by { |_s, i| i[:index] }.each do |index, group|
          details = {}
          group.map(&:second).each do |g|
            details = details.except(:index).deep_merge(g)
@@ -1,31 +1,46 @@
 module PgHero
   module Methods
     module System
+      def system_stats_enabled?
+        !system_stats_provider.nil?
+      end
+
+      # TODO remove defined checks in 3.0
+      def system_stats_provider
+        if aws_db_instance_identifier && (defined?(Aws) || defined?(AWS))
+          :aws
+        elsif gcp_database_id
+          :gcp
+        elsif azure_resource_id
+          :azure
+        end
+      end
+
       def cpu_usage(**options)
-        rds_stats("CPUUtilization", options)
+        system_stats(:cpu, **options)
       end

       def connection_stats(**options)
-        rds_stats("DatabaseConnections", options)
+        system_stats(:connections, **options)
       end

       def replication_lag_stats(**options)
-        rds_stats("ReplicaLag", options)
+        system_stats(:replication_lag, **options)
       end

       def read_iops_stats(**options)
-        rds_stats("ReadIOPS", options)
+        system_stats(:read_iops, **options)
       end

       def write_iops_stats(**options)
-        rds_stats("WriteIOPS", options)
+        system_stats(:write_iops, **options)
       end

       def free_space_stats(**options)
-        rds_stats("FreeStorageSpace", options)
+        system_stats(:free_space, **options)
       end

-      def rds_stats(metric_name, duration: nil, period: nil, offset: nil)
+      def rds_stats(metric_name, duration: nil, period: nil, offset: nil, series: false)
         if system_stats_enabled?
           aws_options = {region: region}
           if access_key_id
@@ -43,16 +58,14 @@ module PgHero
           duration = (duration || 1.hour).to_i
           period = (period || 1.minute).to_i
           offset = (offset || 0).to_i
-
-          end_time = (Time.now - offset)
-          # ceil period
-          end_time = Time.at((end_time.to_f / period).ceil * period)
+          end_time = Time.at(((Time.now - offset).to_f / period).ceil * period)
+          start_time = end_time - duration

           resp = client.get_metric_statistics(
             namespace: "AWS/RDS",
             metric_name: metric_name,
-            dimensions: [{name: "DBInstanceIdentifier", value: db_instance_identifier}],
-            start_time: (end_time - duration).iso8601,
+            dimensions: [{name: "DBInstanceIdentifier", value: aws_db_instance_identifier}],
+            start_time: start_time.iso8601,
             end_time: end_time.iso8601,
             period: period,
             statistics: ["Average"]
@@ -61,14 +74,213 @@ module PgHero
           resp[:datapoints].sort_by { |d| d[:timestamp] }.each do |d|
             data[d[:timestamp]] = d[:average]
           end
+
+          add_missing_data(data, start_time, end_time, period) if series
+
           data
         else
           raise NotEnabled, "System stats not enabled"
         end
       end

-      def system_stats_enabled?
-        !!((defined?(Aws) || defined?(AWS)) && db_instance_identifier)
+      def azure_stats(metric_name, duration: nil, period: nil, offset: nil, series: false)
+        # TODO DRY with RDS stats
+        duration = (duration || 1.hour).to_i
+        period = (period || 1.minute).to_i
+        offset = (offset || 0).to_i
+        end_time = Time.at(((Time.now - offset).to_f / period).ceil * period)
+        start_time = end_time - duration
+
+        interval =
+          case period
+          when 60
+            "PT1M"
+          when 300
+            "PT5M"
+          when 900
+            "PT15M"
+          when 1800
+            "PT30M"
+          when 3600
+            "PT1H"
+          else
+            raise Error, "Unsupported period"
+          end
+
+        client = Azure::Monitor::Profiles::Latest::Mgmt::Client.new
+        timespan = "#{start_time.iso8601}/#{end_time.iso8601}"
+        results = client.metrics.list(
+          azure_resource_id,
+          metricnames: metric_name,
+          aggregation: "Average",
+          timespan: timespan,
+          interval: interval
+        )
+
+        data = {}
+        result = results.value.first
+        if result
+          result.timeseries.first.data.each do |point|
+            data[point.time_stamp.to_time] = point.average
+          end
+        end
+
+        add_missing_data(data, start_time, end_time, period) if series
+
+        data
+      end
+
+      private
+
+      def gcp_stats(metric_name, duration: nil, period: nil, offset: nil, series: false)
+        require "google/cloud/monitoring/v3"
+
+        # TODO DRY with RDS stats
+        duration = (duration || 1.hour).to_i
+        period = (period || 1.minute).to_i
+        offset = (offset || 0).to_i
+        end_time = Time.at(((Time.now - offset).to_f / period).ceil * period)
+        start_time = end_time - duration
+
+        # validate input since we need to interpolate below
+        raise Error, "Invalid metric name" unless metric_name =~ /\A[a-z\/_]+\z/i
+        raise Error, "Invalid database id" unless gcp_database_id =~ /\A[a-z\-:]+\z/i
+
+        # we handle three situations:
+        # 1. google-cloud-monitoring-v3
+        # 2. google-cloud-monitoring >= 1
+        # 3. google-cloud-monitoring < 1
+
+        # for situations 1 and 2
+        # Google::Cloud::Monitoring.metric_service is documented
+        # but doesn't work for situation 1
+        if defined?(Google::Cloud::Monitoring::V3::MetricService::Client)
+          client = Google::Cloud::Monitoring::V3::MetricService::Client.new
+
+          interval = Google::Cloud::Monitoring::V3::TimeInterval.new
+          interval.end_time = Google::Protobuf::Timestamp.new(seconds: end_time.to_i)
+          # subtract period to make sure we get first data point
+          interval.start_time = Google::Protobuf::Timestamp.new(seconds: (start_time - period).to_i)
+
+          aggregation = Google::Cloud::Monitoring::V3::Aggregation.new
+          # may be better to use ALIGN_NEXT_OLDER for space stats to show most recent data point
+          # stick with average for now to match AWS
+          aggregation.per_series_aligner = Google::Cloud::Monitoring::V3::Aggregation::Aligner::ALIGN_MEAN
+          aggregation.alignment_period = period
+
+          results = client.list_time_series({
+            name: "projects/#{gcp_database_id.split(":").first}",
+            filter: "metric.type = \"cloudsql.googleapis.com/database/#{metric_name}\" AND resource.label.database_id = \"#{gcp_database_id}\"",
+            interval: interval,
+            view: Google::Cloud::Monitoring::V3::ListTimeSeriesRequest::TimeSeriesView::FULL,
+            aggregation: aggregation
+          })
+        else
+          require "google/cloud/monitoring"
+
+          client = Google::Cloud::Monitoring::Metric.new
+
+          interval = Google::Monitoring::V3::TimeInterval.new
+          interval.end_time = Google::Protobuf::Timestamp.new(seconds: end_time.to_i)
+          # subtract period to make sure we get first data point
+          interval.start_time = Google::Protobuf::Timestamp.new(seconds: (start_time - period).to_i)
+
+          aggregation = Google::Monitoring::V3::Aggregation.new
+          # may be better to use ALIGN_NEXT_OLDER for space stats to show most recent data point
+          # stick with average for now to match AWS
+          aggregation.per_series_aligner = Google::Monitoring::V3::Aggregation::Aligner::ALIGN_MEAN
+          aggregation.alignment_period = period
+
+          results = client.list_time_series(
+            "projects/#{gcp_database_id.split(":").first}",
+            "metric.type = \"cloudsql.googleapis.com/database/#{metric_name}\" AND resource.label.database_id = \"#{gcp_database_id}\"",
+            interval,
+            Google::Monitoring::V3::ListTimeSeriesRequest::TimeSeriesView::FULL,
+            aggregation: aggregation
+          )
+        end
+
+        data = {}
+        result = results.first
+        if result
+          result.points.each do |point|
+            time = Time.at(point.interval.start_time.seconds)
+            value = point.value.double_value
+            value *= 100 if metric_name == "cpu/utilization"
+            data[time] = value
+          end
+        end
+
+        add_missing_data(data, start_time, end_time, period) if series
+
+        data
+      end
+
+      def system_stats(metric_key, **options)
+        case system_stats_provider
+        when :aws
+          metrics = {
+            cpu: "CPUUtilization",
+            connections: "DatabaseConnections",
+            replication_lag: "ReplicaLag",
+            read_iops: "ReadIOPS",
+            write_iops: "WriteIOPS",
+            free_space: "FreeStorageSpace"
+          }
+          rds_stats(metrics[metric_key], **options)
+        when :gcp
+          if metric_key == :free_space
+            quota = gcp_stats("disk/quota", **options)
+            used = gcp_stats("disk/bytes_used", **options)
+            free_space(quota, used)
+          else
+            metrics = {
+              cpu: "cpu/utilization",
+              connections: "postgresql/num_backends",
+              replication_lag: "replication/replica_lag",
+              read_iops: "disk/read_ops_count",
+              write_iops: "disk/write_ops_count"
+            }
+            gcp_stats(metrics[metric_key], **options)
+          end
+        when :azure
+          if metric_key == :free_space
+            quota = azure_stats("storage_limit", **options)
+            used = azure_stats("storage_used", **options)
+            free_space(quota, used)
+          else
+            # no read_iops, write_iops
+            # could add io_consumption_percent
+            metrics = {
+              cpu: "cpu_percent",
+              connections: "active_connections",
+              replication_lag: "pg_replica_log_delay_in_seconds"
+            }
+            raise Error, "Metric not supported" unless metrics[metric_key]
+            azure_stats(metrics[metric_key], **options)
+          end
+        else
+          raise NotEnabled, "System stats not enabled"
+        end
+      end
+
+      # only use data points included in both series
+      # this also eliminates need to align Time.now
+      def free_space(quota, used)
+        data = {}
+        quota.each do |k, v|
+          data[k] = v - used[k] if v && used[k]
+        end
+        data
+      end
+
+      def add_missing_data(data, start_time, end_time, period)
+        time = start_time
+        end_time = end_time
+        while time < end_time
+          data[time] ||= nil
+          time += period
+        end
       end
     end
   end
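
With provider detection in place, the public stats methods are cloud-agnostic: `system_stats` maps a generic metric key to the provider-specific metric name and delegates to `rds_stats`, `gcp_stats`, or `azure_stats`. A sketch of the caller's side (assumes one of `aws_db_instance_identifier`, `gcp_database_id`, or `azure_resource_id` is configured):

```ruby
db = PgHero.primary_database
if db.system_stats_enabled?
  # 24 hours of CPU utilization at 5-minute resolution
  db.cpu_usage(duration: 24 * 60 * 60, period: 5 * 60).each do |time, pct|
    puts "#{time}: #{pct&.round(1)}%"
  end
end
```

On GCP and Azure, `free_space` is derived by subtracting used bytes from quota, keeping only timestamps present in both series, while `add_missing_data` pads gaps with `nil` so charts can show missing intervals when `series: true` is passed.
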