fluent-plugin-postgresql-csvlog 0.3.2 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5d61fc31718e43c6d1dff46139a1b03d56384405d45b69c9772dc6f7b6a66dbf
4
- data.tar.gz: 5d2d23a4a7b5f277b19f181aa515ab74718f6ef93824e2c958264d9bbdf9c9aa
3
+ metadata.gz: 38090772bf1e0302ecd53d759d8da4ffb2df0cdf0a6f001ec0e0d12390472276
4
+ data.tar.gz: 22bc8479b81f0b0fd7615cfb6cc1b491b317fc74d612f21373243a516e671be4
5
5
  SHA512:
6
- metadata.gz: f2106f60749b6fa8fc931ccd3d85f51595b0ec60eb9f425935072b1749ed068e8a068c6d513f1557a20e8c5a0613acad80c6690c7ce2aa20cca3027d686c388a
7
- data.tar.gz: 9ceef623cbd5256e047dea817d487d2711cc8b387ff068a10913100fe649e77174a600ac019b4729403390152c1a3e6f162accf4cc50f020561032b6493d965f
6
+ metadata.gz: c3ecfef7d3290ab00a71613c7914c0aae1d05501d37574614d9d6f4a2f5de9548474ecb7b6fdad4ef778c4ea48510a1e31a89294e317ecf7ab6004e094579cc9
7
+ data.tar.gz: 2b91a1704d53f07ab5139bc2d623c0c17afde53b416ef5f018dd7e6d345ce8a8da1d0aa8964bcef638b8e6ab351b2d7c16b83be73cbdcfba560dda69962bdd17
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ Gemfile.lock
data/.gitlab-ci.yml CHANGED
@@ -10,10 +10,9 @@ test:
10
10
  paths:
11
11
  - vendor/ruby
12
12
 
13
- # integration tests
14
- itest:
13
+ .iteration_test:
15
14
  services:
16
- - name: postgres:12
15
+ - name: postgres:$POSTGRES_SERVER_VERSION
17
16
  alias: postgres
18
17
  command: ["postgres", "-c", "shared_preload_libraries=pg_stat_statements", "-c", "pg_stat_statements.track=all"]
19
18
  variables:
@@ -27,3 +26,28 @@ itest:
27
26
  cache:
28
27
  paths:
29
28
  - vendor/ruby
29
+
30
+ # integration tests for postgres 12
31
+ itest_pg12:
32
+ extends: .iteration_test
33
+ variables:
34
+ POSTGRES_SERVER_VERSION: 12
35
+
36
+ # integration tests for postgres 13
37
+ itest_pg13:
38
+ extends: .iteration_test
39
+ variables:
40
+ POSTGRES_SERVER_VERSION: 13
41
+
42
+ end_to_end_verification_test:
43
+ image: docker:19.03.12
44
+ services:
45
+ - docker:19.03.12-dind
46
+ tags:
47
+ - gitlab-org-docker
48
+ variables:
49
+ DOCKER_TLS_CERTDIR: ""
50
+ before_script:
51
+ - apk add --no-cache docker-compose
52
+ script:
53
+ - docker-compose run --rm verifier
data/README.md CHANGED
@@ -7,6 +7,8 @@ parse PostgreSQL CSV log files and extract slow log information:
7
7
  - `PostgreSQLSlowLog`: Extracts slow log entries into `duration_s` and `statement` fields
8
8
  - `PostgreSQLRedactor`: Normalizes the SQL query and redacts sensitive information
9
9
  - `Marginalia`: Parses [Marginalia comments](https://github.com/basecamp/marginalia) into key-value pairs and stores them
10
+ - `PgStatStatementsInput`: polls the [`pg_stat_statements`](https://www.postgresql.org/docs/current/pgstatstatements.html) postgres plugin and emits fluentd events.
11
+ - `PgStatActivityInput`: polls the [`postgres activity monitor`](https://www.postgresql.org/docs/current/monitoring-stats.html) and emits fluentd events.
10
12
 
11
13
  ## Installation
12
14
 
@@ -72,3 +74,18 @@ ingest and parse PostgreSQL CSV logs:
72
74
  </format>
73
75
  </match>
74
76
  ```
77
+
78
+ ## Developing `fluent-plugin-postgresql-csvlog`
79
+
80
+ To develop and debug locally, there is a `Dockerfile` and a `docker-compose.yml` that will set up a local environment,
81
+ complete with Postgres, suitable for testing purposes.
82
+
83
+ 1. `docker compose build` - build the current configuration
84
+ 1. `docker compose run --rm verifier` - test the current configuration
85
+ 1. `docker compose up`
86
+
87
+ ### Releasing a new version
88
+
89
+ 1. Update the version in `fluent-plugin-postgresql-csvlog.gemspec`.
90
+ 1. Create a merge request and merge the changes to `master`.
91
+ 1. Run `bundle exec rake release`.
data/docker-compose.yml CHANGED
@@ -1,14 +1,9 @@
1
1
  # Docker Compose setup useful for testing and development purposes
2
- version: "3.9"
2
+ version: "3.3"
3
3
  services:
4
- fluentd:
5
- build: .
6
- links:
7
- - postgres
8
- entrypoint: /usr/bin/fluentd -vvv -c /src/example-fluentd.conf
9
4
  postgres:
10
- image: postgres
11
- restart: always
5
+ image: postgres:13
6
+ restart: "no"
12
7
  environment:
13
8
  - POSTGRES_USER=testuser
14
9
  - POSTGRES_PASSWORD=testpass
@@ -17,3 +12,26 @@ services:
17
12
  command: postgres -c shared_preload_libraries=pg_stat_statements -c pg_stat_statements.track=all
18
13
  volumes:
19
14
  - ./sql/create_extension.sql:/docker-entrypoint-initdb.d/create_extension.sql
15
+
16
+ fluentd:
17
+ build: .
18
+ restart: "no"
19
+ links:
20
+ - postgres
21
+ entrypoint: /usr/bin/fluentd -vvv -c /src/example-fluentd.conf
22
+ volumes:
23
+ - ./example-fluentd.conf:/src/example-fluentd.conf
24
+ - log-volume:/var/log/pg/
25
+
26
+ verifier:
27
+ image: alpine:3.13
28
+ restart: "no"
29
+ links:
30
+ - fluentd
31
+ command: /bin/sh /src/verify-docker-compose.sh
32
+ volumes:
33
+ - ./test/verify-docker-compose.sh:/src/verify-docker-compose.sh
34
+ - log-volume:/var/log/pg/
35
+
36
+ volumes:
37
+ log-volume:
data/example-fluentd.conf CHANGED
@@ -8,5 +8,34 @@
8
8
  </source>
9
9
 
10
10
  <match postgres.pg_stat_statements>
11
- @type stdout
11
+ @type file
12
+ path /var/log/pg/pg_stat_statements
13
+ time_slice_format %Y%m%d%H%M%S
14
+ flush_interval 1s
15
+ utc
16
+
17
+ <format>
18
+ @type json
19
+ </format>
12
20
  </match>
21
+
22
+ <source>
23
+ @type pg_stat_activity
24
+ tag postgres.pg_stat_activity
25
+ host postgres
26
+ username testuser
27
+ password testpass
28
+ interval 1
29
+ </source>
30
+
31
+ <match postgres.pg_stat_activity>
32
+ @type file
33
+ path /var/log/pg/pg_stat_activity
34
+ time_slice_format %Y%m%d%H%M%S
35
+ flush_interval 1s
36
+ utc
37
+ <format>
38
+ @type json
39
+ </format>
40
+ </match>
41
+
@@ -2,7 +2,7 @@ $:.push File.expand_path('lib', __dir__)
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'fluent-plugin-postgresql-csvlog'
5
- s.version = '0.3.2'
5
+ s.version = '0.7.0'
6
6
  s.authors = ['stanhu']
7
7
  s.email = ['stanhu@gmail.com']
8
8
  s.homepage = 'https://gitlab.com/gitlab-org/fluent-plugins/fluent-plugin-postgresql-csvlog'
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'fluent/plugin/filter'
4
+ require_relative './marginalia_extractor'
4
5
 
5
6
  module Fluent
6
7
  module Plugin
@@ -11,6 +12,7 @@ module Fluent
11
12
  # /*application:sidekiq,correlation_id:d67cae54c169e0cab7d73389e2934f0e,jid:52a1c8a9e4c555ea573f20f0,job_class:Geo::MetricsUpdateWorker*/ SELECT COUNT(*) FROM "projects"
12
13
  #
13
14
  class Marginalia < Filter
15
+ include MarginaliaExtractor
14
16
  Fluent::Plugin.register_filter('marginalia', self)
15
17
 
16
18
  desc 'Field to parse for Marginalia comments (key1:value1,key2:value2)'
@@ -19,77 +21,11 @@ module Fluent
19
21
  desc 'Whether to strip the comment from the record specified by key'
20
22
  config_param :strip_comment, :bool, default: true
21
23
 
22
- MARGINALIA_PREPENDED_REGEXP = %r{^(?<comment>/\*.*\*/)(?<sql>.*)}m.freeze
23
- MARGINALIA_APPENDED_REGEXP = %r{(?<sql>.*)(?<comment>/\*.*\*/)$}m.freeze
24
-
25
24
  def filter(_tag, _time, record)
26
- parse_comments(record)
25
+ parse_marginalia_into_record(record, @key, @strip_comment)
27
26
 
28
27
  record
29
28
  end
30
-
31
- private
32
-
33
- def parse_comments(record)
34
- sql = record[@key]
35
-
36
- return unless sql
37
-
38
- comment_match = match_marginalia_comment(sql)
39
-
40
- return unless comment_match
41
-
42
- entries = extract_entries(comment_match['comment'])
43
- parse_entries(entries, record)
44
-
45
- record[@key] = comment_match['sql'].strip if @strip_comment
46
- end
47
-
48
- def match_marginalia_comment(sql)
49
- matched = MARGINALIA_PREPENDED_REGEXP.match(sql)
50
-
51
- return matched if matched
52
-
53
- MARGINALIA_APPENDED_REGEXP.match(sql)
54
- end
55
-
56
- def extract_entries(comment)
57
- comment = scrub_comment(comment)
58
-
59
- return [] unless comment
60
-
61
- comment.split(',')
62
- end
63
-
64
- def scrub_comment(comment)
65
- return unless comment
66
-
67
- comment.strip!
68
- comment.gsub!(%r{^/\*}, '')
69
- comment.gsub!(%r{\*/$}, '')
70
- end
71
-
72
- def parse_entries(entries, record)
73
- entries.each do |component|
74
- data = component.split(':', 2)
75
-
76
- break unless data.length == 2
77
-
78
- stored_key = store_key(record, data[0])
79
- record[stored_key] = data[1]
80
- end
81
- end
82
-
83
- def store_key(record, component_key)
84
- # In case there is a conflict with the Marginalia key
85
- # (e.g. `correlation_id`), we use the base key
86
- # (`sql_correlation_id`) instead.
87
- if record.key?(component_key)
88
- "#{@key}_#{component_key}"
89
- else
90
- component_key
91
- end
92
- end
93
29
  end
94
30
  end
95
31
  end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative './polling_pg_input_plugin'
4
+ require 'pg_query'
5
+ require_relative './marginalia_extractor'
6
+ require 'time'
7
+
8
+ module Fluent::Plugin
9
+ # PgStatActivityInput polls the `pg_stat_activity` table
10
+ # emitting normalized versions of the queries currently running on
11
+ # the postgres server.
12
+ # Fingerprints of the queries are also included for easier aggregation
13
+ class PgStatActivityInput < PollingPostgresInputPlugin
14
+ include MarginaliaExtractor
15
+ Fluent::Plugin.register_input('pg_stat_activity', self)
16
+
17
+ ACTIVITY_QUERY = <<-SQL
18
+ SELECT
19
+ datid,
20
+ datname,
21
+ pid,
22
+ usesysid,
23
+ usename,
24
+ application_name,
25
+ host(client_addr) as client_addr,
26
+ client_hostname,
27
+ client_port,
28
+ xact_start,
29
+ extract(epoch from clock_timestamp() - xact_start) xact_age_s,
30
+ query_start,
31
+ extract(epoch from clock_timestamp() - query_start) query_age_s,
32
+ state_change,
33
+ extract(epoch from clock_timestamp() - state_change) state_age_s,
34
+ state,
35
+ query
36
+ FROM pg_stat_activity
37
+ WHERE usename IS NOT NULL
38
+ SQL
39
+
40
+ desc 'Name of field to store SQL query fingerprint'
41
+ config_param :fingerprint_key, :string, default: 'fingerprint'
42
+
43
+ protected
44
+
45
+ def on_poll
46
+ with_connection do |conn|
47
+ emit_activity_to_stream(conn)
48
+ end
49
+ end
50
+
51
+ public
52
+
53
+ # Query the database and emit statements to fluentd router
54
+ def emit_activity_to_stream(conn)
55
+ me = Fluent::MultiEventStream.new
56
+
57
+ now = Fluent::Engine.now
58
+ conn.exec(ACTIVITY_QUERY).each do |row|
59
+ record = record_for_row(row)
60
+ me.add(now, record)
61
+ end
62
+
63
+ @router.emit_stream(@tag, me)
64
+ end
65
+
66
+ # Returns a fluentd record for a query row
67
+ def record_for_row(row)
68
+ record = {
69
+ 'datid' => row['datid'],
70
+ 'datname' => row['datname'],
71
+ 'pid' => row['pid'],
72
+ 'usesysid' => row['usesysid'],
73
+ 'usename' => row['usename'],
74
+ 'application_name' => row['application_name'],
75
+ 'client_addr' => row['client_addr'],
76
+ 'client_hostname' => row['client_hostname'],
77
+ 'client_port' => row['client_port'],
78
+ 'xact_start' => row['xact_start']&.iso8601(3),
79
+ 'xact_age_s' => row['xact_age_s'],
80
+ 'query_start' => row['query_start']&.iso8601(3),
81
+ 'query_age_s' => row['query_age_s'],
82
+ 'state_change' => row['state_change']&.iso8601(3),
83
+ 'state_age_s' => row['state_age_s'],
84
+ 'state' => row['state'],
85
+ 'query' => row['query'] # This will be stripped, normalized etc
86
+ }
87
+
88
+ # Inject marginalia into record
89
+ parse_marginalia_into_record(record, 'query', true)
90
+
91
+ # Normalize query and fingerprint
92
+ # Note that `record['query']` was updated in the previous step
93
+ # to strip off marginalia comments
94
+ record.merge!(fingerprint_query(record['query']))
95
+
96
+ record
97
+ end
98
+
99
+ def fingerprint_query(query)
100
+ # We record the query_length as it will help in understanding whether unparseable
101
+ # queries are truncated.
102
+ record = { 'query_length' => query&.length, 'query' => nil }
103
+
104
+ return record unless query
105
+
106
+ normalized = PgQuery.normalize(query)
107
+ record['query'] = normalized
108
+
109
+ record[@fingerprint_key] = PgQuery.parse(normalized).fingerprint if @fingerprint_key
110
+
111
+ record
112
+ rescue PgQuery::ParseError
113
+ record['query_unparseable'] = true
114
+
115
+ record
116
+ end
117
+ end
118
+ end
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'fluent/plugin/input'
4
- require 'pg'
3
+ require_relative './polling_pg_input_plugin'
5
4
  require 'pg_query'
6
5
 
7
6
  module Fluent::Plugin
@@ -12,67 +11,47 @@ module Fluent::Plugin
12
11
  # 'fingerprint' => '8a6e9896bd9048a2',
13
12
  # 'query' => 'SELECT * FROM table ORDER BY queryid LIMIT $1',
14
13
  # 'query_length' => 58,
15
- # 'queryid' => 3239318621761098074
14
+ # 'queryid' => '3239318621761098074'
16
15
  # }
17
- class PgStatStatementsInput < Input
16
+ class PgStatStatementsInput < PollingPostgresInputPlugin
18
17
  Fluent::Plugin.register_input('pg_stat_statements', self)
19
18
 
20
- desc 'PostgreSQL host'
21
- config_param :host, :string
22
-
23
- desc 'RDBMS port (default: 5432)'
24
- config_param :port, :integer, default: 5432
25
-
26
- desc 'login user name'
27
- config_param :username, :string, default: nil
28
-
29
- desc 'postgres db'
30
- config_param :dbname, :string, default: nil
31
-
32
- desc 'login password'
33
- config_param :password, :string, default: nil, secret: true
34
-
35
- # See https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNECT-SSLMODE
36
- # for options
37
- desc 'postgres sslmode'
38
- config_param :sslmode, :string, default: 'prefer'
39
-
40
- desc 'tag'
41
- config_param :tag, :string, default: nil
42
-
43
- desc 'interval in second to run query'
44
- config_param :interval, :time, default: 300
45
-
46
19
  desc 'Name of field to store SQL query fingerprint'
47
20
  config_param :fingerprint_key, :string, default: 'fingerprint'
48
21
 
49
- def start
50
- @stop_flag = false
51
- @thread = Thread.new(&method(:thread_main))
22
+ POSTGRES_SERVER_VERSION_QUERY = "SELECT current_setting('server_version_num')"
23
+
24
+ PG12_STAT_STATEMENTS_QUERY = <<-SQL
25
+ SELECT queryid,
26
+ query,
27
+ calls,
28
+ rows,
29
+ total_time
30
+ FROM public.pg_stat_statements
31
+ SQL
32
+
33
+ PG13_STAT_STATEMENTS_QUERY = <<-SQL
34
+ SELECT queryid,
35
+ query,
36
+ calls,
37
+ rows,
38
+ (total_plan_time + total_exec_time) total_time
39
+ FROM public.pg_stat_statements
40
+ SQL
41
+
42
+ protected
43
+
44
+ def on_poll
45
+ with_connection do |conn|
46
+ emit_statements_to_stream(conn)
47
+ end
52
48
  end
53
49
 
54
- def shutdown
55
- @stop_flag = true
56
-
57
- # Interrupt thread and wait for it to finish
58
- Thread.new { @thread.run } if @thread
59
- @thread.join
60
- end
50
+ public
61
51
 
62
- def thread_main
63
- until @stop_flag
64
- sleep @interval
65
- break if @stop_flag
66
-
67
- begin
68
- with_connection do |conn|
69
- emit_statements_to_stream(conn)
70
- end
71
- rescue StandardError => e
72
- log.error 'unexpected error', error: e.message, error_class: e.class
73
- log.error_backtrace e.backtrace
74
- end
75
- end
52
+ def initialize
53
+ super
54
+ @postgres_server_version_num = nil
76
55
  end
77
56
 
78
57
  # Returns a fluentd record for a query row
@@ -81,7 +60,13 @@ module Fluent::Plugin
81
60
 
82
61
  # We record the query_length as it will help in understanding whether unparseable
83
62
  # queries are truncated.
84
- record = { 'queryid' => row['queryid'], 'query_length' => query&.length }
63
+ record = {
64
+ 'queryid' => row['queryid'].to_s,
65
+ 'query_length' => query&.length,
66
+ 'calls' => row['calls']&.to_i,
67
+ 'total_time_ms' => row['total_time']&.to_f,
68
+ 'rows' => row['rows']&.to_i
69
+ }
85
70
 
86
71
  return record unless query
87
72
 
@@ -97,14 +82,15 @@ module Fluent::Plugin
97
82
  record
98
83
  end
99
84
 
100
- private
101
-
102
85
  # Query the database and emit statements to fluentd router
103
86
  def emit_statements_to_stream(conn)
104
87
  me = Fluent::MultiEventStream.new
105
88
 
106
89
  now = Fluent::Engine.now
107
- conn.exec('SELECT queryid, query FROM public.pg_stat_statements').each do |row|
90
+
91
+ query = query_for_postgres_version(conn)
92
+
93
+ conn.exec(query).each do |row|
108
94
  record = record_for_row(row)
109
95
  me.add(now, record)
110
96
  end
@@ -112,25 +98,21 @@ module Fluent::Plugin
112
98
  @router.emit_stream(@tag, me)
113
99
  end
114
100
 
115
- # Since this query is very infrequent, and it may be communicating directly
116
- # with postgres without pgbouncer, don't use a persistent connection and
117
- # ensure that it is properly closed
118
- def with_connection(&block)
119
- conn = PG.connect(
120
- host: @host,
121
- dbname: @dbname,
122
- sslmode: @sslmode,
123
- user: @username,
124
- password: @password
125
- )
126
- conn.type_map_for_results = PG::BasicTypeMapForResults.new conn
127
-
128
- begin
129
- block.call(conn)
130
- ensure
131
- # Always close the connection
132
- conn.finish
133
- end
101
+ # Returns the PG_VERSION_NUM value from the database,
102
+ # memoizing the result after the first query
103
+ def postgres_server_version_num(conn)
104
+ return @postgres_server_version_num if @postgres_server_version_num
105
+
106
+ @postgres_server_version_num = conn.exec(POSTGRES_SERVER_VERSION_QUERY).getvalue(0,0).to_i
107
+ end
108
+
109
+ # pg_stat_statements columns changed in pg13, so we use different queries depending on the version
110
+ # https://www.postgresql.org/docs/12/pgstatstatements.html
111
+ # https://www.postgresql.org/docs/13/pgstatstatements.html
112
+ def query_for_postgres_version(conn)
113
+ return PG13_STAT_STATEMENTS_QUERY if postgres_server_version_num(conn) >= 13_00_00
114
+
115
+ PG12_STAT_STATEMENTS_QUERY
134
116
  end
135
117
  end
136
118
  end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fluent/plugin/filter'
4
+
5
+ module Fluent::Plugin
6
+ # MarginaliaExtractor provides the parse_marginalia_into_record
7
+ # utility method, useful for extracting marginalia into fluentd records
8
+ module MarginaliaExtractor
9
+ MARGINALIA_PREPENDED_REGEXP = %r{^(?<comment>/\*.*\*/)(?<sql>.*)}m.freeze
10
+ MARGINALIA_APPENDED_REGEXP = %r{(?<sql>.*)(?<comment>/\*.*\*/)$}m.freeze
11
+
12
+ # Injects marginalia into a fluentd record
13
+ def parse_marginalia_into_record(record, key, strip_comment)
14
+ sql = record[key]
15
+ return unless sql
16
+
17
+ comment_match = match_marginalia_comment(sql)
18
+
19
+ return unless comment_match
20
+
21
+ entries = extract_entries(comment_match['comment'])
22
+ parse_entries(entries, key, record)
23
+
24
+ record[key] = comment_match['sql'].strip if strip_comment
25
+ end
26
+
27
+ def match_marginalia_comment(sql)
28
+ matched = MARGINALIA_PREPENDED_REGEXP.match(sql)
29
+
30
+ return matched if matched
31
+
32
+ MARGINALIA_APPENDED_REGEXP.match(sql)
33
+ end
34
+
35
+ def extract_entries(comment)
36
+ comment = scrub_comment(comment)
37
+
38
+ return [] unless comment
39
+
40
+ comment.split(',')
41
+ end
42
+
43
+ def scrub_comment(comment)
44
+ return unless comment
45
+
46
+ comment.strip!
47
+ comment.gsub!(%r{^/\*}, '')
48
+ comment.gsub!(%r{\*/$}, '')
49
+ end
50
+
51
+ def parse_entries(entries, key, record)
52
+ entries.each do |component|
53
+ data = component.split(':', 2)
54
+
55
+ break unless data.length == 2
56
+
57
+ stored_key = store_key(record, key, data[0])
58
+ record[stored_key] = data[1]
59
+ end
60
+ end
61
+
62
+ def store_key(record, key, component_key)
63
+ # In case there is a conflict with the Marginalia key
64
+ # (e.g. `correlation_id`), we use the base key
65
+ # (`sql_correlation_id`) instead.
66
+ if record.key?(component_key)
67
+ "#{key}_#{component_key}"
68
+ else
69
+ component_key
70
+ end
71
+ end
72
+ end
73
+ end