fluent-plugin-postgresql-csvlog 0.3.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5d61fc31718e43c6d1dff46139a1b03d56384405d45b69c9772dc6f7b6a66dbf
4
- data.tar.gz: 5d2d23a4a7b5f277b19f181aa515ab74718f6ef93824e2c958264d9bbdf9c9aa
3
+ metadata.gz: 38090772bf1e0302ecd53d759d8da4ffb2df0cdf0a6f001ec0e0d12390472276
4
+ data.tar.gz: 22bc8479b81f0b0fd7615cfb6cc1b491b317fc74d612f21373243a516e671be4
5
5
  SHA512:
6
- metadata.gz: f2106f60749b6fa8fc931ccd3d85f51595b0ec60eb9f425935072b1749ed068e8a068c6d513f1557a20e8c5a0613acad80c6690c7ce2aa20cca3027d686c388a
7
- data.tar.gz: 9ceef623cbd5256e047dea817d487d2711cc8b387ff068a10913100fe649e77174a600ac019b4729403390152c1a3e6f162accf4cc50f020561032b6493d965f
6
+ metadata.gz: c3ecfef7d3290ab00a71613c7914c0aae1d05501d37574614d9d6f4a2f5de9548474ecb7b6fdad4ef778c4ea48510a1e31a89294e317ecf7ab6004e094579cc9
7
+ data.tar.gz: 2b91a1704d53f07ab5139bc2d623c0c17afde53b416ef5f018dd7e6d345ce8a8da1d0aa8964bcef638b8e6ab351b2d7c16b83be73cbdcfba560dda69962bdd17
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ Gemfile.lock
data/.gitlab-ci.yml CHANGED
@@ -10,10 +10,9 @@ test:
10
10
  paths:
11
11
  - vendor/ruby
12
12
 
13
- # integration tests
14
- itest:
13
+ .iteration_test:
15
14
  services:
16
- - name: postgres:12
15
+ - name: postgres:$POSTGRES_SERVER_VERSION
17
16
  alias: postgres
18
17
  command: ["postgres", "-c", "shared_preload_libraries=pg_stat_statements", "-c", "pg_stat_statements.track=all"]
19
18
  variables:
@@ -27,3 +26,28 @@ itest:
27
26
  cache:
28
27
  paths:
29
28
  - vendor/ruby
29
+
30
+ # integration tests for postgres 12
31
+ itest_pg12:
32
+ extends: .iteration_test
33
+ variables:
34
+ POSTGRES_SERVER_VERSION: 12
35
+
36
+ # integration tests for postgres 13
37
+ itest_pg13:
38
+ extends: .iteration_test
39
+ variables:
40
+ POSTGRES_SERVER_VERSION: 13
41
+
42
+ end_to_end_verification_test:
43
+ image: docker:19.03.12
44
+ services:
45
+ - docker:19.03.12-dind
46
+ tags:
47
+ - gitlab-org-docker
48
+ variables:
49
+ DOCKER_TLS_CERTDIR: ""
50
+ before_script:
51
+ - apk add --no-cache docker-compose
52
+ script:
53
+ - docker-compose run --rm verifier
data/README.md CHANGED
@@ -7,6 +7,8 @@ parse PostgreSQL CSV log files and extract slow log information:
7
7
  - `PostgreSQLSlowLog`: Extracts slow log entries into `duration_s` and `statement` fields
8
8
  - `PostgreSQLRedactor`: Normalizes the SQL query and redacts sensitive information
9
9
  - `Marginalia`: Parses [Marginalia comments](https://github.com/basecamp/marginalia) into key-value pairs and stores them
10
+ - `PgStatStatementsInput`: polls the [`pg_stat_statements`](https://www.postgresql.org/docs/current/pgstatstatements.html) postgres plugin and emits fluentd events.
11
+ - `PgStatActivityInput`: polls the [`postgres activity monitor`](https://www.postgresql.org/docs/current/monitoring-stats.html) and emits fluentd events.
10
12
 
11
13
  ## Installation
12
14
 
@@ -72,3 +74,18 @@ ingest and parse PostgreSQL CSV logs:
72
74
  </format>
73
75
  </match>
74
76
  ```
77
+
78
+ ## Developing `fluent-plugin-postgresql-csvlog`
79
+
80
+ To develop and debug locally, there is a `Dockerfile` and `docker-compose.yml` that will setup a local environment,
81
+ complete with Postgres, suitable for testing purposes.
82
+
83
+ 1. `docker compose build` - build the current configuration
84
+ 1. `docker compose run --rm verifier` - test the current configuration
85
+ 1. `docker compose up`
86
+
87
+ ### Releasing a new version
88
+
89
+ 1. Update the version in `fluent-plugin-postgresql-csvlog.gemspec`.
90
+ 1. Create a merge request and merge the changes to `master`.
91
+ 1. Run `bundle exec rake release`.
data/docker-compose.yml CHANGED
@@ -1,14 +1,9 @@
1
1
  # Docker Compose setup useful for testing and development purposes
2
- version: "3.9"
2
+ version: "3.3"
3
3
  services:
4
- fluentd:
5
- build: .
6
- links:
7
- - postgres
8
- entrypoint: /usr/bin/fluentd -vvv -c /src/example-fluentd.conf
9
4
  postgres:
10
- image: postgres
11
- restart: always
5
+ image: postgres:13
6
+ restart: "no"
12
7
  environment:
13
8
  - POSTGRES_USER=testuser
14
9
  - POSTGRES_PASSWORD=testpass
@@ -17,3 +12,26 @@ services:
17
12
  command: postgres -c shared_preload_libraries=pg_stat_statements -c pg_stat_statements.track=all
18
13
  volumes:
19
14
  - ./sql/create_extension.sql:/docker-entrypoint-initdb.d/create_extension.sql
15
+
16
+ fluentd:
17
+ build: .
18
+ restart: "no"
19
+ links:
20
+ - postgres
21
+ entrypoint: /usr/bin/fluentd -vvv -c /src/example-fluentd.conf
22
+ volumes:
23
+ - ./example-fluentd.conf:/src/example-fluentd.conf
24
+ - log-volume:/var/log/pg/
25
+
26
+ verifier:
27
+ image: alpine:3.13
28
+ restart: "no"
29
+ links:
30
+ - fluentd
31
+ command: /bin/sh /src/verify-docker-compose.sh
32
+ volumes:
33
+ - ./test/verify-docker-compose.sh:/src/verify-docker-compose.sh
34
+ - log-volume:/var/log/pg/
35
+
36
+ volumes:
37
+ log-volume:
data/example-fluentd.conf CHANGED
@@ -8,5 +8,34 @@
8
8
  </source>
9
9
 
10
10
  <match postgres.pg_stat_statements>
11
- @type stdout
11
+ @type file
12
+ path /var/log/pg/pg_stat_statements
13
+ time_slice_format %Y%m%d%H%M%S
14
+ flush_interval 1s
15
+ utc
16
+
17
+ <format>
18
+ @type json
19
+ </format>
12
20
  </match>
21
+
22
+ <source>
23
+ @type pg_stat_activity
24
+ tag postgres.pg_stat_activity
25
+ host postgres
26
+ username testuser
27
+ password testpass
28
+ interval 1
29
+ </source>
30
+
31
+ <match postgres.pg_stat_activity>
32
+ @type file
33
+ path /var/log/pg/pg_stat_activity
34
+ time_slice_format %Y%m%d%H%M%S
35
+ flush_interval 1s
36
+ utc
37
+ <format>
38
+ @type json
39
+ </format>
40
+ </match>
41
+
@@ -2,7 +2,7 @@ $:.push File.expand_path('lib', __dir__)
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'fluent-plugin-postgresql-csvlog'
5
- s.version = '0.3.2'
5
+ s.version = '0.7.0'
6
6
  s.authors = ['stanhu']
7
7
  s.email = ['stanhu@gmail.com']
8
8
  s.homepage = 'https://gitlab.com/gitlab-org/fluent-plugins/fluent-plugin-postgresql-csvlog'
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'fluent/plugin/filter'
4
+ require_relative './marginalia_extractor'
4
5
 
5
6
  module Fluent
6
7
  module Plugin
@@ -11,6 +12,7 @@ module Fluent
11
12
  # /*application:sidekiq,correlation_id:d67cae54c169e0cab7d73389e2934f0e,jid:52a1c8a9e4c555ea573f20f0,job_class:Geo::MetricsUpdateWorker*/ SELECT COUNT(*) FROM "projects"
12
13
  #
13
14
  class Marginalia < Filter
15
+ include MarginaliaExtractor
14
16
  Fluent::Plugin.register_filter('marginalia', self)
15
17
 
16
18
  desc 'Field to parse for Marginalia comments (key1:value1,key2:value2)'
@@ -19,77 +21,11 @@ module Fluent
19
21
  desc 'Whether to strip the comment from the record specified by key'
20
22
  config_param :strip_comment, :bool, default: true
21
23
 
22
- MARGINALIA_PREPENDED_REGEXP = %r{^(?<comment>/\*.*\*/)(?<sql>.*)}m.freeze
23
- MARGINALIA_APPENDED_REGEXP = %r{(?<sql>.*)(?<comment>/\*.*\*/)$}m.freeze
24
-
25
24
  def filter(_tag, _time, record)
26
- parse_comments(record)
25
+ parse_marginalia_into_record(record, @key, @strip_comment)
27
26
 
28
27
  record
29
28
  end
30
-
31
- private
32
-
33
- def parse_comments(record)
34
- sql = record[@key]
35
-
36
- return unless sql
37
-
38
- comment_match = match_marginalia_comment(sql)
39
-
40
- return unless comment_match
41
-
42
- entries = extract_entries(comment_match['comment'])
43
- parse_entries(entries, record)
44
-
45
- record[@key] = comment_match['sql'].strip if @strip_comment
46
- end
47
-
48
- def match_marginalia_comment(sql)
49
- matched = MARGINALIA_PREPENDED_REGEXP.match(sql)
50
-
51
- return matched if matched
52
-
53
- MARGINALIA_APPENDED_REGEXP.match(sql)
54
- end
55
-
56
- def extract_entries(comment)
57
- comment = scrub_comment(comment)
58
-
59
- return [] unless comment
60
-
61
- comment.split(',')
62
- end
63
-
64
- def scrub_comment(comment)
65
- return unless comment
66
-
67
- comment.strip!
68
- comment.gsub!(%r{^/\*}, '')
69
- comment.gsub!(%r{\*/$}, '')
70
- end
71
-
72
- def parse_entries(entries, record)
73
- entries.each do |component|
74
- data = component.split(':', 2)
75
-
76
- break unless data.length == 2
77
-
78
- stored_key = store_key(record, data[0])
79
- record[stored_key] = data[1]
80
- end
81
- end
82
-
83
- def store_key(record, component_key)
84
- # In case there is a conflict with the Marginalia key
85
- # (e.g. `correlation_id`), we use the base key
86
- # (`sql_correlation_id`) instead.
87
- if record.key?(component_key)
88
- "#{@key}_#{component_key}"
89
- else
90
- component_key
91
- end
92
- end
93
29
  end
94
30
  end
95
31
  end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative './polling_pg_input_plugin'
4
+ require 'pg_query'
5
+ require_relative './marginalia_extractor'
6
+ require 'time'
7
+
8
+ module Fluent::Plugin
9
+ # PgStatActivityInput polls the `pg_stat_activity` table
10
+ # emitting normalized versions of the queries currently running on
11
+ # the postgres server.
12
+ # Fingerprints of the queries are also included for easier aggregation
13
+ class PgStatActivityInput < PollingPostgresInputPlugin
14
+ include MarginaliaExtractor
15
+ Fluent::Plugin.register_input('pg_stat_activity', self)
16
+
17
+ ACTIVITY_QUERY = <<-SQL
18
+ SELECT
19
+ datid,
20
+ datname,
21
+ pid,
22
+ usesysid,
23
+ usename,
24
+ application_name,
25
+ host(client_addr) as client_addr,
26
+ client_hostname,
27
+ client_port,
28
+ xact_start,
29
+ extract(epoch from clock_timestamp() - xact_start) xact_age_s,
30
+ query_start,
31
+ extract(epoch from clock_timestamp() - query_start) query_age_s,
32
+ state_change,
33
+ extract(epoch from clock_timestamp() - state_change) state_age_s,
34
+ state,
35
+ query
36
+ FROM pg_stat_activity
37
+ WHERE usename IS NOT NULL
38
+ SQL
39
+
40
+ desc 'Name of field to store SQL query fingerprint'
41
+ config_param :fingerprint_key, :string, default: 'fingerprint'
42
+
43
+ protected
44
+
45
+ def on_poll
46
+ with_connection do |conn|
47
+ emit_activity_to_stream(conn)
48
+ end
49
+ end
50
+
51
+ public
52
+
53
+ # Query the database and emit statements to fluentd router
54
+ def emit_activity_to_stream(conn)
55
+ me = Fluent::MultiEventStream.new
56
+
57
+ now = Fluent::Engine.now
58
+ conn.exec(ACTIVITY_QUERY).each do |row|
59
+ record = record_for_row(row)
60
+ me.add(now, record)
61
+ end
62
+
63
+ @router.emit_stream(@tag, me)
64
+ end
65
+
66
+ # Returns a fluentd record for a query row
67
+ def record_for_row(row)
68
+ record = {
69
+ 'datid' => row['datid'],
70
+ 'datname' => row['datname'],
71
+ 'pid' => row['pid'],
72
+ 'usesysid' => row['usesysid'],
73
+ 'usename' => row['usename'],
74
+ 'application_name' => row['application_name'],
75
+ 'client_addr' => row['client_addr'],
76
+ 'client_hostname' => row['client_hostname'],
77
+ 'client_port' => row['client_port'],
78
+ 'xact_start' => row['xact_start']&.iso8601(3),
79
+ 'xact_age_s' => row['xact_age_s'],
80
+ 'query_start' => row['query_start']&.iso8601(3),
81
+ 'query_age_s' => row['query_age_s'],
82
+ 'state_change' => row['state_change']&.iso8601(3),
83
+ 'state_age_s' => row['state_age_s'],
84
+ 'state' => row['state'],
85
+ 'query' => row['query'] # This will be stripped, normalized etc
86
+ }
87
+
88
+ # Inject marginalia into record
89
+ parse_marginalia_into_record(record, 'query', true)
90
+
91
+ # Normalize query and fingerprint
92
+ # Note that `record['query']` was updated in previous step
93
+ # To strip off marginalia comments
94
+ record.merge!(fingerprint_query(record['query']))
95
+
96
+ record
97
+ end
98
+
99
+ def fingerprint_query(query)
100
+ # We record the query_length as it will help in understanding whether unparseable
101
+ # queries are truncated.
102
+ record = { 'query_length' => query&.length, 'query' => nil }
103
+
104
+ return record unless query
105
+
106
+ normalized = PgQuery.normalize(query)
107
+ record['query'] = normalized
108
+
109
+ record[@fingerprint_key] = PgQuery.parse(normalized).fingerprint if @fingerprint_key
110
+
111
+ record
112
+ rescue PgQuery::ParseError
113
+ record['query_unparseable'] = true
114
+
115
+ record
116
+ end
117
+ end
118
+ end
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'fluent/plugin/input'
4
- require 'pg'
3
+ require_relative './polling_pg_input_plugin'
5
4
  require 'pg_query'
6
5
 
7
6
  module Fluent::Plugin
@@ -12,67 +11,47 @@ module Fluent::Plugin
12
11
  # 'fingerprint' => '8a6e9896bd9048a2',
13
12
  # 'query' => 'SELECT * FROM table ORDER BY queryid LIMIT $1',
14
13
  # 'query_length' => 58,
15
- # 'queryid' => 3239318621761098074
14
+ # 'queryid' => '3239318621761098074'
16
15
  # }
17
- class PgStatStatementsInput < Input
16
+ class PgStatStatementsInput < PollingPostgresInputPlugin
18
17
  Fluent::Plugin.register_input('pg_stat_statements', self)
19
18
 
20
- desc 'PostgreSQL host'
21
- config_param :host, :string
22
-
23
- desc 'RDBMS port (default: 5432)'
24
- config_param :port, :integer, default: 5432
25
-
26
- desc 'login user name'
27
- config_param :username, :string, default: nil
28
-
29
- desc 'postgres db'
30
- config_param :dbname, :string, default: nil
31
-
32
- desc 'login password'
33
- config_param :password, :string, default: nil, secret: true
34
-
35
- # See https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNECT-SSLMODE
36
- # for options
37
- desc 'postgres sslmode'
38
- config_param :sslmode, :string, default: 'prefer'
39
-
40
- desc 'tag'
41
- config_param :tag, :string, default: nil
42
-
43
- desc 'interval in second to run query'
44
- config_param :interval, :time, default: 300
45
-
46
19
  desc 'Name of field to store SQL query fingerprint'
47
20
  config_param :fingerprint_key, :string, default: 'fingerprint'
48
21
 
49
- def start
50
- @stop_flag = false
51
- @thread = Thread.new(&method(:thread_main))
22
+ POSTGRES_SERVER_VERSION_QUERY = "SELECT current_setting('server_version_num')"
23
+
24
+ PG12_STAT_STATEMENTS_QUERY = <<-SQL
25
+ SELECT queryid,
26
+ query,
27
+ calls,
28
+ rows,
29
+ total_time
30
+ FROM public.pg_stat_statements
31
+ SQL
32
+
33
+ PG13_STAT_STATEMENTS_QUERY = <<-SQL
34
+ SELECT queryid,
35
+ query,
36
+ calls,
37
+ rows,
38
+ (total_plan_time + total_exec_time) total_time
39
+ FROM public.pg_stat_statements
40
+ SQL
41
+
42
+ protected
43
+
44
+ def on_poll
45
+ with_connection do |conn|
46
+ emit_statements_to_stream(conn)
47
+ end
52
48
  end
53
49
 
54
- def shutdown
55
- @stop_flag = true
56
-
57
- # Interrupt thread and wait for it to finish
58
- Thread.new { @thread.run } if @thread
59
- @thread.join
60
- end
50
+ public
61
51
 
62
- def thread_main
63
- until @stop_flag
64
- sleep @interval
65
- break if @stop_flag
66
-
67
- begin
68
- with_connection do |conn|
69
- emit_statements_to_stream(conn)
70
- end
71
- rescue StandardError => e
72
- log.error 'unexpected error', error: e.message, error_class: e.class
73
- log.error_backtrace e.backtrace
74
- end
75
- end
52
+ def initialize
53
+ super
54
+ @postgres_server_version_num = nil
76
55
  end
77
56
 
78
57
  # Returns a fluentd record for a query row
@@ -81,7 +60,13 @@ module Fluent::Plugin
81
60
 
82
61
  # We record the query_length as it will help in understanding whether unparseable
83
62
  # queries are truncated.
84
- record = { 'queryid' => row['queryid'], 'query_length' => query&.length }
63
+ record = {
64
+ 'queryid' => row['queryid'].to_s,
65
+ 'query_length' => query&.length,
66
+ 'calls' => row['calls']&.to_i,
67
+ 'total_time_ms' => row['total_time']&.to_f,
68
+ 'rows' => row['rows']&.to_i
69
+ }
85
70
 
86
71
  return record unless query
87
72
 
@@ -97,14 +82,15 @@ module Fluent::Plugin
97
82
  record
98
83
  end
99
84
 
100
- private
101
-
102
85
  # Query the database and emit statements to fluentd router
103
86
  def emit_statements_to_stream(conn)
104
87
  me = Fluent::MultiEventStream.new
105
88
 
106
89
  now = Fluent::Engine.now
107
- conn.exec('SELECT queryid, query FROM public.pg_stat_statements').each do |row|
90
+
91
+ query = query_for_postgres_version(conn)
92
+
93
+ conn.exec(query).each do |row|
108
94
  record = record_for_row(row)
109
95
  me.add(now, record)
110
96
  end
@@ -112,25 +98,21 @@ module Fluent::Plugin
112
98
  @router.emit_stream(@tag, me)
113
99
  end
114
100
 
115
- # Since this query is very infrequent, and it may be communicating directly
116
- # with postgres without pgbouncer, don't use a persistent connection and
117
- # ensure that it is properly closed
118
- def with_connection(&block)
119
- conn = PG.connect(
120
- host: @host,
121
- dbname: @dbname,
122
- sslmode: @sslmode,
123
- user: @username,
124
- password: @password
125
- )
126
- conn.type_map_for_results = PG::BasicTypeMapForResults.new conn
127
-
128
- begin
129
- block.call(conn)
130
- ensure
131
- # Always close the connection
132
- conn.finish
133
- end
101
+ # Returns the PG_VERSION_NUM value from the database
102
+ # will memoize the result
103
+ def postgres_server_version_num(conn)
104
+ return @postgres_server_version_num if @postgres_server_version_num
105
+
106
+ @postgres_server_version_num = conn.exec(POSTGRES_SERVER_VERSION_QUERY).getvalue(0,0).to_i
107
+ end
108
+
109
+ # pg_stat_statements columns changed in pg13, so we use different queries depending on the version
110
+ # https://www.postgresql.org/docs/12/pgstatstatements.html
111
+ # https://www.postgresql.org/docs/13/pgstatstatements.html
112
+ def query_for_postgres_version(conn)
113
+ return PG13_STAT_STATEMENTS_QUERY if postgres_server_version_num(conn) >= 13_00_00
114
+
115
+ PG12_STAT_STATEMENTS_QUERY
134
116
  end
135
117
  end
136
118
  end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fluent/plugin/filter'
4
+
5
+ module Fluent::Plugin
6
+ # MarginaliaExtractor provides the parse_marginalia_into_record
7
+ # utility method, useful for extracting marginalia into fluentd records
8
+ module MarginaliaExtractor
9
+ MARGINALIA_PREPENDED_REGEXP = %r{^(?<comment>/\*.*\*/)(?<sql>.*)}m.freeze
10
+ MARGINALIA_APPENDED_REGEXP = %r{(?<sql>.*)(?<comment>/\*.*\*/)$}m.freeze
11
+
12
+ # Injects marginalia into a fluentd record
13
+ def parse_marginalia_into_record(record, key, strip_comment)
14
+ sql = record[key]
15
+ return unless sql
16
+
17
+ comment_match = match_marginalia_comment(sql)
18
+
19
+ return unless comment_match
20
+
21
+ entries = extract_entries(comment_match['comment'])
22
+ parse_entries(entries, key, record)
23
+
24
+ record[key] = comment_match['sql'].strip if strip_comment
25
+ end
26
+
27
+ def match_marginalia_comment(sql)
28
+ matched = MARGINALIA_PREPENDED_REGEXP.match(sql)
29
+
30
+ return matched if matched
31
+
32
+ MARGINALIA_APPENDED_REGEXP.match(sql)
33
+ end
34
+
35
+ def extract_entries(comment)
36
+ comment = scrub_comment(comment)
37
+
38
+ return [] unless comment
39
+
40
+ comment.split(',')
41
+ end
42
+
43
+ def scrub_comment(comment)
44
+ return unless comment
45
+
46
+ comment.strip!
47
+ comment.gsub!(%r{^/\*}, '')
48
+ comment.gsub!(%r{\*/$}, '')
49
+ end
50
+
51
+ def parse_entries(entries, key, record)
52
+ entries.each do |component|
53
+ data = component.split(':', 2)
54
+
55
+ break unless data.length == 2
56
+
57
+ stored_key = store_key(record, key, data[0])
58
+ record[stored_key] = data[1]
59
+ end
60
+ end
61
+
62
+ def store_key(record, key, component_key)
63
+ # In case there is a conflict with the Marginalia key
64
+ # (e.g. `correlation_id`), we use the base key
65
+ # (`sql_correlation_id`) instead.
66
+ if record.key?(component_key)
67
+ "#{key}_#{component_key}"
68
+ else
69
+ component_key
70
+ end
71
+ end
72
+ end
73
+ end