fluent-plugin-gcloud-pubsub-custom-compress-batches 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler"
4
+ Bundler::GemHelper.install_tasks
5
+
6
+ require "rake/testtask"
7
+
8
# `rake test` runs every plugin test file under test/plugin, with both lib/
# and test/ appended to the load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push("lib", "test")
  t.test_files = FileList["test/plugin/test_*.rb"]
  t.verbose = true
end

# Running bare `rake` builds the gem rather than running the tests.
task default: :build
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ $LOAD_PATH.push File.expand_path("lib", __dir__)
4
+
5
# Gem specification for the Fluentd Pub/Sub plugin with batch compression.
Gem::Specification.new do |spec|
  spec.name        = "fluent-plugin-gcloud-pubsub-custom-compress-batches"
  spec.version     = "1.3.4"
  spec.authors     = ["Calvin Aditya"]
  spec.email       = "calvin.aditya95@gmail.com"
  spec.license     = "MIT"
  spec.homepage    = "https://github.com/calvinaditya95/fluent-plugin-gcloud-pubsub-custom"
  spec.summary     = "Google Cloud Pub/Sub input/output plugin for Fluentd event collector - with payload compression"
  spec.description = "Google Cloud Pub/Sub input/output plugin for Fluentd event collector - with payload compression. Forked from https://github.com/gocardless/fluent-plugin-gcloud-pubsub-custom"

  # Packaged files are whatever git tracks at build time.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map(&File.method(:basename))
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency("fluentd", ">= 0.14.15", "< 2")
  spec.add_runtime_dependency("google-cloud-pubsub", "~> 0.30.0")
  # Use the same version constraint as fluent-plugin-prometheus currently specifies
  spec.add_runtime_dependency("prometheus-client", "< 0.10")

  # Development-only tooling; only rubocop is pinned to a version range.
  [
    ["bundler"],
    ["pry"],
    ["pry-byebug"],
    ["rake"],
    ["rubocop", "~>0.83"],
    ["test-unit"],
    ["test-unit-rr"],
  ].each do |name, *requirements|
    spec.add_development_dependency(name, *requirements)
  end
end
@@ -0,0 +1,173 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "google/cloud/pubsub"
4
+ require "zlib"
5
+
6
+ module Fluent
7
+ module GcloudPubSub
8
# Base class for the errors raised by this Pub/Sub client code.
class Error < StandardError; end

# Wraps Google API failures that the callers treat as worth retrying.
class RetryableError < Error; end

# Value stored in the "compression_algorithm" message attribute for batches
# compressed with zlib.
COMPRESSION_ALGORITHM_ZLIB = "zlib"

# Delimiter placed between records packed into one compressed message:
# the ASCII record separator character (code 30, 0x1E).
BATCHED_RECORD_SEPARATOR = 0x1E.chr
16
+
17
# A single outgoing Pub/Sub message: a payload plus optional attributes.
class Message
  attr_reader :message, :attributes

  # @param message [String] the message payload
  # @param attributes [Hash] attribute names mapped to attribute values
  def initialize(message, attributes = {})
    @message = message
    @attributes = attributes
  end

  # Size in bytes of the payload plus every attribute key and value.
  #
  # Keys and values are stringified before being measured, so symbol keys
  # and non-String values are counted instead of raising NoMethodError.
  # A nil attributes hash is counted as having no attributes.
  #
  # @return [Integer]
  def bytesize
    attribute_bytes = (@attributes || {}).sum do |key, value|
      key.to_s.bytesize + value.to_s.bytesize
    end
    @message.bytesize + attribute_bytes
  end
end
33
+
34
# Publishes batches of messages to Pub/Sub topics, either as one Pub/Sub
# message per record or packed into a single zlib-compressed message, and
# records Prometheus metrics about the compression.
class Publisher
  # @param project [String] GCP project id handed to the Pub/Sub client
  # @param key [String, nil] credentials handed to the Pub/Sub client
  # @param autocreate_topic [Boolean] create topics that do not exist yet
  # @param metric_prefix [String] prefix for the Prometheus metric names
  def initialize(project, key, autocreate_topic, metric_prefix)
    @pubsub = Google::Cloud::Pubsub.new(project_id: project, credentials: key)
    @autocreate_topic = autocreate_topic
    @topics = {}

    # rubocop:disable Layout/LineLength
    # We expect compression for even a single message to be typically
    # above 2x (0.5/50%), so bias the buckets towards the higher end
    # of the range.
    @compression_ratio = compression_histogram(
      :"#{metric_prefix}_messages_compressed_size_per_original_size_ratio",
      "Compression ratio achieved on a batch of messages",
      [0, 0.25, 0.5, 0.75, 0.85, 0.9, 0.95, 0.975, 1],
    )
    @compression_duration = compression_histogram(
      :"#{metric_prefix}_messages_compression_duration_seconds",
      "Time taken to compress a batch of messages",
      [0, 0.0001, 0.0005, 0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1],
    )
    # rubocop:enable Layout/LineLength
  end

  # Looks up a topic client, caching it by name after the first lookup.
  # The topic is created when missing only if autocreation was enabled.
  #
  # @raise [Error] when the topic does not exist and was not created
  def topic(topic_name)
    @topics.fetch(topic_name) do
      found = @pubsub.topic(topic_name)
      found = @pubsub.create_topic(topic_name) if found.nil? && @autocreate_topic
      raise Error, "topic:#{topic_name} does not exist." if found.nil?

      @topics[topic_name] = found
    end
  end

  # Publishes +messages+ to the named topic. With +compress_batches+ the
  # whole batch is sent as one compressed message; otherwise every message
  # is published individually within one batch call.
  #
  # @raise [RetryableError] for unavailable / deadline / internal API errors
  def publish(topic_name, messages, compress_batches = false)
    destination = topic(topic_name)

    if compress_batches
      destination.publish(*compress_messages_with_zlib(messages, topic_name))
    else
      destination.publish do |batch|
        messages.each { |m| batch.publish m.message, m.attributes }
      end
    end
  rescue Google::Cloud::UnavailableError, Google::Cloud::DeadlineExceededError, Google::Cloud::InternalError => e
    raise RetryableError, "Google api returns error:#{e.class} message:#{e}"
  end

  private

  # Registers a label-less histogram, or reuses the one already registered
  # under the same name.
  def compression_histogram(name, docstring, buckets)
    Fluent::GcloudPubSub::Metrics.register_or_existing(name) do
      ::Prometheus::Client.registry.histogram(name, docstring, {}, buckets)
    end
  end

  # Labels attached to both compression metrics; a new hash on every call.
  def compression_labels(topic_name)
    { topic: topic_name, algorithm: COMPRESSION_ALGORITHM_ZLIB }
  end

  # Joins the payloads of +messages+ with the record separator, deflates
  # the result, and returns [compressed_data, attributes] ready to be
  # splatted into the topic's publish call.
  def compress_messages_with_zlib(messages, topic_name)
    original_size = messages.sum(&:bytesize)
    # This should never happen, only a programming error or major
    # misconfiguration should lead to this situation. But checking against
    # it here avoids a potential division by zero later on.
    raise ArgumentError, "not compressing empty inputs" if original_size.zero?

    # Only the payloads are packed; the 'attributes' field of each message
    # is implicitly dropped. This is fine, because the :attribute_keys
    # config param is not supported when in compressed mode, so this field
    # will always be empty.
    packed = messages.map(&:message).join(BATCHED_RECORD_SEPARATOR)

    duration, compressed = Fluent::GcloudPubSub::Metrics.measure_duration do
      Zlib::Deflate.deflate(packed)
    end
    @compression_duration.observe(compression_labels(topic_name), duration)

    # If original = 1MiB and compressed = 256KiB; then metric value = 0.75 = 75% when plotted
    saved_fraction = 1 - compressed.bytesize.to_f / original_size
    @compression_ratio.observe(compression_labels(topic_name), saved_fraction)

    [compressed, { compression_algorithm: COMPRESSION_ALGORITHM_ZLIB }]
  end
end
126
+
127
# Pulls and acknowledges messages on a single Pub/Sub subscription.
class Subscriber
  # Resolves the subscription, either directly on the project or through
  # the named topic when +topic_name+ is given.
  #
  # @param project [String] GCP project id handed to the Pub/Sub client
  # @param key [String, nil] credentials handed to the Pub/Sub client
  # @param topic_name [String, nil] topic owning the subscription, if any
  # @param subscription_name [String] subscription to pull from
  # @raise [Error] when the topic or the subscription does not exist
  def initialize(project, key, topic_name, subscription_name)
    pubsub = Google::Cloud::Pubsub.new project_id: project, credentials: key
    @client =
      if topic_name.nil?
        pubsub.subscription subscription_name
      else
        topic = pubsub.topic topic_name
        # A missing topic comes back as nil; report it with the plugin's own
        # error instead of failing with NoMethodError on the next call.
        raise Error, "topic:#{topic_name} does not exist." if topic.nil?

        topic.subscription subscription_name
      end
    raise Error, "subscription:#{subscription_name} does not exist." if @client.nil?
  end

  # Pulls up to +max+ messages from the subscription.
  #
  # @raise [RetryableError] for unavailable / deadline / internal API errors
  def pull(immediate, max)
    with_retryable_errors("pull") { @client.pull immediate: immediate, max: max }
  end

  # Acknowledges the given received messages.
  #
  # @raise [RetryableError] for unavailable / deadline / internal API errors
  def acknowledge(messages)
    with_retryable_errors("acknowledge") { @client.acknowledge messages }
  end

  private

  # Runs the block, converting the Google API errors considered transient
  # into RetryableError; +api+ names the failing call in the message.
  def with_retryable_errors(api)
    yield
  rescue Google::Cloud::UnavailableError, Google::Cloud::DeadlineExceededError, Google::Cloud::InternalError => e
    raise RetryableError, "Google #{api} api returns error:#{e.class} message:#{e}"
  end
end
151
+
152
# Turns a received Pub/Sub message back into [line, attributes] pairs.
class MessageUnpacker
  # @param message a received message exposing #attributes and #message.data
  # @return [Array<Array>] one [line, attributes] pair per record
  # @raise [ArgumentError] when the compression algorithm is not recognised
  def self.unpack(message)
    algorithm = message.attributes["compression_algorithm"]

    # No algorithm attribute: the payload is a single uncompressed line,
    # returned together with the message's own attributes.
    return [[message.message.data.chomp, message.attributes]] if algorithm.nil?

    unless COMPRESSION_ALGORITHM_ZLIB == algorithm
      raise ArgumentError, "unknown compression algorithm: '#{algorithm}'"
    end

    # A zlib batch: inflate it, split it on the record separator, and give
    # every resulting line an empty attribute hash.
    inflated = Zlib::Inflate.inflate(message.message.data)
    inflated.split(BATCHED_RECORD_SEPARATOR).map { |line| [line, {}] }
  end
end
172
+ end
173
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Fluent
  module GcloudPubSub
    # Utilities for interacting with Prometheus metrics
    module Metrics
      # Returns the metric already registered under +metric_name+ in the
      # default Prometheus registry; when there is none, yields and returns
      # the block's value (the block is expected to register the metric).
      def self.register_or_existing(metric_name)
        registry = ::Prometheus::Client.registry
        return registry.get(metric_name) if registry.exist?(metric_name)

        yield
      end

      # Time the elapsed execution of the provided block, return the duration
      # as the first element followed by the result of the block.
      #
      # An Array result is spread into the returned array, so callers can
      # destructure several values after the duration. Any other result,
      # including nil or a Hash, is returned untouched as the second element
      # rather than being dropped or converted by a splat.
      #
      # @return [Array] [duration_in_seconds, result] or
      #   [duration_in_seconds, *result] when the result is an Array
      def self.measure_duration
        start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        result = yield
        elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start

        result.is_a?(Array) ? [elapsed, *result] : [elapsed, result]
      end
    end
  end
end
@@ -0,0 +1,303 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "webrick"
5
+
6
+ require "fluent/plugin/input"
7
+ require "fluent/plugin/parser"
8
+
9
+ require "fluent/plugin/gcloud_pubsub/client"
10
+ require "fluent/plugin/gcloud_pubsub/metrics"
11
+
12
+ require "prometheus/client"
13
+
14
+ module Fluent::Plugin
15
+ class GcloudPubSubInput < Input
16
+ Fluent::Plugin.register_input("gcloud_pubsub", self)
17
+
18
+ helpers :compat_parameters, :parser, :thread
19
+
20
+ DEFAULT_PARSER_TYPE = "json"
21
+
22
+ class FailedParseError < StandardError
23
+ end
24
+
25
+ desc "Set tag of messages."
26
+ config_param :tag, :string
27
+ desc "Set key to be used as tag."
28
+ config_param :tag_key, :string, default: nil
29
+ desc "Set your GCP project."
30
+ config_param :project, :string, default: nil
31
+ desc "Set your credential file path."
32
+ config_param :key, :string, default: nil
33
+ desc "Set topic name to pull."
34
+ config_param :topic, :string, default: nil
35
+ desc "Set subscription name to pull."
36
+ config_param :subscription, :string
37
+ desc "Pulling messages by intervals of specified seconds."
38
+ config_param :pull_interval, :float, default: 5.0
39
+ desc "Max messages pulling at once."
40
+ config_param :max_messages, :integer, default: 100
41
+ desc "Setting `true`, keepalive connection to wait for new messages."
42
+ config_param :return_immediately, :bool, default: true
43
+ desc "Set number of threads to pull messages."
44
+ config_param :pull_threads, :integer, default: 1
45
+ desc "Acquire these fields from attributes on the Pub/Sub message and merge them into the record"
46
+ config_param :attribute_keys, :array, default: []
47
+ desc "Set error type when parsing messages fails."
48
+ config_param :parse_error_action, :enum, default: :exception, list: %i[exception warning]
49
+ desc "The prefix for Prometheus metric names"
50
+ config_param :metric_prefix, :string, default: "fluentd_input_gcloud_pubsub"
51
+ # for HTTP RPC
52
+ desc "If `true` is specified, HTTP RPC to stop or start pulling message is enabled."
53
+ config_param :enable_rpc, :bool, default: false
54
+ desc "Bind IP address for HTTP RPC."
55
+ config_param :rpc_bind, :string, default: "0.0.0.0"
56
+ desc "Port for HTTP RPC."
57
+ config_param :rpc_port, :integer, default: 24_680
58
+
59
+ config_section :parse do
60
+ config_set_default :@type, DEFAULT_PARSER_TYPE
61
+ end
62
+
63
+ class RPCServlet < WEBrick::HTTPServlet::AbstractServlet
64
+ class Error < StandardError; end
65
+
66
# Passes both arguments on to the servlet superclass and keeps the plugin
# so that requests can be forwarded to it.
def initialize(server, plugin)
  super(server, plugin)
  @plugin = plugin
end
70
+
71
+ # rubocop:disable Naming/MethodName
72
# Handles a GET request: delegates to #process and copies the resulting
# status code, headers and body onto the response. Any StandardError raised
# while processing is reported as a 500 JSON document carrying the error
# message and backtrace.
def do_GET(req, res)
  begin
    code, header, body = process(req, res)
  rescue StandardError => e
    # Bind the exception explicitly rather than reading $ERROR_INFO, which
    # is only an alias for $! once the English library has been loaded and
    # is nil otherwise.
    code, header, body = render_json(500, {
      "ok" => false,
      "message" => "Internal Server Error",
      "error" => e.to_s,
      "backtrace" => e.backtrace,
    })
  end

  res.status = code
  header.each_pair do |k, v|
    res[k] = v
  end
  res.body = body
end
90
+ # rubocop:enable Naming/MethodName
91
+
92
# Builds the [status code, headers, body] triple for a JSON response, with
# +obj+ serialised through #to_json.
def render_json(code, obj)
  json_headers = { "Content-Type" => "application/json" }
  [code, json_headers, obj.to_json]
end
95
+
96
# Dispatches on the request's path_info: "/stop" and "/start" toggle
# pulling on the plugin, "/status" adds the plugin's pull status to the
# reply. Any other path raises Error.
#
# @return [Array] the [200, headers, body] triple from #render_json
def process(req, _res)
  reply = { "ok" => true }

  case req.path_info
  when "/stop" then @plugin.stop_pull
  when "/start" then @plugin.start_pull
  when "/status" then reply["status"] = @plugin.status_of_pull
  else raise Error, "Invalid path_info: #{req.path_info}"
  end

  render_json(200, reply)
end
110
+ end
111
+
112
+ # rubocop:disable Metrics/MethodLength
113
# Applies the configuration: resets the RPC and pull state, chooses how
# tags are extracted, creates the parser, and registers (or reuses) the
# Prometheus metrics named after the configured metric prefix.
def configure(conf)
  compat_parameters_convert(conf, :parser)
  super
  @rpc_srv = nil
  @rpc_thread = nil
  @stop_pull = false

  # Without a tag_key every record gets the configured tag; with one, the
  # tag is taken from the record.
  @extract_tag = method(@tag_key.nil? ? :static_tag : :dynamic_tag)

  @parser = parser_create

  pulled_name = :"#{@metric_prefix}_messages_pulled"
  @messages_pulled = Fluent::GcloudPubSub::Metrics.register_or_existing(pulled_name) do
    ::Prometheus::Client.registry.histogram(
      pulled_name,
      "Number of Pub/Sub messages pulled by the subscriber on each invocation",
      {},
      [0, 1, 10, 50, 100, 250, 500, 1000],
    )
  end

  pulled_bytes_name = :"#{@metric_prefix}_messages_pulled_bytes"
  @messages_pulled_bytes = Fluent::GcloudPubSub::Metrics.register_or_existing(pulled_bytes_name) do
    ::Prometheus::Client.registry.histogram(
      pulled_bytes_name,
      "Total size in bytes of the Pub/Sub messages pulled by the subscriber on each invocation",
      {},
      [100, 1000, 10_000, 100_000, 1_000_000, 5_000_000, 10_000_000],
    )
  end

  errors_name = :"#{@metric_prefix}_pull_errors_total"
  @pull_errors = Fluent::GcloudPubSub::Metrics.register_or_existing(errors_name) do
    ::Prometheus::Client.registry.counter(
      errors_name,
      "Errors encountered while pulling or processing messages",
      {},
    )
  end
end
157
+ # rubocop:enable Metrics/MethodLength
158
+
159
# Starts the plugin: optionally brings up the HTTP RPC server, connects the
# subscriber, and spawns one subscribe thread per configured pull thread.
def start
  super
  start_rpc if @enable_rpc

  @subscriber = Fluent::GcloudPubSub::Subscriber.new(@project, @key, @topic, @subscription)
  log.debug("connected subscription:#{@subscription} in project #{@project}")

  @emit_guard = Mutex.new
  @stop_subscribing = false
  @subscribe_threads = []
  @pull_threads.times do |idx|
    thread_name = :"in_gcloud_pubsub_subscribe_#{idx}"
    @subscribe_threads << thread_create(thread_name, &method(:subscribe))
  end
end
173
+
174
# Stops the RPC server if one is running, tells the subscribe loops to
# finish, waits for their threads, then continues the superclass shutdown.
def shutdown
  @rpc_srv&.shutdown
  @rpc_srv = nil
  @rpc_thread = nil

  @stop_subscribing = true
  # The thread list is only assigned in #start; tolerate it being nil so a
  # shutdown after an incomplete start does not raise NoMethodError.
  Array(@subscribe_threads).each(&:join)
  super
end
184
+
185
# Sets the flag that makes the subscribe loop skip pulling, and logs it.
def stop_pull
  @stop_pull = true
  log.info("stop pull from subscription:#{@subscription}")
end
189
+
190
# Clears the flag that makes the subscribe loop skip pulling, and logs it.
def start_pull
  @stop_pull = false
  log.info("start pull from subscription:#{@subscription}")
end
194
+
195
# Reports whether pulling is currently paused.
#
# @return [String] "stopped" when the stop flag is set, "started" otherwise
def status_of_pull
  return "stopped" if @stop_pull

  "started"
end
198
+
199
+ private
200
+
201
# Tag extractor that ignores the record and always answers with the
# configured tag.
def static_tag(_record)
  @tag
end
204
+
205
# Tag extractor that removes the tag_key field from the record and uses its
# value as the tag, falling back to the configured tag when the field is
# missing (or its value is nil/false).
def dynamic_tag(record)
  extracted = record.delete(@tag_key)
  extracted || @tag
end
208
+
209
# Creates the WEBrick server for the HTTP RPC endpoint, mounts the servlet
# under /api/in_gcloud_pubsub/pull/, and runs the server in its own thread.
def start_rpc
  log.info("listening http rpc server on http://#{@rpc_bind}:#{@rpc_port}/")

  # Only FATAL messages are written to STDERR; access logging is off.
  server_options = {
    BindAddress: @rpc_bind,
    Port: @rpc_port,
    Logger: WEBrick::Log.new(STDERR, WEBrick::Log::FATAL),
    AccessLog: [],
  }
  @rpc_srv = WEBrick::HTTPServer.new(server_options)
  @rpc_srv.mount("/api/in_gcloud_pubsub/pull/", RPCServlet, self)
  @rpc_thread = thread_create(:in_gcloud_pubsub_rpc_thread) { @rpc_srv.start }
end
224
+
225
# Body of each subscribe thread: keeps pulling until shutdown is requested.
# While pulling is paused the pull is skipped. The loop sleeps for the pull
# interval between iterations when pulls return immediately or pulling is
# paused. An error escaping the loop is logged and ends the method.
def subscribe
  until @stop_subscribing
    _subscribe unless @stop_pull

    next unless @return_immediately || @stop_pull

    sleep @pull_interval
  end
rescue StandardError => e
  log.error("unexpected error", error_message: e.to_s, error_class: e.class.to_s)
  log.error_backtrace(e.backtrace)
end
235
+
236
# One pull cycle: pulls a batch, records how many messages and bytes were
# pulled, processes the batch, and acknowledges it. Errors are counted in
# the pull error metric and logged rather than re-raised.
def _subscribe
  pulled = @subscriber.pull(@return_immediately, @max_messages)
  @messages_pulled.observe(common_labels, pulled.size)
  if pulled.empty?
    log.debug "no messages are pulled"
    return
  end

  # Payload size plus the size of every attribute key and value.
  total_bytes = pulled.sum do |msg|
    attribute_bytes = msg.attributes.sum { |name, value| name.bytesize + value.bytesize }
    msg.data.bytesize + attribute_bytes
  end
  @messages_pulled_bytes.observe(common_labels, total_bytes)

  # Acknowledge only after processing has finished without raising.
  process(pulled)
  @subscriber.acknowledge(pulled)

  log.debug "#{pulled.length} message(s) processed"
rescue Fluent::GcloudPubSub::RetryableError => e
  @pull_errors.increment(common_labels.merge(retryable: true))
  log.warn("Retryable error occurs. Fluentd will retry.", error_message: e.to_s, error_class: e.class.to_s)
rescue StandardError => e
  @pull_errors.increment(common_labels.merge(retryable: false))
  log.error("unexpected error", error_message: e.to_s, error_class: e.class.to_s)
  log.error_backtrace(e.backtrace)
end
261
+
262
# Unpacks every pulled message into lines, parses each line, groups the
# resulting events into one event stream per tag, and emits the streams.
# A line that does not parse either raises FailedParseError or is logged as
# a warning, depending on parse_error_action.
def process(messages)
  event_streams = Hash.new { |streams, tag| streams[tag] = Fluent::MultiEventStream.new }

  messages.each do |m|
    Fluent::GcloudPubSub::MessageUnpacker.unpack(m).each do |line, attributes|
      @parser.parse(line) do |time, record|
        if time && record
          # Copy the configured attribute keys from the message attributes
          # onto the record before routing it.
          @attribute_keys.each { |key| record[key] = attributes[key] }
          event_streams[@extract_tag.call(record)].add(time, record)
        elsif @parse_error_action == :exception
          raise FailedParseError, "pattern not match: #{line}"
        else
          log.warn "pattern not match", record: line
        end
      end
    end
  end

  event_streams.each do |tag, es|
    # There are some output plugins not to supposed to be called with multi-threading.
    # Maybe remove in the future.
    @emit_guard.synchronize { router.emit_stream(tag, es) }
  end
end
298
+
299
# Labels attached to every metric observation made by this plugin: the
# subscription being pulled from.
def common_labels
  { subscription: @subscription }
end
302
+ end
303
+ end