waterdrop 1.4.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/.diffend.yml +3 -0
  5. data/.github/workflows/ci.yml +75 -0
  6. data/.gitignore +2 -0
  7. data/.ruby-version +1 -1
  8. data/CHANGELOG.md +13 -0
  9. data/Gemfile +9 -0
  10. data/Gemfile.lock +67 -54
  11. data/LICENSE +165 -0
  12. data/README.md +194 -56
  13. data/config/errors.yml +3 -16
  14. data/docker-compose.yml +17 -0
  15. data/lib/water_drop.rb +4 -24
  16. data/lib/water_drop/config.rb +41 -142
  17. data/lib/water_drop/contracts.rb +0 -2
  18. data/lib/water_drop/contracts/config.rb +8 -121
  19. data/lib/water_drop/contracts/message.rb +41 -0
  20. data/lib/water_drop/errors.rb +31 -5
  21. data/lib/water_drop/instrumentation.rb +7 -0
  22. data/lib/water_drop/instrumentation/monitor.rb +16 -23
  23. data/lib/water_drop/instrumentation/stdout_listener.rb +113 -32
  24. data/lib/water_drop/producer.rb +143 -0
  25. data/lib/water_drop/producer/async.rb +51 -0
  26. data/lib/water_drop/producer/buffer.rb +113 -0
  27. data/lib/water_drop/producer/builder.rb +63 -0
  28. data/lib/water_drop/producer/dummy_client.rb +32 -0
  29. data/lib/water_drop/producer/statistics_decorator.rb +71 -0
  30. data/lib/water_drop/producer/status.rb +52 -0
  31. data/lib/water_drop/producer/sync.rb +65 -0
  32. data/lib/water_drop/version.rb +1 -1
  33. data/waterdrop.gemspec +5 -5
  34. metadata +27 -26
  35. metadata.gz.sig +0 -0
  36. data/.travis.yml +0 -35
  37. data/MIT-LICENCE +0 -18
  38. data/lib/water_drop/async_producer.rb +0 -26
  39. data/lib/water_drop/base_producer.rb +0 -57
  40. data/lib/water_drop/config_applier.rb +0 -52
  41. data/lib/water_drop/contracts/message_options.rb +0 -19
  42. data/lib/water_drop/sync_producer.rb +0 -24
@@ -0,0 +1,143 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ # Main WaterDrop messages producer
5
class Producer
  include Sync
  include Async
  include Buffer

  # @return [String] uuid of the current producer
  attr_reader :id
  # @return [Status] producer status object
  attr_reader :status
  # @return [Concurrent::Array] internal messages buffer
  attr_reader :messages
  # @return [Object] monitor we want to use
  attr_reader :monitor
  # @return [Object] dry-configurable config object
  attr_reader :config

  # Creates a producer instance. When no block is given the producer stays not configured
  # and {#setup} needs to be invoked later on
  #
  # @param block [Proc] optional configuration block forwarded to {#setup}
  # @return [Producer] producer instance
  def initialize(&block)
    @buffer_mutex = Mutex.new
    @connecting_mutex = Mutex.new
    @closing_mutex = Mutex.new

    @status = Status.new
    @messages = Concurrent::Array.new

    return unless block

    setup(&block)
  end

  # Sets up the whole configuration and initializes all that is needed
  #
  # @param block [Proc] configuration block
  # @raise [Errors::ProducerAlreadyConfiguredError] when the producer is no longer in its
  #   initial state
  def setup(&block)
    raise Errors::ProducerAlreadyConfiguredError, id unless @status.initial?

    @config = Config
              .new
              .setup(&block)
              .config

    @id = @config.id
    @monitor = @config.monitor
    @contract = Contracts::Message.new(max_payload_size: @config.max_payload_size)
    @status.configured!
  end

  # @return [Rdkafka::Producer] raw rdkafka producer
  # @raise [Errors::ProducerNotConfiguredError] when the producer was not configured
  # @raise [Errors::ProducerUsedInParentProcess] when a producer that is already connected
  #   is used from a different process than the one that built the client
  # @note Client is lazy initialized, keeping in mind also the fact of a potential fork that
  #   can happen any time.
  # @note It is not recommended to fork a producer that is already in use so in case of
  #   bootstrapping a cluster, it's much better to fork configured but not used producers
  def client
    return @client if @client && @pid == Process.pid

    # Don't allow to obtain a client reference for a producer that was not configured
    raise Errors::ProducerNotConfiguredError, id if @status.initial?

    @connecting_mutex.synchronize do
      # Another thread could have built the client while we were waiting for the mutex
      return @client if @client && @pid == Process.pid

      # We should raise an error when trying to use a producer from a fork, that is already
      # connected to Kafka. We allow forking producers only before they are used
      raise Errors::ProducerUsedInParentProcess, Process.pid if @status.connected?

      # We undefine all the finalizers, in case it was a fork, so the finalizers from the parent
      # process don't leak
      ObjectSpace.undefine_finalizer(id)
      # Finalizer tracking is needed for handling shutdowns gracefully.
      # I don't expect everyone to remember about closing all the producers all the time, thus
      # this approach is better. Although it is still worth keeping in mind, that this will
      # block GC from removing a no longer used producer unless closed properly but at least
      # won't crash the VM upon closing the process
      ObjectSpace.define_finalizer(id, proc { close })

      @pid = Process.pid
      @client = Builder.new.call(self, @config)
      @status.connected!
    end

    @client
  end

  # Flushes the buffers and closes the producer. Does nothing when the producer is not active
  def close
    @closing_mutex.synchronize do
      return unless @status.active?

      @monitor.instrument(
        'producer.closed',
        producer: self
      ) do
        @status.closing!

        # No need for auto-gc if everything got closed by us
        # This should be used only in case a producer was not closed properly and forgotten
        ObjectSpace.undefine_finalizer(id)

        # Flush has its own buffer mutex but even if it is blocked, flushing can still happen
        # as we close the client after the flushing (even if blocked by the mutex)
        flush(false)

        # We should not close the client in several threads the same time
        # It is safe to run it several times but not exactly the same moment
        # We close the client only when it was built. Going through #client for a producer
        # that never connected would establish a new connection just to close it right away
        client.close if @client

        @status.closed!
      end
    end
  end

  # Ensures that we don't run any operations when the producer is not configured or when it
  # was already closed
  #
  # @raise [Errors::ProducerNotConfiguredError] when the producer was not configured
  # @raise [Errors::ProducerClosedError] when the producer is closing or closed
  # @raise [Errors::StatusInvalidError] when the status is in none of the expected states
  def ensure_active!
    return if @status.active?

    raise Errors::ProducerNotConfiguredError, id if @status.initial?
    raise Errors::ProducerClosedError, id if @status.closing? || @status.closed?

    # This should never happen
    raise Errors::StatusInvalidError, [id, @status.to_s]
  end

  # Ensures that the message we want to send out to Kafka is actually valid and that it can be
  # sent there
  #
  # @param message [Hash] message we want to send
  # @raise [Errors::MessageInvalidError] when the message does not comply with the contract
  def validate_message!(message)
    result = @contract.call(message)
    return if result.success?

    raise Errors::MessageInvalidError, [
      result.errors.to_h,
      message
    ]
  end
end
143
+ end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # Component for asynchronous producer operations
6
module Async
  # Hands a single message over to the client without waiting for the delivery result
  #
  # @param message [Hash] hash that complies with the {Contracts::Message} contract
  #
  # @return [Rdkafka::Producer::DeliveryHandle] delivery handle that might return the report
  #
  # @raise [Rdkafka::RdkafkaError] When adding the message to rdkafka's queue failed
  # @raise [Errors::MessageInvalidError] When provided message details are invalid and the
  #   message could not be sent to Kafka
  def produce_async(message)
    ensure_active!
    validate_message!(message)

    @monitor.instrument('message.produced_async', producer: self, message: message) do
      client.produce(**message)
    end
  end

  # Hands many messages over to the client without waiting for them to be delivered.
  # All the messages are validated before the first one gets dispatched
  #
  # @param messages [Array<Hash>] array with messages that comply with the
  #   {Contracts::Message} contract
  #
  # @return [Array<Rdkafka::Producer::DeliveryHandle>] deliveries handles
  #
  # @raise [Rdkafka::RdkafkaError] When adding the messages to rdkafka's queue failed
  # @raise [Errors::MessageInvalidError] When any of the provided messages details are invalid
  #   and the message could not be sent to Kafka
  def produce_many_async(messages)
    ensure_active!

    messages.each do |candidate|
      validate_message!(candidate)
    end

    @monitor.instrument('messages.produced_async', producer: self, messages: messages) do
      messages.map do |outgoing|
        client.produce(**outgoing)
      end
    end
  end
end
50
+ end
51
+ end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # Component for buffered operations
6
module Buffer
  # Exceptions we catch when dispatching messages from a buffer
  RESCUED_ERRORS = [
    Rdkafka::RdkafkaError,
    Rdkafka::Producer::DeliveryHandle::WaitTimeoutError
  ].freeze

  private_constant :RESCUED_ERRORS

  # Adds given message into the internal producer buffer without flushing it to Kafka
  #
  # @param message [Hash] hash that complies with the {Contracts::Message} contract
  # @raise [Errors::MessageInvalidError] When provided message details are invalid and the
  #   message could not be sent to Kafka
  def buffer(message)
    ensure_active!
    validate_message!(message)

    @monitor.instrument(
      'message.buffered',
      producer: self,
      message: message
    ) { @messages << message }
  end

  # Adds given messages into the internal producer buffer without flushing them to Kafka.
  # All the messages are validated before any of them is buffered
  #
  # @param messages [Array<Hash>] array with messages that comply with the
  #   {Contracts::Message} contract
  # @raise [Errors::MessageInvalidError] When any of the provided messages details are invalid
  #   and the message could not be sent to Kafka
  def buffer_many(messages)
    ensure_active!
    messages.each { |message| validate_message!(message) }

    @monitor.instrument(
      'messages.buffered',
      producer: self,
      messages: messages
    ) do
      messages.each { |message| @messages << message }
      messages
    end
  end

  # Flushes the internal buffer to Kafka in an async way
  # @return [Array<Rdkafka::Producer::DeliveryHandle>] delivery handles for messages that were
  #   flushed
  # @raise [Errors::FlushFailureError] when there was a failure in flushing
  def flush_async
    ensure_active!

    @monitor.instrument(
      'buffer.flushed_async',
      producer: self,
      messages: @messages
    ) { flush(false) }
  end

  # Flushes the internal buffer to Kafka in a sync way
  # @return [Array<Rdkafka::Producer::DeliveryReport>] delivery reports for messages that were
  #   flushed
  # @raise [Errors::FlushFailureError] when there was a failure in flushing
  def flush_sync
    ensure_active!

    @monitor.instrument(
      'buffer.flushed_sync',
      producer: self,
      messages: @messages
    ) { flush(true) }
  end

  private

  # Method for triggering the buffer
  # @param sync [Boolean] should it flush in a sync way
  # @return [Array<Rdkafka::Producer::DeliveryHandle, Rdkafka::Producer::DeliveryReport>]
  #   delivery handles for async or delivery reports for sync
  # @raise [Errors::FlushFailureError] when there was a failure in flushing
  # @note We use this method underneath to provide a different instrumentation for sync and
  #   async flushing within the public API
  def flush(sync)
    data_for_dispatch = nil
    dispatched = []

    # Swap the buffer under the mutex, so concurrent flushes never dispatch the same array
    @buffer_mutex.synchronize do
      data_for_dispatch = @messages
      @messages = Concurrent::Array.new
    end

    # Handles are appended one by one on purpose. Assigning the result of a #map would leave
    # `dispatched` empty when producing fails in the middle, so the error instrumentation and
    # the raised error would not know about the messages that were already dispatched
    data_for_dispatch.each { |message| dispatched << client.produce(**message) }

    return dispatched unless sync

    dispatched.map do |handler|
      handler.wait(
        max_wait_timeout: @config.max_wait_timeout,
        wait_timeout: @config.wait_timeout
      )
    end
  rescue *RESCUED_ERRORS => e
    # NOTE(review): the async key uses "flush" while the sync one and the 'buffer.flushed_async'
    # event use "flushed" - confirm against the events registered in the monitor
    key = sync ? 'buffer.flushed_sync.error' : 'buffer.flush_async.error'
    @monitor.instrument(key, producer: self, error: e, dispatched: dispatched)

    raise Errors::FlushFailureError.new(dispatched)
  end
end
112
+ end
113
+ end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # Class used to construct the rdkafka producer client
6
class Builder
  # Builds the client for the given producer
  #
  # @param producer [Producer] not yet configured producer for which we want to
  #   build the client
  # @param config [Object] dry-configurable based configuration object
  # @return [Rdkafka::Producer, Producer::DummyClient] raw rdkafka producer or a dummy producer
  #   when we don't want to dispatch any messages
  def call(producer, config)
    return DummyClient.new unless config.deliver

    # Both the logger and the statistics callback are assigned on Rdkafka::Config itself
    Rdkafka::Config.logger = config.logger
    Rdkafka::Config.statistics_callback = build_statistics_callback(producer, config.monitor)

    Rdkafka::Config.new(config.kafka.to_h).producer.tap do |rdkafka_producer|
      rdkafka_producer.delivery_callback = build_delivery_callback(producer, config.monitor)
    end
  end

  private

  # Builds the callable that instruments 'message.acknowledged' with the offset and the
  # partition taken from the delivery report it receives
  #
  # @param producer [Producer]
  # @param monitor [Object] monitor we want to use
  # @return [Proc] delivery callback
  def build_delivery_callback(producer, monitor)
    lambda { |report|
      monitor.instrument(
        'message.acknowledged',
        producer: producer,
        offset: report.offset,
        partition: report.partition
      )
    }
  end

  # Builds the callable that instruments 'statistics.emitted' with the decorated statistics
  #
  # @param producer [Producer]
  # @param monitor [Object] monitor we want to use
  # @return [Proc] statistics callback
  # @note We decorate the statistics with our own decorator because some of the metrics from
  #   rdkafka are absolute. For example number of sent messages increases not in reference to
  #   previous statistics emit but from the beginning of the process. We decorate it with diff
  #   of all the numeric values against the data from the previous callback emit
  def build_statistics_callback(producer, monitor)
    # One decorator per callback, so it can compare against the previously emitted data
    decorator = StatisticsDecorator.new

    lambda { |raw_statistics|
      monitor.instrument(
        'statistics.emitted',
        producer: producer,
        statistics: decorator.call(raw_statistics)
      )
    }
  end
end
62
+ end
63
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # A dummy client that is supposed to be used instead of Rdkafka::Producer in case we don't
6
+ # want to dispatch anything to Kafka
7
class DummyClient
  # @return [DummyClient] dummy instance
  def initialize
    # Starts below zero, so the first report built by #wait gets 0
    @counter = -1
  end

  # Builds a delivery report with an increasing number, without waiting for anything
  # @param _args [Object] anything that the delivery handle accepts
  # @return [::Rdkafka::Producer::DeliveryReport]
  def wait(*_args)
    @counter += 1

    ::Rdkafka::Producer::DeliveryReport.new(0, @counter)
  end

  # Claims to respond to any method, in line with #method_missing
  # @param _args [Object] anything really, this dummy is suppose to support anything
  def respond_to_missing?(*_args)
    true
  end

  # Accepts any call that is not defined explicitly
  # @param _args [Object] anything really, this dummy is suppose to support anything
  # @return [self] returns self for chaining cases
  def method_missing(*_args)
    self
  end
end
31
+ end
32
+ end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # Many of the librdkafka statistics are absolute values instead of a gauge.
6
+ # This means, that for example number of messages sent is an absolute growing value
7
+ # instead of being a value of messages sent from the last statistics report.
8
+ # This decorator calculates the diff against previously emitted stats, so we also get
9
+ # the diff together with the original values
10
class StatisticsDecorator
  def initialize
    # Nothing was emitted yet, so the first call has no previous values to compare against
    @previous = {}.freeze
  end

  # @param emited_stats [Hash] original emitted statistics
  # @return [Hash] emitted statistics extended with the diff data. It is the same, frozen
  #   object that was provided
  # @note We modify the emitted statistics, instead of creating new. Since we don't expose
  #   any API to get raw data, users can just assume that the result of this decoration is the
  #   proper raw stats that they can use
  def call(emited_stats)
    diff(
      @previous,
      emited_stats
    )

    @previous = emited_stats

    emited_stats.freeze
  end

  private

  # Calculates the diff of the provided values and modifies in place the emitted statistics
  #
  # @param previous [Object] previous value from the given scope in which
  #   we are
  # @param current [Object] current scope from emitted statistics
  # @return [Object] the diff if the values were numerics or the current scope
  def diff(previous, current)
    if current.is_a?(Hash)
      # The previous value of a scope may be missing or may not be a hash (for example when a
      # scalar became a hash between emits). In such case there is nothing to compare against
      # and indexing a non-hash could raise or return unrelated data
      previous_scope = previous.is_a?(Hash) ? previous : {}

      # @note We cannot use #each_key as we modify the content of the current scope
      #   in place (in case it's a hash)
      current.keys.each do |key|
        append(
          current,
          key,
          diff(previous_scope[key], current[key])
        )
      end
    end

    if current.is_a?(Numeric) && previous.is_a?(Numeric)
      current - previous
    else
      # Without a numeric previous value a numeric current value is returned as is
      current
    end
  end

  # Appends the result of the diff to a given key as long as the result is numeric
  #
  # @param current [Hash] current scope
  # @param key [Object] key based on which we were diffing. The diff is always stored under
  #   a String key built from it with the "_d" postfix
  # @param result [Object] diff result
  def append(current, key, result)
    return unless result.is_a?(Numeric)

    current["#{key}_d"] = result
  end
end
70
+ end
71
+ end