waterdrop 1.4.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data.tar.gz.sig +0 -0
  4. data/.diffend.yml +3 -0
  5. data/.github/workflows/ci.yml +75 -0
  6. data/.gitignore +2 -0
  7. data/.ruby-version +1 -1
  8. data/CHANGELOG.md +13 -0
  9. data/Gemfile +9 -0
  10. data/Gemfile.lock +67 -54
  11. data/LICENSE +165 -0
  12. data/README.md +194 -56
  13. data/config/errors.yml +3 -16
  14. data/docker-compose.yml +17 -0
  15. data/lib/water_drop.rb +4 -24
  16. data/lib/water_drop/config.rb +41 -142
  17. data/lib/water_drop/contracts.rb +0 -2
  18. data/lib/water_drop/contracts/config.rb +8 -121
  19. data/lib/water_drop/contracts/message.rb +41 -0
  20. data/lib/water_drop/errors.rb +31 -5
  21. data/lib/water_drop/instrumentation.rb +7 -0
  22. data/lib/water_drop/instrumentation/monitor.rb +16 -23
  23. data/lib/water_drop/instrumentation/stdout_listener.rb +113 -32
  24. data/lib/water_drop/producer.rb +143 -0
  25. data/lib/water_drop/producer/async.rb +51 -0
  26. data/lib/water_drop/producer/buffer.rb +113 -0
  27. data/lib/water_drop/producer/builder.rb +63 -0
  28. data/lib/water_drop/producer/dummy_client.rb +32 -0
  29. data/lib/water_drop/producer/statistics_decorator.rb +71 -0
  30. data/lib/water_drop/producer/status.rb +52 -0
  31. data/lib/water_drop/producer/sync.rb +65 -0
  32. data/lib/water_drop/version.rb +1 -1
  33. data/waterdrop.gemspec +5 -5
  34. metadata +27 -26
  35. metadata.gz.sig +0 -0
  36. data/.travis.yml +0 -35
  37. data/MIT-LICENCE +0 -18
  38. data/lib/water_drop/async_producer.rb +0 -26
  39. data/lib/water_drop/base_producer.rb +0 -57
  40. data/lib/water_drop/config_applier.rb +0 -52
  41. data/lib/water_drop/contracts/message_options.rb +0 -19
  42. data/lib/water_drop/sync_producer.rb +0 -24
@@ -0,0 +1,143 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ # Main WaterDrop messages producer
5
+ class Producer
6
+ include Sync
7
+ include Async
8
+ include Buffer
9
+
10
+ # @return [String] uuid of the current producer
11
+ attr_reader :id
12
+ # @return [Status] producer status object
13
+ attr_reader :status
14
+ # @return [Concurrent::Array] internal messages buffer
15
+ attr_reader :messages
16
+ # @return [Object] monitor we want to use
17
+ attr_reader :monitor
18
+ # @return [Object] dry-configurable config object
19
+ attr_reader :config
20
+
21
+ # Creates a not-yet-configured instance of the producer
22
+ # @param block [Proc] configuration block
23
+ # @return [Producer] producer instance
24
+ def initialize(&block)
25
+ @buffer_mutex = Mutex.new
26
+ @connecting_mutex = Mutex.new
27
+ @closing_mutex = Mutex.new
28
+
29
+ @status = Status.new
30
+ @messages = Concurrent::Array.new
31
+
32
+ return unless block
33
+
34
+ setup(&block)
35
+ end
36
+
37
+ # Sets up the whole configuration and initializes all that is needed
38
+ # @param block [Block] configuration block
39
+ def setup(&block)
40
+ raise Errors::ProducerAlreadyConfiguredError, id unless @status.initial?
41
+
42
+ @config = Config
43
+ .new
44
+ .setup(&block)
45
+ .config
46
+
47
+ @id = @config.id
48
+ @monitor = @config.monitor
49
+ @contract = Contracts::Message.new(max_payload_size: @config.max_payload_size)
50
+ @status.configured!
51
+ end
52
+
53
+ # @return [Rdkafka::Producer] raw rdkafka producer
54
+ # @note Client is lazy initialized, keeping in mind also the fact of a potential fork that
55
+ # can happen any time.
56
+ # @note It is not recommended to fork a producer that is already in use so in case of
57
+ # bootstrapping a cluster, it's much better to fork configured but not used producers
58
+ def client
59
+ return @client if @client && @pid == Process.pid
60
+
61
+ # Don't allow to obtain a client reference for a producer that was not configured
62
+ raise Errors::ProducerNotConfiguredError, id if @status.initial?
63
+
64
+ @connecting_mutex.synchronize do
65
+ return @client if @client && @pid == Process.pid
66
+
67
+ # We should raise an error when trying to use a producer from a fork, that is already
68
+ # connected to Kafka. We allow forking producers only before they are used
69
+ raise Errors::ProducerUsedInParentProcess, Process.pid if @status.connected?
70
+
71
+ # We undefine all the finalizers, in case it was a fork, so the finalizers from the parent
72
+ # process don't leak
73
+ ObjectSpace.undefine_finalizer(id)
74
+ # Finalizer tracking is needed for handling shutdowns gracefully.
75
+ # I don't expect everyone to remember about closing all the producers all the time, thus
76
+ # this approach is better. Although it is still worth keeping in mind, that this will
77
+ # block GC from removing a no longer used producer unless closed properly but at least
78
+ # won't crash the VM upon closing the process
79
+ ObjectSpace.define_finalizer(id, proc { close })
80
+
81
+ @pid = Process.pid
82
+ @client = Builder.new.call(self, @config)
83
+ @status.connected!
84
+ end
85
+
86
+ @client
87
+ end
88
+
89
+ # Flushes the buffers in a sync way and closes the producer
90
+ def close
91
+ @closing_mutex.synchronize do
92
+ return unless @status.active?
93
+
94
+ @monitor.instrument(
95
+ 'producer.closed',
96
+ producer: self
97
+ ) do
98
+ @status.closing!
99
+
100
+ # No need for auto-gc if everything got closed by us
101
+ # This should be used only in case a producer was not closed properly and forgotten
102
+ ObjectSpace.undefine_finalizer(id)
103
+
104
+ # Flush has it's own buffer mutex but even if it is blocked, flushing can still happen
105
+ # as we close the client after the flushing (even if blocked by the mutex)
106
+ flush(false)
107
+
108
+ # We should not close the client in several threads the same time
109
+ # It is safe to run it several times but not exactly the same moment
110
+ client.close
111
+
112
+ @status.closed!
113
+ end
114
+ end
115
+ end
116
+
117
+ # Ensures that we don't run any operations when the producer is not configured or when it
118
+ # was already closed
119
+ def ensure_active!
120
+ return if @status.active?
121
+
122
+ raise Errors::ProducerNotConfiguredError, id if @status.initial?
123
+ raise Errors::ProducerClosedError, id if @status.closing? || @status.closed?
124
+
125
+ # This should never happen
126
+ raise Errors::StatusInvalidError, [id, @status.to_s]
127
+ end
128
+
129
+ # Ensures that the message we want to send out to Kafka is actually valid and that it can be
130
+ # sent there
131
+ # @param message [Hash] message we want to send
132
+ # @raise [Karafka::Errors::MessageInvalidError]
133
+ def validate_message!(message)
134
+ result = @contract.call(message)
135
+ return if result.success?
136
+
137
+ raise Errors::MessageInvalidError, [
138
+ result.errors.to_h,
139
+ message
140
+ ]
141
+ end
142
+ end
143
+ end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # Component for asynchronous producer operations
6
+ module Async
7
+ # Produces a message to Kafka and does not wait for results
8
+ #
9
+ # @param message [Hash] hash that complies with the {Contracts::Message} contract
10
+ #
11
+ # @return [Rdkafka::Producer::DeliveryHandle] delivery handle that might return the report
12
+ #
13
+ # @raise [Rdkafka::RdkafkaError] When adding the message to rdkafka's queue failed
14
+ # @raise [Errors::MessageInvalidError] When provided message details are invalid and the
15
+ # message could not be sent to Kafka
16
+ def produce_async(message)
17
+ ensure_active!
18
+ validate_message!(message)
19
+
20
+ @monitor.instrument(
21
+ 'message.produced_async',
22
+ producer: self,
23
+ message: message
24
+ ) { client.produce(**message) }
25
+ end
26
+
27
+ # Produces many messages to Kafka and does not wait for them to be delivered
28
+ #
29
+ # @param messages [Array<Hash>] array with messages that comply with the
30
+ # {Contracts::Message} contract
31
+ #
32
+ # @return [Array<Rdkafka::Producer::DeliveryHandle>] deliveries handles
33
+ #
34
+ # @raise [Rdkafka::RdkafkaError] When adding the messages to rdkafka's queue failed
35
+ # @raise [Errors::MessageInvalidError] When any of the provided messages details are invalid
36
+ # and the message could not be sent to Kafka
37
+ def produce_many_async(messages)
38
+ ensure_active!
39
+ messages.each { |message| validate_message!(message) }
40
+
41
+ @monitor.instrument(
42
+ 'messages.produced_async',
43
+ producer: self,
44
+ messages: messages
45
+ ) do
46
+ messages.map { |message| client.produce(**message) }
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # Component for buffered operations
6
+ module Buffer
7
+ # Exceptions we catch when dispatching messages from a buffer
8
+ RESCUED_ERRORS = [
9
+ Rdkafka::RdkafkaError,
10
+ Rdkafka::Producer::DeliveryHandle::WaitTimeoutError
11
+ ].freeze
12
+
13
+ private_constant :RESCUED_ERRORS
14
+
15
+ # Adds given message into the internal producer buffer without flushing it to Kafka
16
+ #
17
+ # @param message [Hash] hash that complies with the {Contracts::Message} contract
18
+ # @raise [Errors::MessageInvalidError] When provided message details are invalid and the
19
+ # message could not be sent to Kafka
20
+ def buffer(message)
21
+ ensure_active!
22
+ validate_message!(message)
23
+
24
+ @monitor.instrument(
25
+ 'message.buffered',
26
+ producer: self,
27
+ message: message
28
+ ) { @messages << message }
29
+ end
30
+
31
+ # Adds given messages into the internal producer buffer without flushing them to Kafka
32
+ #
33
+ # @param messages [Array<Hash>] array with messages that comply with the
34
+ # {Contracts::Message} contract
35
+ # @raise [Errors::MessageInvalidError] When any of the provided messages details are invalid
36
+ # and the message could not be sent to Kafka
37
+ def buffer_many(messages)
38
+ ensure_active!
39
+ messages.each { |message| validate_message!(message) }
40
+
41
+ @monitor.instrument(
42
+ 'messages.buffered',
43
+ producer: self,
44
+ messages: messages
45
+ ) do
46
+ messages.each { |message| @messages << message }
47
+ messages
48
+ end
49
+ end
50
+
51
+ # Flushes the internal buffer to Kafka in an async way
52
+ # @return [Array<Rdkafka::Producer::DeliveryHandle>] delivery handles for messages that were
53
+ # flushed
54
+ def flush_async
55
+ ensure_active!
56
+
57
+ @monitor.instrument(
58
+ 'buffer.flushed_async',
59
+ producer: self,
60
+ messages: @messages
61
+ ) { flush(false) }
62
+ end
63
+
64
+ # Flushes the internal buffer to Kafka in a sync way
65
+ # @return [Array<Rdkafka::Producer::DeliveryReport>] delivery reports for messages that were
66
+ # flushed
67
+ def flush_sync
68
+ ensure_active!
69
+
70
+ @monitor.instrument(
71
+ 'buffer.flushed_sync',
72
+ producer: self,
73
+ messages: @messages
74
+ ) { flush(true) }
75
+ end
76
+
77
+ private
78
+
79
+ # Method for triggering the buffer
80
+ # @param sync [Boolean] should it flush in a sync way
81
+ # @return [Array<Rdkafka::Producer::DeliveryHandle, Rdkafka::Producer::DeliveryReport>]
82
+ # delivery handles for async or delivery reports for sync
83
+ # @raise [Errors::FlushFailureError] when there was a failure in flushing
84
+ # @note We use this method underneath to provide a different instrumentation for sync and
85
+ # async flushing within the public API
86
+ def flush(sync)
87
+ data_for_dispatch = nil
88
+ dispatched = []
89
+
90
+ @buffer_mutex.synchronize do
91
+ data_for_dispatch = @messages
92
+ @messages = Concurrent::Array.new
93
+ end
94
+
95
+ dispatched = data_for_dispatch.map { |message| client.produce(**message) }
96
+
97
+ return dispatched unless sync
98
+
99
+ dispatched.map do |handler|
100
+ handler.wait(
101
+ max_wait_timeout: @config.max_wait_timeout,
102
+ wait_timeout: @config.wait_timeout
103
+ )
104
+ end
105
+ rescue *RESCUED_ERRORS => e
106
+ key = sync ? 'buffer.flushed_sync.error' : 'buffer.flush_async.error'
107
+ @monitor.instrument(key, producer: self, error: e, dispatched: dispatched)
108
+
109
+ raise Errors::FlushFailureError.new(dispatched)
110
+ end
111
+ end
112
+ end
113
+ end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # Class used to construct the rdkafka producer client
6
+ class Builder
7
+ # @param producer [Producer] not yet configured producer for which we want to
8
+ # build the client
9
+ # @param config [Object] dry-configurable based configuration object
10
+ # @return [Rdkafka::Producer, Producer::DummyClient] raw rdkafka producer or a dummy producer
11
+ # when we don't want to dispatch any messages
12
+ def call(producer, config)
13
+ return DummyClient.new unless config.deliver
14
+
15
+ Rdkafka::Config.logger = config.logger
16
+ Rdkafka::Config.statistics_callback = build_statistics_callback(producer, config.monitor)
17
+
18
+ client = Rdkafka::Config.new(config.kafka.to_h).producer
19
+ client.delivery_callback = build_delivery_callback(producer, config.monitor)
20
+ client
21
+ end
22
+
23
+ private
24
+
25
+ # Creates a proc that we want to run upon each successful message delivery
26
+ #
27
+ # @param producer [Producer]
28
+ # @param monitor [Object] monitor we want to use
29
+ # @return [Proc] delivery callback
30
+ def build_delivery_callback(producer, monitor)
31
+ lambda do |delivery_report|
32
+ monitor.instrument(
33
+ 'message.acknowledged',
34
+ producer: producer,
35
+ offset: delivery_report.offset,
36
+ partition: delivery_report.partition
37
+ )
38
+ end
39
+ end
40
+
41
+ # Creates a proc that we want to run upon each statistics callback execution
42
+ #
43
+ # @param producer [Producer]
44
+ # @param monitor [Object] monitor we want to use
45
+ # @return [Proc] statistics callback
46
+ # @note We decorate the statistics with our own decorator because some of the metrics from
47
+ # rdkafka are absolute. For example number of sent messages increases not in reference to
48
+ # previous statistics emit but from the beginning of the process. We decorate it with diff
49
+ # of all the numeric values against the data from the previous callback emit
50
+ def build_statistics_callback(producer, monitor)
51
+ statistics_decorator = StatisticsDecorator.new
52
+
53
+ lambda do |statistics|
54
+ monitor.instrument(
55
+ 'statistics.emitted',
56
+ producer: producer,
57
+ statistics: statistics_decorator.call(statistics)
58
+ )
59
+ end
60
+ end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # A dummy client that is supposed to be used instead of Rdkafka::Producer in case we don't
6
+ # want to dispatch anything to Kafka
7
+ class DummyClient
8
+ # @return [DummyClient] dummy instance
9
+ def initialize
10
+ @counter = -1
11
+ end
12
+
13
+ # Dummy method for returning the delivery report
14
+ # @param _args [Object] anything that the delivery handle accepts
15
+ # @return [::Rdkafka::Producer::DeliveryReport]
16
+ def wait(*_args)
17
+ ::Rdkafka::Producer::DeliveryReport.new(0, @counter += 1)
18
+ end
19
+
20
+ # @param _args [Object] anything really, this dummy is suppose to support anything
21
+ def respond_to_missing?(*_args)
22
+ true
23
+ end
24
+
25
+ # @param _args [Object] anything really, this dummy is suppose to support anything
26
+ # @return [self] returns self for chaining cases
27
+ def method_missing(*_args)
28
+ self || super
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WaterDrop
4
+ class Producer
5
+ # Many of the librdkafka statistics are absolute values instead of a gauge.
6
+ # This means, that for example number of messages sent is an absolute growing value
7
+ # instead of being a value of messages sent from the last statistics report.
8
+ # This decorator calculates the diff against previously emited stats, so we get also
9
+ # the diff together with the original values
10
+ class StatisticsDecorator
11
+ def initialize
12
+ @previous = {}.freeze
13
+ end
14
+
15
+ # @param emited_stats [Hash] original emited statistics
16
+ # @return [Hash] emited statistics extended with the diff data
17
+ # @note We modify the emited statistics, instead of creating new. Since we don't expose
18
+ # any API to get raw data, users can just assume that the result of this decoration is the
19
+ # proper raw stats that they can use
20
+ def call(emited_stats)
21
+ diff(
22
+ @previous,
23
+ emited_stats
24
+ )
25
+
26
+ @previous = emited_stats
27
+
28
+ emited_stats.freeze
29
+ end
30
+
31
+ private
32
+
33
+ # Calculates the diff of the provided values and modifies in place the emited statistics
34
+ #
35
+ # @param previous [Object] previous value from the given scope in which
36
+ # we are
37
+ # @param current [Object] current scope from emitted statistics
38
+ # @return [Object] the diff if the values were numerics or the current scope
39
+ def diff(previous, current)
40
+ if current.is_a?(Hash)
41
+ # @note We cannot use #each_key as we modify the content of the current scope
42
+ # in place (in case it's a hash)
43
+ current.keys.each do |key|
44
+ append(
45
+ current,
46
+ key,
47
+ diff((previous || {})[key], (current || {})[key])
48
+ )
49
+ end
50
+ end
51
+
52
+ if current.is_a?(Numeric) && previous.is_a?(Numeric)
53
+ current - previous
54
+ else
55
+ current
56
+ end
57
+ end
58
+
59
+ # Appends the result of the diff to a given key as long as the result is numeric
60
+ #
61
+ # @param current [Hash] current scope
62
+ # @param key [Symbol] key based on which we were diffing
63
+ # @param result [Object] diff result
64
+ def append(current, key, result)
65
+ return unless result.is_a?(Numeric)
66
+
67
+ current["#{key}_d"] = result
68
+ end
69
+ end
70
+ end
71
+ end