deimos-ruby 1.6.3 → 1.8.1.pre.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +9 -0
  3. data/.rubocop.yml +22 -16
  4. data/.ruby-version +1 -1
  5. data/CHANGELOG.md +42 -0
  6. data/Gemfile.lock +125 -98
  7. data/README.md +164 -16
  8. data/Rakefile +1 -1
  9. data/deimos-ruby.gemspec +4 -3
  10. data/docs/ARCHITECTURE.md +144 -0
  11. data/docs/CONFIGURATION.md +27 -0
  12. data/lib/deimos.rb +8 -7
  13. data/lib/deimos/active_record_consume/batch_consumption.rb +159 -0
  14. data/lib/deimos/active_record_consume/batch_slicer.rb +27 -0
  15. data/lib/deimos/active_record_consume/message_consumption.rb +58 -0
  16. data/lib/deimos/active_record_consume/schema_model_converter.rb +52 -0
  17. data/lib/deimos/active_record_consumer.rb +33 -75
  18. data/lib/deimos/active_record_producer.rb +23 -0
  19. data/lib/deimos/batch_consumer.rb +2 -140
  20. data/lib/deimos/config/configuration.rb +28 -10
  21. data/lib/deimos/consume/batch_consumption.rb +150 -0
  22. data/lib/deimos/consume/message_consumption.rb +94 -0
  23. data/lib/deimos/consumer.rb +79 -70
  24. data/lib/deimos/kafka_message.rb +1 -1
  25. data/lib/deimos/kafka_topic_info.rb +22 -3
  26. data/lib/deimos/message.rb +6 -1
  27. data/lib/deimos/metrics/provider.rb +0 -2
  28. data/lib/deimos/poll_info.rb +9 -0
  29. data/lib/deimos/schema_backends/avro_base.rb +28 -1
  30. data/lib/deimos/schema_backends/base.rb +15 -2
  31. data/lib/deimos/tracing/provider.rb +0 -2
  32. data/lib/deimos/utils/db_poller.rb +149 -0
  33. data/lib/deimos/utils/db_producer.rb +59 -16
  34. data/lib/deimos/utils/deadlock_retry.rb +68 -0
  35. data/lib/deimos/utils/lag_reporter.rb +19 -26
  36. data/lib/deimos/version.rb +1 -1
  37. data/lib/generators/deimos/active_record/templates/migration.rb.tt +28 -0
  38. data/lib/generators/deimos/active_record/templates/model.rb.tt +5 -0
  39. data/lib/generators/deimos/active_record_generator.rb +79 -0
  40. data/lib/generators/deimos/db_backend/templates/migration +1 -0
  41. data/lib/generators/deimos/db_backend/templates/rails3_migration +1 -0
  42. data/lib/generators/deimos/db_poller/templates/migration +11 -0
  43. data/lib/generators/deimos/db_poller/templates/rails3_migration +16 -0
  44. data/lib/generators/deimos/db_poller_generator.rb +48 -0
  45. data/lib/tasks/deimos.rake +7 -0
  46. data/spec/active_record_batch_consumer_spec.rb +481 -0
  47. data/spec/active_record_consume/batch_slicer_spec.rb +42 -0
  48. data/spec/active_record_consume/schema_model_converter_spec.rb +105 -0
  49. data/spec/active_record_consumer_spec.rb +3 -11
  50. data/spec/active_record_producer_spec.rb +66 -88
  51. data/spec/batch_consumer_spec.rb +24 -7
  52. data/spec/config/configuration_spec.rb +4 -0
  53. data/spec/consumer_spec.rb +8 -8
  54. data/spec/deimos_spec.rb +57 -49
  55. data/spec/generators/active_record_generator_spec.rb +56 -0
  56. data/spec/handlers/my_batch_consumer.rb +6 -1
  57. data/spec/handlers/my_consumer.rb +6 -1
  58. data/spec/kafka_topic_info_spec.rb +39 -16
  59. data/spec/message_spec.rb +19 -0
  60. data/spec/producer_spec.rb +3 -3
  61. data/spec/rake_spec.rb +1 -1
  62. data/spec/schemas/com/my-namespace/Generated.avsc +71 -0
  63. data/spec/schemas/com/my-namespace/MySchemaCompound-key.avsc +18 -0
  64. data/spec/schemas/com/my-namespace/Wibble.avsc +43 -0
  65. data/spec/spec_helper.rb +62 -6
  66. data/spec/utils/db_poller_spec.rb +320 -0
  67. data/spec/utils/db_producer_spec.rb +84 -10
  68. data/spec/utils/deadlock_retry_spec.rb +74 -0
  69. data/spec/utils/lag_reporter_spec.rb +29 -22
  70. metadata +66 -30
  71. data/lib/deimos/base_consumer.rb +0 -104
  72. data/lib/deimos/utils/executor.rb +0 -124
  73. data/lib/deimos/utils/platform_schema_validation.rb +0 -0
  74. data/lib/deimos/utils/signal_handler.rb +0 -68
  75. data/spec/utils/executor_spec.rb +0 -53
  76. data/spec/utils/signal_handler_spec.rb +0 -16
@@ -1,95 +1,104 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'deimos/base_consumer'
4
- require 'deimos/shared_config'
5
- require 'phobos/handler'
6
- require 'active_support/all'
3
+ require 'deimos/consume/batch_consumption'
4
+ require 'deimos/consume/message_consumption'
7
5
 
8
- # Class to consume messages coming from the pipeline topic
6
+ # Class to consume messages coming from a Kafka topic
9
7
  # Note: According to the docs, instances of your handler will be created
10
- # for every incoming message. This class should be lightweight.
8
+ # for every incoming message/batch. This class should be lightweight.
11
9
  module Deimos
12
- # Parent consumer class.
13
- class Consumer < BaseConsumer
14
- include Phobos::Handler
10
+ # Basic consumer class. Inherit from this class and override either consume
11
+ # or consume_batch, depending on the delivery mode of your listener.
12
+ # `consume` -> use `delivery :message` or `delivery :batch`
13
+ # `consume_batch` -> use `delivery :inline_batch`
14
+ class Consumer
15
+ include Consume::MessageConsumption
16
+ include Consume::BatchConsumption
17
+ include SharedConfig
15
18
 
16
- # :nodoc:
17
- def around_consume(payload, metadata)
18
- decoded_payload = payload.dup
19
- new_metadata = metadata.dup
20
- benchmark = Benchmark.measure do
21
- _with_error_span(payload, metadata) do
22
- new_metadata[:key] = decode_key(metadata[:key]) if self.class.config[:key_configured]
23
- decoded_payload = payload ? self.class.decoder.decode(payload) : nil
24
- _received_message(decoded_payload, new_metadata)
25
- yield decoded_payload, new_metadata
26
- end
19
+ class << self
20
+ # @return [Deimos::SchemaBackends::Base]
21
+ def decoder
22
+ @decoder ||= Deimos.schema_backend(schema: config[:schema],
23
+ namespace: config[:namespace])
24
+ end
25
+
26
+ # @return [Deimos::SchemaBackends::Base]
27
+ def key_decoder
28
+ @key_decoder ||= Deimos.schema_backend(schema: config[:key_schema],
29
+ namespace: config[:namespace])
27
30
  end
28
- _handle_success(benchmark.real, decoded_payload, new_metadata)
29
31
  end
30
32
 
31
- # Consume incoming messages.
32
- # @param _payload [String]
33
- # @param _metadata [Hash]
34
- def consume(_payload, _metadata)
35
- raise NotImplementedError
33
+ # Helper method to decode an encoded key.
34
+ # @param key [String]
35
+ # @return [Object] the decoded key.
36
+ def decode_key(key)
37
+ return nil if key.nil?
38
+
39
+ config = self.class.config
40
+ unless config[:key_configured]
41
+ raise 'No key config given - if you are not decoding keys, please use '\
42
+ '`key_config plain: true`'
43
+ end
44
+
45
+ if config[:key_field]
46
+ self.class.decoder.decode_key(key, config[:key_field])
47
+ elsif config[:key_schema]
48
+ self.class.key_decoder.decode(key, schema: config[:key_schema])
49
+ else # no encoding
50
+ key
51
+ end
36
52
  end
37
53
 
38
54
  private
39
55
 
40
- def _received_message(payload, metadata)
41
- Deimos.config.logger.info(
42
- message: 'Got Kafka event',
43
- payload: payload,
44
- metadata: metadata
56
+ def _with_span
57
+ @span = Deimos.config.tracer&.start(
58
+ 'deimos-consumer',
59
+ resource: self.class.name.gsub('::', '-')
45
60
  )
46
- Deimos.config.metrics&.increment('handler', tags: %W(
47
- status:received
61
+ yield
62
+ ensure
63
+ Deimos.config.tracer&.finish(@span)
64
+ end
65
+
66
+ def _report_time_delayed(payload, metadata)
67
+ return if payload.nil? || payload['timestamp'].blank?
68
+
69
+ begin
70
+ time_delayed = Time.now.in_time_zone - payload['timestamp'].to_datetime
71
+ rescue ArgumentError
72
+ Deimos.config.logger.info(
73
+ message: "Error parsing timestamp! #{payload['timestamp']}"
74
+ )
75
+ return
76
+ end
77
+ Deimos.config.metrics&.histogram('handler', time_delayed, tags: %W(
78
+ time:time_delayed
48
79
  topic:#{metadata[:topic]}
49
80
  ))
50
- _report_time_delayed(payload, metadata)
51
81
  end
52
82
 
53
- # @param exception [Throwable]
54
- # @param payload [Hash]
55
- # @param metadata [Hash]
56
- def _handle_error(exception, payload, metadata)
57
- Deimos.config.metrics&.increment(
58
- 'handler',
59
- tags: %W(
60
- status:error
61
- topic:#{metadata[:topic]}
62
- )
63
- )
64
- Deimos.config.logger.warn(
65
- message: 'Error consuming message',
66
- handler: self.class.name,
67
- metadata: metadata,
68
- data: payload,
69
- error_message: exception.message,
70
- error: exception.backtrace
71
- )
72
- super
83
+ # Overrideable method to determine if a given error should be considered
84
+ # "fatal" and always be reraised.
85
+ # @param _error [Exception]
86
+ # @param _payload [Hash]
87
+ # @param _metadata [Hash]
88
+ # @return [Boolean]
89
+ def fatal_error?(_error, _payload, _metadata)
90
+ false
73
91
  end
74
92
 
75
- # @param time_taken [Float]
93
+ # @param exception [Exception]
76
94
  # @param payload [Hash]
77
95
  # @param metadata [Hash]
78
- def _handle_success(time_taken, payload, metadata)
79
- Deimos.config.metrics&.histogram('handler', time_taken, tags: %W(
80
- time:consume
81
- topic:#{metadata[:topic]}
82
- ))
83
- Deimos.config.metrics&.increment('handler', tags: %W(
84
- status:success
85
- topic:#{metadata[:topic]}
86
- ))
87
- Deimos.config.logger.info(
88
- message: 'Finished processing Kafka event',
89
- payload: payload,
90
- time_elapsed: time_taken,
91
- metadata: metadata
92
- )
96
+ def _error(exception, payload, metadata)
97
+ Deimos.config.tracer&.set_error(@span, exception)
98
+
99
+ raise if Deimos.config.consumers.reraise_errors ||
100
+ Deimos.config.consumers.fatal_error&.call(exception, payload, metadata) ||
101
+ fatal_error?(exception, payload, metadata)
93
102
  end
94
103
  end
95
104
  end
@@ -42,7 +42,7 @@ module Deimos
42
42
  messages.map do |m|
43
43
  {
44
44
  key: m.key.present? ? decoder&.decode_key(m.key) || m.key : nil,
45
- payload: decoder&.decoder&.decode(self.message) || self.message
45
+ payload: decoder&.decoder&.decode(m.message) || m.message
46
46
  }
47
47
  end
48
48
  end
@@ -13,8 +13,8 @@ module Deimos
13
13
  def lock(topic, lock_id)
14
14
  # Try to create it - it's fine if it already exists
15
15
  begin
16
- self.create(topic: topic)
17
- rescue ActiveRecord::RecordNotUnique # rubocop:disable Lint/SuppressedException
16
+ self.create(topic: topic, last_processed_at: Time.zone.now)
17
+ rescue ActiveRecord::RecordNotUnique
18
18
  # continue on
19
19
  end
20
20
 
@@ -52,7 +52,26 @@ module Deimos
52
52
  # @param lock_id [String]
53
53
  def clear_lock(topic, lock_id)
54
54
  self.where(topic: topic, locked_by: lock_id).
55
- update_all(locked_by: nil, locked_at: nil, error: false, retries: 0)
55
+ update_all(locked_by: nil,
56
+ locked_at: nil,
57
+ error: false,
58
+ retries: 0,
59
+ last_processed_at: Time.zone.now)
60
+ end
61
+
62
+ # Update all topics that aren't currently locked and have no messages
63
+ # waiting. It's OK if some messages get inserted in the middle of this
64
+ # because the point is that at least within a few milliseconds of each
65
+ # other, it wasn't locked and had no messages, meaning the topic
66
+ # was in a good state.
67
+ # @param except_topics [Array<String>] the list of topics we've just
68
+ # realized had messages in them, meaning all other topics were empty.
69
+ def ping_empty_topics(except_topics)
70
+ records = KafkaTopicInfo.where(locked_by: nil).
71
+ where('topic not in(?)', except_topics)
72
+ records.each do |info|
73
+ info.update_attribute(:last_processed_at, Time.zone.now)
74
+ end
56
75
  end
57
76
 
58
77
  # The producer calls this if it gets an error sending messages. This
@@ -10,7 +10,7 @@ module Deimos
10
10
  # @param producer [Class]
11
11
  def initialize(payload, producer, topic: nil, key: nil, partition_key: nil)
12
12
  @payload = payload&.with_indifferent_access
13
- @producer_name = producer.name
13
+ @producer_name = producer&.name
14
14
  @topic = topic
15
15
  @key = key
16
16
  @partition_key = partition_key
@@ -70,5 +70,10 @@ module Deimos
70
70
  def ==(other)
71
71
  self.to_h == other.to_h
72
72
  end
73
+
74
+ # @return [Boolean] True if this message is a tombstone
75
+ def tombstone?
76
+ payload.nil?
77
+ end
73
78
  end
74
79
  end
@@ -1,6 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # rubocop:disable Lint/UnusedMethodArgument
4
3
  module Deimos
5
4
  module Metrics
6
5
  # Base class for all metrics providers.
@@ -35,4 +34,3 @@ module Deimos
35
34
  end
36
35
  end
37
36
  end
38
- # rubocop:enable Lint/UnusedMethodArgument
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Deimos
4
+ # ActiveRecord class to record the last time we polled the database.
5
+ # For use with DbPoller.
6
+ class PollInfo < ActiveRecord::Base
7
+ self.table_name = 'deimos_poll_info'
8
+ end
9
+ end
@@ -33,6 +33,30 @@ module Deimos
33
33
  decode(payload, schema: @key_schema['name'])[field_name]
34
34
  end
35
35
 
36
+ # :nodoc:
37
+ def sql_type(field)
38
+ type = field.type.type
39
+ return type if %w(array map record).include?(type)
40
+
41
+ if type == :union
42
+ non_null = field.type.schemas.reject { |f| f.type == :null }
43
+ if non_null.size > 1
44
+ warn("WARNING: #{field.name} has more than one non-null type. Picking the first for the SQL type.")
45
+ end
46
+ return non_null.first.type
47
+ end
48
+ return type.to_sym if %w(float boolean).include?(type)
49
+ return :integer if type == 'int'
50
+ return :bigint if type == 'long'
51
+
52
+ if type == 'double'
53
+ warn('Avro `double` type turns into SQL `float` type. Please ensure you have the correct `limit` set.')
54
+ return :float
55
+ end
56
+
57
+ :string
58
+ end
59
+
36
60
  # @override
37
61
  def coerce_field(field, value)
38
62
  AvroSchemaCoercer.new(avro_schema).coerce_type(field.type, value)
@@ -40,7 +64,10 @@ module Deimos
40
64
 
41
65
  # @override
42
66
  def schema_fields
43
- avro_schema.fields.map { |field| SchemaField.new(field.name, field.type) }
67
+ avro_schema.fields.map do |field|
68
+ enum_values = field.type.type == 'enum' ? field.type.symbols : []
69
+ SchemaField.new(field.name, field.type, enum_values)
70
+ end
44
71
  end
45
72
 
46
73
  # @override
@@ -3,13 +3,15 @@
3
3
  module Deimos
4
4
  # Represents a field in the schema.
5
5
  class SchemaField
6
- attr_accessor :name, :type
6
+ attr_accessor :name, :type, :enum_values
7
7
 
8
8
  # @param name [String]
9
9
  # @param type [Object]
10
- def initialize(name, type)
10
+ # @param enum_values [Array<String>]
11
+ def initialize(name, type, enum_values=[])
11
12
  @name = name
12
13
  @type = type
14
+ @enum_values = enum_values
13
15
  end
14
16
  end
15
17
 
@@ -109,6 +111,17 @@ module Deimos
109
111
  raise NotImplementedError
110
112
  end
111
113
 
114
+ # Given a field definition, return the SQL type that might be used in
115
+ # ActiveRecord table creation - e.g. for Avro, a `long` type would
116
+ # return `:bigint`. There are also special values that need to be returned:
117
+ # `:array`, `:map` and `:record`, for types representing those structures.
118
+ # `:enum` is also recognized.
119
+ # @param field [SchemaField]
120
+ # @return [Symbol]
121
+ def sql_type(field)
122
+ raise NotImplementedError
123
+ end
124
+
112
125
  # Encode a message key. To be defined by subclass.
113
126
  # @param key [String|Hash] the value to use as the key.
114
127
  # @param key_id [Symbol|String] the field name of the key.
@@ -1,6 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # rubocop:disable Lint/UnusedMethodArgument
4
3
  module Deimos
5
4
  module Tracing
6
5
  # Base class for all tracing providers.
@@ -28,4 +27,3 @@ module Deimos
28
27
  end
29
28
  end
30
29
  end
31
- # rubocop:enable Lint/UnusedMethodArgument
@@ -0,0 +1,149 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'deimos/poll_info'
4
+ require 'sigurd/executor'
5
+ require 'sigurd/signal_handler'
6
+
7
+ module Deimos
8
+ module Utils
9
+ # Class which continually polls the database and sends Kafka messages.
10
+ class DbPoller
11
+ BATCH_SIZE = 1000
12
+
13
+ # Needed for Executor so it can identify the worker
14
+ attr_reader :id
15
+
16
+ # Begin the DB Poller process.
17
+ def self.start!
18
+ if Deimos.config.db_poller_objects.empty?
19
+ raise('No pollers configured!')
20
+ end
21
+
22
+ pollers = Deimos.config.db_poller_objects.map do |poller_config|
23
+ self.new(poller_config)
24
+ end
25
+ executor = Sigurd::Executor.new(pollers,
26
+ sleep_seconds: 5,
27
+ logger: Deimos.config.logger)
28
+ signal_handler = Sigurd::SignalHandler.new(executor)
29
+ signal_handler.run!
30
+ end
31
+
32
+ # @param config [Deimos::Configuration::ConfigStruct]
33
+ def initialize(config)
34
+ @config = config
35
+ @id = SecureRandom.hex
36
+ begin
37
+ @producer = @config.producer_class.constantize
38
+ rescue NameError
39
+ raise "Class #{@config.producer_class} not found!"
40
+ end
41
+ unless @producer < Deimos::ActiveRecordProducer
42
+ raise "Class #{@producer.class.name} is not an ActiveRecordProducer!"
43
+ end
44
+ end
45
+
46
+ # Start the poll:
47
+ # 1) Grab the current PollInfo from the database indicating the last
48
+ # time we ran
49
+ # 2) On a loop, process all the recent updates between the last time
50
+ # we ran and now.
51
+ def start
52
+ # Don't send asynchronously
53
+ if Deimos.config.producers.backend == :kafka_async
54
+ Deimos.config.producers.backend = :kafka
55
+ end
56
+ Deimos.config.logger.info('Starting...')
57
+ @signal_to_stop = false
58
+ retrieve_poll_info
59
+ loop do
60
+ if @signal_to_stop
61
+ Deimos.config.logger.info('Shutting down')
62
+ break
63
+ end
64
+ process_updates
65
+ sleep 0.1
66
+ end
67
+ end
68
+
69
+ # Grab the PollInfo or create if it doesn't exist.
70
+ def retrieve_poll_info
71
+ ActiveRecord::Base.connection.reconnect!
72
+ new_time = @config.start_from_beginning ? Time.new(0) : Time.zone.now
73
+ @info = Deimos::PollInfo.find_by_producer(@config.producer_class) ||
74
+ Deimos::PollInfo.create!(producer: @config.producer_class,
75
+ last_sent: new_time,
76
+ last_sent_id: 0)
77
+ end
78
+
79
+ # Stop the poll.
80
+ def stop
81
+ Deimos.config.logger.info('Received signal to stop')
82
+ @signal_to_stop = true
83
+ end
84
+
85
+ # Indicate whether this current loop should process updates. Most loops
86
+ # will busy-wait (sleeping 0.1 seconds) until it's ready.
87
+ # @return [Boolean]
88
+ def should_run?
89
+ Time.zone.now - @info.last_sent - @config.delay_time >= @config.run_every
90
+ end
91
+
92
+ # @param record [ActiveRecord::Base]
93
+ # @return [ActiveSupport::TimeWithZone]
94
+ def last_updated(record)
95
+ record.public_send(@config.timestamp_column)
96
+ end
97
+
98
+ # Send messages for updated data.
99
+ def process_updates
100
+ return unless should_run?
101
+
102
+ time_from = @config.full_table ? Time.new(0) : @info.last_sent.in_time_zone
103
+ time_to = Time.zone.now - @config.delay_time
104
+ Deimos.config.logger.info("Polling #{@producer.topic} from #{time_from} to #{time_to}")
105
+ message_count = 0
106
+ batch_count = 0
107
+
108
+ # poll_query gets all the relevant data from the database, as defined
109
+ # by the producer itself.
110
+ loop do
111
+ Deimos.config.logger.debug("Polling #{@producer.topic}, batch #{batch_count + 1}")
112
+ batch = fetch_results(time_from, time_to).to_a
113
+ break if batch.empty?
114
+
115
+ batch_count += 1
116
+ process_batch(batch)
117
+ message_count += batch.size
118
+ time_from = last_updated(batch.last)
119
+ end
120
+ Deimos.config.logger.info("Poll #{@producer.topic} complete at #{time_to} (#{message_count} messages, #{batch_count} batches}")
121
+ end
122
+
123
+ # @param time_from [ActiveSupport::TimeWithZone]
124
+ # @param time_to [ActiveSupport::TimeWithZone]
125
+ # @return [ActiveRecord::Relation]
126
+ def fetch_results(time_from, time_to)
127
+ id = @producer.config[:record_class].primary_key
128
+ quoted_timestamp = ActiveRecord::Base.connection.quote_column_name(@config.timestamp_column)
129
+ quoted_id = ActiveRecord::Base.connection.quote_column_name(id)
130
+ @producer.poll_query(time_from: time_from,
131
+ time_to: time_to,
132
+ column_name: @config.timestamp_column,
133
+ min_id: @info.last_sent_id).
134
+ limit(BATCH_SIZE).
135
+ order("#{quoted_timestamp}, #{quoted_id}")
136
+ end
137
+
138
+ # @param batch [Array<ActiveRecord::Base>]
139
+ def process_batch(batch)
140
+ record = batch.last
141
+ id_method = record.class.primary_key
142
+ last_id = record.public_send(id_method)
143
+ last_updated_at = last_updated(record)
144
+ @producer.send_events(batch)
145
+ @info.update_attributes!(last_sent: last_updated_at, last_sent_id: last_id)
146
+ end
147
+ end
148
+ end
149
+ end