deimos-ruby 1.6.3 → 1.8.1.pre.beta1

Files changed (76)
  1. checksums.yaml +4 -4
  2. data/.circleci/config.yml +9 -0
  3. data/.rubocop.yml +22 -16
  4. data/.ruby-version +1 -1
  5. data/CHANGELOG.md +42 -0
  6. data/Gemfile.lock +125 -98
  7. data/README.md +164 -16
  8. data/Rakefile +1 -1
  9. data/deimos-ruby.gemspec +4 -3
  10. data/docs/ARCHITECTURE.md +144 -0
  11. data/docs/CONFIGURATION.md +27 -0
  12. data/lib/deimos.rb +8 -7
  13. data/lib/deimos/active_record_consume/batch_consumption.rb +159 -0
  14. data/lib/deimos/active_record_consume/batch_slicer.rb +27 -0
  15. data/lib/deimos/active_record_consume/message_consumption.rb +58 -0
  16. data/lib/deimos/active_record_consume/schema_model_converter.rb +52 -0
  17. data/lib/deimos/active_record_consumer.rb +33 -75
  18. data/lib/deimos/active_record_producer.rb +23 -0
  19. data/lib/deimos/batch_consumer.rb +2 -140
  20. data/lib/deimos/config/configuration.rb +28 -10
  21. data/lib/deimos/consume/batch_consumption.rb +150 -0
  22. data/lib/deimos/consume/message_consumption.rb +94 -0
  23. data/lib/deimos/consumer.rb +79 -70
  24. data/lib/deimos/kafka_message.rb +1 -1
  25. data/lib/deimos/kafka_topic_info.rb +22 -3
  26. data/lib/deimos/message.rb +6 -1
  27. data/lib/deimos/metrics/provider.rb +0 -2
  28. data/lib/deimos/poll_info.rb +9 -0
  29. data/lib/deimos/schema_backends/avro_base.rb +28 -1
  30. data/lib/deimos/schema_backends/base.rb +15 -2
  31. data/lib/deimos/tracing/provider.rb +0 -2
  32. data/lib/deimos/utils/db_poller.rb +149 -0
  33. data/lib/deimos/utils/db_producer.rb +59 -16
  34. data/lib/deimos/utils/deadlock_retry.rb +68 -0
  35. data/lib/deimos/utils/lag_reporter.rb +19 -26
  36. data/lib/deimos/version.rb +1 -1
  37. data/lib/generators/deimos/active_record/templates/migration.rb.tt +28 -0
  38. data/lib/generators/deimos/active_record/templates/model.rb.tt +5 -0
  39. data/lib/generators/deimos/active_record_generator.rb +79 -0
  40. data/lib/generators/deimos/db_backend/templates/migration +1 -0
  41. data/lib/generators/deimos/db_backend/templates/rails3_migration +1 -0
  42. data/lib/generators/deimos/db_poller/templates/migration +11 -0
  43. data/lib/generators/deimos/db_poller/templates/rails3_migration +16 -0
  44. data/lib/generators/deimos/db_poller_generator.rb +48 -0
  45. data/lib/tasks/deimos.rake +7 -0
  46. data/spec/active_record_batch_consumer_spec.rb +481 -0
  47. data/spec/active_record_consume/batch_slicer_spec.rb +42 -0
  48. data/spec/active_record_consume/schema_model_converter_spec.rb +105 -0
  49. data/spec/active_record_consumer_spec.rb +3 -11
  50. data/spec/active_record_producer_spec.rb +66 -88
  51. data/spec/batch_consumer_spec.rb +24 -7
  52. data/spec/config/configuration_spec.rb +4 -0
  53. data/spec/consumer_spec.rb +8 -8
  54. data/spec/deimos_spec.rb +57 -49
  55. data/spec/generators/active_record_generator_spec.rb +56 -0
  56. data/spec/handlers/my_batch_consumer.rb +6 -1
  57. data/spec/handlers/my_consumer.rb +6 -1
  58. data/spec/kafka_topic_info_spec.rb +39 -16
  59. data/spec/message_spec.rb +19 -0
  60. data/spec/producer_spec.rb +3 -3
  61. data/spec/rake_spec.rb +1 -1
  62. data/spec/schemas/com/my-namespace/Generated.avsc +71 -0
  63. data/spec/schemas/com/my-namespace/MySchemaCompound-key.avsc +18 -0
  64. data/spec/schemas/com/my-namespace/Wibble.avsc +43 -0
  65. data/spec/spec_helper.rb +62 -6
  66. data/spec/utils/db_poller_spec.rb +320 -0
  67. data/spec/utils/db_producer_spec.rb +84 -10
  68. data/spec/utils/deadlock_retry_spec.rb +74 -0
  69. data/spec/utils/lag_reporter_spec.rb +29 -22
  70. metadata +66 -30
  71. data/lib/deimos/base_consumer.rb +0 -104
  72. data/lib/deimos/utils/executor.rb +0 -124
  73. data/lib/deimos/utils/platform_schema_validation.rb +0 -0
  74. data/lib/deimos/utils/signal_handler.rb +0 -68
  75. data/spec/utils/executor_spec.rb +0 -53
  76. data/spec/utils/signal_handler_spec.rb +0 -16
data/lib/deimos/consumer.rb
@@ -1,95 +1,104 @@
 # frozen_string_literal: true
 
-require 'deimos/base_consumer'
-require 'deimos/shared_config'
-require 'phobos/handler'
-require 'active_support/all'
+require 'deimos/consume/batch_consumption'
+require 'deimos/consume/message_consumption'
 
-# Class to consume messages coming from the pipeline topic
+# Class to consume messages coming from a Kafka topic
 # Note: According to the docs, instances of your handler will be created
-# for every incoming message. This class should be lightweight.
+# for every incoming message/batch. This class should be lightweight.
 module Deimos
-  # Parent consumer class.
-  class Consumer < BaseConsumer
-    include Phobos::Handler
+  # Basic consumer class. Inherit from this class and override either consume
+  # or consume_batch, depending on the delivery mode of your listener.
+  # `consume` -> use `delivery :message` or `delivery :batch`
+  # `consume_batch` -> use `delivery :inline_batch`
+  class Consumer
+    include Consume::MessageConsumption
+    include Consume::BatchConsumption
+    include SharedConfig
 
-    # :nodoc:
-    def around_consume(payload, metadata)
-      decoded_payload = payload.dup
-      new_metadata = metadata.dup
-      benchmark = Benchmark.measure do
-        _with_error_span(payload, metadata) do
-          new_metadata[:key] = decode_key(metadata[:key]) if self.class.config[:key_configured]
-          decoded_payload = payload ? self.class.decoder.decode(payload) : nil
-          _received_message(decoded_payload, new_metadata)
-          yield decoded_payload, new_metadata
-        end
+    class << self
+      # @return [Deimos::SchemaBackends::Base]
+      def decoder
+        @decoder ||= Deimos.schema_backend(schema: config[:schema],
+                                           namespace: config[:namespace])
+      end
+
+      # @return [Deimos::SchemaBackends::Base]
+      def key_decoder
+        @key_decoder ||= Deimos.schema_backend(schema: config[:key_schema],
+                                               namespace: config[:namespace])
       end
-      _handle_success(benchmark.real, decoded_payload, new_metadata)
     end
 
-    # Consume incoming messages.
-    # @param _payload [String]
-    # @param _metadata [Hash]
-    def consume(_payload, _metadata)
-      raise NotImplementedError
+    # Helper method to decode an encoded key.
+    # @param key [String]
+    # @return [Object] the decoded key.
+    def decode_key(key)
+      return nil if key.nil?
+
+      config = self.class.config
+      unless config[:key_configured]
+        raise 'No key config given - if you are not decoding keys, please use '\
+              '`key_config plain: true`'
+      end
+
+      if config[:key_field]
+        self.class.decoder.decode_key(key, config[:key_field])
+      elsif config[:key_schema]
+        self.class.key_decoder.decode(key, schema: config[:key_schema])
+      else # no encoding
+        key
+      end
     end
 
     private
 
-    def _received_message(payload, metadata)
-      Deimos.config.logger.info(
-        message: 'Got Kafka event',
-        payload: payload,
-        metadata: metadata
+    def _with_span
+      @span = Deimos.config.tracer&.start(
+        'deimos-consumer',
+        resource: self.class.name.gsub('::', '-')
       )
-      Deimos.config.metrics&.increment('handler', tags: %W(
-        status:received
+      yield
+    ensure
+      Deimos.config.tracer&.finish(@span)
+    end
+
+    def _report_time_delayed(payload, metadata)
+      return if payload.nil? || payload['timestamp'].blank?
+
+      begin
+        time_delayed = Time.now.in_time_zone - payload['timestamp'].to_datetime
+      rescue ArgumentError
+        Deimos.config.logger.info(
+          message: "Error parsing timestamp! #{payload['timestamp']}"
+        )
+        return
+      end
+      Deimos.config.metrics&.histogram('handler', time_delayed, tags: %W(
+        time:time_delayed
        topic:#{metadata[:topic]}
       ))
-      _report_time_delayed(payload, metadata)
     end
 
-    # @param exception [Throwable]
-    # @param payload [Hash]
-    # @param metadata [Hash]
-    def _handle_error(exception, payload, metadata)
-      Deimos.config.metrics&.increment(
-        'handler',
-        tags: %W(
-          status:error
-          topic:#{metadata[:topic]}
-        )
-      )
-      Deimos.config.logger.warn(
-        message: 'Error consuming message',
-        handler: self.class.name,
-        metadata: metadata,
-        data: payload,
-        error_message: exception.message,
-        error: exception.backtrace
-      )
-      super
+    # Overrideable method to determine if a given error should be considered
+    # "fatal" and always be reraised.
+    # @param _error [Exception]
+    # @param _payload [Hash]
+    # @param _metadata [Hash]
+    # @return [Boolean]
+    def fatal_error?(_error, _payload, _metadata)
+      false
     end
 
-    # @param time_taken [Float]
+    # @param exception [Exception]
     # @param payload [Hash]
     # @param metadata [Hash]
-    def _handle_success(time_taken, payload, metadata)
-      Deimos.config.metrics&.histogram('handler', time_taken, tags: %W(
-        time:consume
-        topic:#{metadata[:topic]}
-      ))
-      Deimos.config.metrics&.increment('handler', tags: %W(
-        status:success
-        topic:#{metadata[:topic]}
-      ))
-      Deimos.config.logger.info(
-        message: 'Finished processing Kafka event',
-        payload: payload,
-        time_elapsed: time_taken,
-        metadata: metadata
-      )
+    def _error(exception, payload, metadata)
+      Deimos.config.tracer&.set_error(@span, exception)
+
+      raise if Deimos.config.consumers.reraise_errors ||
+               Deimos.config.consumers.fatal_error&.call(exception, payload, metadata) ||
+               fatal_error?(exception, payload, metadata)
    end
   end
 end
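To make the new contract concrete, here is a minimal sketch of a message-mode handler. The `MyConsumer` class and its body are hypothetical; `consume` and `fatal_error?` are the hooks visible in the diff above.

    # Minimal sketch, assuming the listener is configured with `delivery :message`.
    class MyConsumer < Deimos::Consumer
      # Receives the decoded payload plus metadata (including the decoded key).
      def consume(payload, metadata)
        Deimos.config.logger.info(message: 'Handled event',
                                  payload: payload,
                                  key: metadata[:key])
      end

      # Optional override: errors matching this are always reraised, even
      # when consumers.reraise_errors is disabled.
      def fatal_error?(error, _payload, _metadata)
        error.is_a?(ActiveRecord::ConnectionNotEstablished)
      end
    end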
data/lib/deimos/kafka_message.rb
@@ -42,7 +42,7 @@ module Deimos
       messages.map do |m|
         {
           key: m.key.present? ? decoder&.decode_key(m.key) || m.key : nil,
-          payload: decoder&.decoder&.decode(self.message) || self.message
+          payload: decoder&.decoder&.decode(m.message) || m.message
         }
       end
     end
data/lib/deimos/kafka_topic_info.rb
@@ -13,8 +13,8 @@ module Deimos
      def lock(topic, lock_id)
        # Try to create it - it's fine if it already exists
        begin
-         self.create(topic: topic)
-       rescue ActiveRecord::RecordNotUnique # rubocop:disable Lint/SuppressedException
+         self.create(topic: topic, last_processed_at: Time.zone.now)
+       rescue ActiveRecord::RecordNotUnique
          # continue on
        end
 
@@ -52,7 +52,26 @@ module Deimos
      # @param lock_id [String]
      def clear_lock(topic, lock_id)
        self.where(topic: topic, locked_by: lock_id).
-         update_all(locked_by: nil, locked_at: nil, error: false, retries: 0)
+         update_all(locked_by: nil,
+                    locked_at: nil,
+                    error: false,
+                    retries: 0,
+                    last_processed_at: Time.zone.now)
+     end
+
+     # Update all topics that aren't currently locked and have no messages
+     # waiting. It's OK if some messages get inserted in the middle of this
+     # because the point is that at least within a few milliseconds of each
+     # other, it wasn't locked and had no messages, meaning the topic
+     # was in a good state.
+     # @param except_topics [Array<String>] the list of topics we've just
+     #   realized had messages in them, meaning all other topics were empty.
+     def ping_empty_topics(except_topics)
+       records = KafkaTopicInfo.where(locked_by: nil).
+         where('topic not in(?)', except_topics)
+       records.each do |info|
+         info.update_attribute(:last_processed_at, Time.zone.now)
+       end
      end
 
      # The producer calls this if it gets an error sending messages. This
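For context, a rough sketch of the locking flow these helpers support. The `worker_id`, `send_pending_messages` and `topics_with_messages` names are hypothetical stand-ins, and the assumption that `lock` returns a truthy value on success should be checked against the full class.

    if Deimos::KafkaTopicInfo.lock('my-topic', worker_id)
      begin
        send_pending_messages('my-topic') # hypothetical: drain this topic's queued messages
      ensure
        # Releases the lock; with this change it also stamps last_processed_at.
        Deimos::KafkaTopicInfo.clear_lock('my-topic', worker_id)
      end
    end
    # Topics that turned out to have no waiting messages can still be marked healthy:
    Deimos::KafkaTopicInfo.ping_empty_topics(topics_with_messages)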
data/lib/deimos/message.rb
@@ -10,7 +10,7 @@ module Deimos
     # @param producer [Class]
     def initialize(payload, producer, topic: nil, key: nil, partition_key: nil)
       @payload = payload&.with_indifferent_access
-      @producer_name = producer.name
+      @producer_name = producer&.name
       @topic = topic
       @key = key
       @partition_key = partition_key
@@ -70,5 +70,10 @@ module Deimos
     def ==(other)
       self.to_h == other.to_h
     end
+
+    # @return [Boolean] True if this message is a tombstone
+    def tombstone?
+      payload.nil?
+    end
   end
 end
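Both message.rb changes show up in a short sketch (`MyProducer` is a placeholder producer class):

    # A tombstone is a keyed message with a nil payload, conventionally a deletion marker.
    msg = Deimos::Message.new(nil, MyProducer, topic: 'my-topic', key: 123)
    msg.tombstone? # => true

    # The safe navigation on producer&.name means the producer may now be nil:
    raw = Deimos::Message.new({ 'id' => 1 }, nil, topic: 'my-topic')
    raw.tombstone? # => false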
data/lib/deimos/metrics/provider.rb
@@ -1,6 +1,5 @@
 # frozen_string_literal: true
 
-# rubocop:disable Lint/UnusedMethodArgument
 module Deimos
   module Metrics
     # Base class for all metrics providers.
@@ -35,4 +34,3 @@ module Deimos
     end
   end
 end
-# rubocop:enable Lint/UnusedMethodArgument
data/lib/deimos/poll_info.rb
@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+
+module Deimos
+  # ActiveRecord class to record the last time we polled the database.
+  # For use with DbPoller.
+  class PollInfo < ActiveRecord::Base
+    self.table_name = 'deimos_poll_info'
+  end
+end
data/lib/deimos/schema_backends/avro_base.rb
@@ -33,6 +33,30 @@ module Deimos
        decode(payload, schema: @key_schema['name'])[field_name]
      end
 
+     # :nodoc:
+     def sql_type(field)
+       type = field.type.type
+       return type if %w(array map record).include?(type)
+
+       if type == :union
+         non_null = field.type.schemas.reject { |f| f.type == :null }
+         if non_null.size > 1
+           warn("WARNING: #{field.name} has more than one non-null type. Picking the first for the SQL type.")
+         end
+         return non_null.first.type
+       end
+       return type.to_sym if %w(float boolean).include?(type)
+       return :integer if type == 'int'
+       return :bigint if type == 'long'
+
+       if type == 'double'
+         warn('Avro `double` type turns into SQL `float` type. Please ensure you have the correct `limit` set.')
+         return :float
+       end
+
+       :string
+     end
+
      # @override
      def coerce_field(field, value)
        AvroSchemaCoercer.new(avro_schema).coerce_type(field.type, value)
@@ -40,7 +64,10 @@
 
      # @override
      def schema_fields
-       avro_schema.fields.map { |field| SchemaField.new(field.name, field.type) }
+       avro_schema.fields.map do |field|
+         enum_values = field.type.type == 'enum' ? field.type.symbols : []
+         SchemaField.new(field.name, field.type, enum_values)
+       end
      end
 
      # @override
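Putting the two avro_base.rb additions together, a sketch of inspecting a schema's SQL mapping and enum values (the schema and namespace names are placeholders):

    backend = Deimos.schema_backend(schema: 'MySchema', namespace: 'com.my-namespace')
    backend.schema_fields.each do |field|
      note = field.enum_values.any? ? " (enum: #{field.enum_values.join(', ')})" : ''
      puts "#{field.name} -> #{backend.sql_type(field)}#{note}"
    end
    # An Avro `int` maps to :integer, `long` to :bigint, and `double` to :float
    # (after a warning to double-check the column `limit`).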
data/lib/deimos/schema_backends/base.rb
@@ -3,13 +3,15 @@
 module Deimos
   # Represents a field in the schema.
   class SchemaField
-    attr_accessor :name, :type
+    attr_accessor :name, :type, :enum_values
 
     # @param name [String]
     # @param type [Object]
-    def initialize(name, type)
+    # @param enum_values [Array<String>]
+    def initialize(name, type, enum_values=[])
       @name = name
       @type = type
+      @enum_values = enum_values
     end
   end
 
@@ -109,6 +111,17 @@ module Deimos
        raise NotImplementedError
      end
 
+     # Given a field definition, return the SQL type that might be used in
+     # ActiveRecord table creation - e.g. for Avro, a `long` type would
+     # return `:bigint`. There are also special values that need to be returned:
+     # `:array`, `:map` and `:record`, for types representing those structures.
+     # `:enum` is also recognized.
+     # @param field [SchemaField]
+     # @return [Symbol]
+     def sql_type(field)
+       raise NotImplementedError
+     end
+
      # Encode a message key. To be defined by subclass.
      # @param key [String|Hash] the value to use as the key.
      # @param key_id [Symbol|String] the field name of the key.
data/lib/deimos/tracing/provider.rb
@@ -1,6 +1,5 @@
 # frozen_string_literal: true
 
-# rubocop:disable Lint/UnusedMethodArgument
 module Deimos
   module Tracing
     # Base class for all tracing providers.
@@ -28,4 +27,3 @@ module Deimos
     end
   end
 end
-# rubocop:enable Lint/UnusedMethodArgument
data/lib/deimos/utils/db_poller.rb
@@ -0,0 +1,149 @@
+# frozen_string_literal: true
+
+require 'deimos/poll_info'
+require 'sigurd/executor'
+require 'sigurd/signal_handler'
+
+module Deimos
+  module Utils
+    # Class which continually polls the database and sends Kafka messages.
+    class DbPoller
+      BATCH_SIZE = 1000
+
+      # Needed for Executor so it can identify the worker
+      attr_reader :id
+
+      # Begin the DB Poller process.
+      def self.start!
+        if Deimos.config.db_poller_objects.empty?
+          raise('No pollers configured!')
+        end
+
+        pollers = Deimos.config.db_poller_objects.map do |poller_config|
+          self.new(poller_config)
+        end
+        executor = Sigurd::Executor.new(pollers,
+                                        sleep_seconds: 5,
+                                        logger: Deimos.config.logger)
+        signal_handler = Sigurd::SignalHandler.new(executor)
+        signal_handler.run!
+      end
+
+      # @param config [Deimos::Configuration::ConfigStruct]
+      def initialize(config)
+        @config = config
+        @id = SecureRandom.hex
+        begin
+          @producer = @config.producer_class.constantize
+        rescue NameError
+          raise "Class #{@config.producer_class} not found!"
+        end
+        unless @producer < Deimos::ActiveRecordProducer
+          raise "Class #{@producer.name} is not an ActiveRecordProducer!"
+        end
+      end
+
+      # Start the poll:
+      # 1) Grab the current PollInfo from the database indicating the last
+      # time we ran
+      # 2) On a loop, process all the recent updates between the last time
+      # we ran and now.
+      def start
+        # Don't send asynchronously
+        if Deimos.config.producers.backend == :kafka_async
+          Deimos.config.producers.backend = :kafka
+        end
+        Deimos.config.logger.info('Starting...')
+        @signal_to_stop = false
+        retrieve_poll_info
+        loop do
+          if @signal_to_stop
+            Deimos.config.logger.info('Shutting down')
+            break
+          end
+          process_updates
+          sleep 0.1
+        end
+      end
+
+      # Grab the PollInfo or create if it doesn't exist.
+      def retrieve_poll_info
+        ActiveRecord::Base.connection.reconnect!
+        new_time = @config.start_from_beginning ? Time.new(0) : Time.zone.now
+        @info = Deimos::PollInfo.find_by_producer(@config.producer_class) ||
+                Deimos::PollInfo.create!(producer: @config.producer_class,
+                                         last_sent: new_time,
+                                         last_sent_id: 0)
+      end
+
+      # Stop the poll.
+      def stop
+        Deimos.config.logger.info('Received signal to stop')
+        @signal_to_stop = true
+      end
+
+      # Indicate whether this current loop should process updates. Most loops
+      # will busy-wait (sleeping 0.1 seconds) until it's ready.
+      # @return [Boolean]
+      def should_run?
+        Time.zone.now - @info.last_sent - @config.delay_time >= @config.run_every
+      end
+
+      # @param record [ActiveRecord::Base]
+      # @return [ActiveSupport::TimeWithZone]
+      def last_updated(record)
+        record.public_send(@config.timestamp_column)
+      end
+
+      # Send messages for updated data.
+      def process_updates
+        return unless should_run?
+
+        time_from = @config.full_table ? Time.new(0) : @info.last_sent.in_time_zone
+        time_to = Time.zone.now - @config.delay_time
+        Deimos.config.logger.info("Polling #{@producer.topic} from #{time_from} to #{time_to}")
+        message_count = 0
+        batch_count = 0
+
+        # poll_query gets all the relevant data from the database, as defined
+        # by the producer itself.
+        loop do
+          Deimos.config.logger.debug("Polling #{@producer.topic}, batch #{batch_count + 1}")
+          batch = fetch_results(time_from, time_to).to_a
+          break if batch.empty?
+
+          batch_count += 1
+          process_batch(batch)
+          message_count += batch.size
+          time_from = last_updated(batch.last)
+        end
+        Deimos.config.logger.info("Poll #{@producer.topic} complete at #{time_to} (#{message_count} messages, #{batch_count} batches)")
+      end
+
+      # @param time_from [ActiveSupport::TimeWithZone]
+      # @param time_to [ActiveSupport::TimeWithZone]
+      # @return [ActiveRecord::Relation]
+      def fetch_results(time_from, time_to)
+        id = @producer.config[:record_class].primary_key
+        quoted_timestamp = ActiveRecord::Base.connection.quote_column_name(@config.timestamp_column)
+        quoted_id = ActiveRecord::Base.connection.quote_column_name(id)
+        @producer.poll_query(time_from: time_from,
+                             time_to: time_to,
+                             column_name: @config.timestamp_column,
+                             min_id: @info.last_sent_id).
+          limit(BATCH_SIZE).
+          order("#{quoted_timestamp}, #{quoted_id}")
+      end
+
+      # @param batch [Array<ActiveRecord::Base>]
+      def process_batch(batch)
+        record = batch.last
+        id_method = record.class.primary_key
+        last_id = record.public_send(id_method)
+        last_updated_at = last_updated(record)
+        @producer.send_events(batch)
+        @info.update_attributes!(last_sent: last_updated_at, last_sent_id: last_id)
+      end
+    end
+  end
+end
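Finally, a sketch of wiring a poller up. The setting names inside the `db_poller` block are assumptions mirroring the `@config` accessors used above (`producer_class`, `run_every`, `delay_time`, `timestamp_column`, `start_from_beginning`, `full_table`); the authoritative list is in the CONFIGURATION.md added by this release.

    Deimos.configure do
      db_poller do
        producer_class 'MyRecordProducer' # must inherit from Deimos::ActiveRecordProducer
        run_every 60                      # seconds between polling runs (see should_run?)
        delay_time 2                      # stay this far behind "now" to skip in-flight rows
        timestamp_column :updated_at      # column read by last_updated
        start_from_beginning true         # first run backfills from Time.new(0)
      end
    end

    # Pollers are then launched via DbPoller.start!, e.g. from the rake task
    # added in data/lib/tasks/deimos.rake.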