bricolage-streamingload 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: d81ff86cb9addccb7ca9db4d240218679b1f72f9
- data.tar.gz: f2ec045c994f1c6b619695f74a13aebcc9318722
+ metadata.gz: 848b815669c6580505119917a72a4e97833064c2
+ data.tar.gz: 1809410699822e2a60a21407cad1b5814551adcd
  SHA512:
- metadata.gz: a78c3b8f35f8d10cbc3da301667ea811874ccadb7653b02388990d63afc2183ee3d0c4e357dccc7799721e1dcd8e33b88e9b3d66147903c32ff109595bced0f6
- data.tar.gz: a3632f13d3ea039aa690deca646659e0d89ad636b2af55abf5f7011f54cfecddcef53d57152de513bcbaf5c03671af7a512a10781e7846efc60af9c752b8c364
+ metadata.gz: 84ecbe1e548953cc1f4889eb07e9289101e1ade28e89370dae39462b61d193521ceda25f59d8a0b0f1760b8555cf254abab50cf7ed3d11e70d81c422af2f8b82
+ data.tar.gz: b367bb4faa24e9755bee5dbeaaaf937a317826e8e562466e3264a19563bbfcc877ddba9801ffda4875d21e39feddfcc8e87df81a6433b9cd1c3a2cabcefc0754
lib/bricolage/nulllogger.rb ADDED
@@ -0,0 +1,20 @@
+ require 'logger'
+
+ module Bricolage
+   # FIXME: should be defined in the Bricolage package
+   class NullLogger
+     def debug(*args) end
+     def debug?() false end
+     def info(*args) end
+     def info?() false end
+     def warn(*args) end
+     def warn?() false end
+     def error(*args) end
+     def error?() false end
+     def exception(*args) end
+     def with_elapsed_time(*args) yield end
+     def elapsed_time(*args) yield end
+     def level() Logger::ERROR end
+     def level=(l) l end
+   end
+ end
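
NullLogger implements just enough of the Logger interface to stand in wherever a Bricolage component expects a logger, silently discarding all output. A minimal sketch of the contract (assuming the gem is on the load path):

    require 'bricolage/nulllogger'

    logger = Bricolage::NullLogger.new
    logger.info "discarded"            # writes nothing
    logger.debug?                      #=> false, so callers can skip building debug strings
    logger.with_elapsed_time { 42 }    #=> 42; timing blocks still execute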
lib/bricolage/sqsdatasource.rb CHANGED
@@ -1,5 +1,4 @@
  require 'bricolage/datasource'
- require 'bricolage/sqswrapper'
  require 'securerandom'
  require 'aws-sdk'
  require 'json'
@@ -28,6 +27,10 @@ module Bricolage
  attr_reader :access_key_id
  attr_reader :secret_access_key

+ attr_reader :visibility_timeout
+ attr_reader :max_number_of_messages
+ attr_reader :wait_time_seconds
+
  def client
    @client ||= begin
      c = @noop ? DummySQSClient.new : Aws::SQS::Client.new(region: @region, access_key_id: @access_key_id, secret_access_key: @secret_access_key)
@@ -39,22 +42,18 @@ module Bricolage
  # High-Level Polling Interface
  #

- def main_handler_loop(handlers:, message_class:)
+ def handle_messages(handler:, message_class:)
    trap_signals
-
-   n_zero = 0
-   until terminating?
-     insert_handler_wait(n_zero)
-     n_msg = handle_messages(handlers: handlers, message_class: message_class)
-     if n_msg == 0
-       n_zero += 1
-     else
-       n_zero = 0
+   polling_loop do
+     result = poll or next true
+     msgs = message_class.for_sqs_result(result)
+     msgs.each do |msg|
+       handler.handle(msg)
      end
-     delete_message_buffer.flush
+     handler.after_message_batch
+     break if terminating?
+     msgs.empty?
    end
-   delete_message_buffer.flush_force
-   logger.info "shutdown gracefully"
  end

  def trap_signals
@@ -66,6 +65,7 @@ module Bricolage
      initiate_terminate
    }
  end
+ private :trap_signals

  def initiate_terminate
    # No I/O allowed in this method
@@ -76,36 +76,72 @@ module Bricolage
    @terminating
  end

- def insert_handler_wait(n_zero)
-   sec = 2 ** [n_zero, 6].min # max 64s
-   logger.info "queue wait: sleep #{sec}" if n_zero > 0
-   sleep sec
- end
-
- def handle_messages(handlers:, message_class:)
-   n_msg = foreach_message(message_class) do |msg|
-     logger.debug "handling message: #{msg.inspect}" if logger.debug?
-     mid = "handle_#{msg.message_type}"
-     # just ignore unknown event to make app migration easy
-     if handlers.respond_to?(mid, true)
-       handlers.__send__(mid, msg)
+ def polling_loop
+   n_failure = 0
+   while true
+     failed = yield
+     if failed
+       n_failure += 1
      else
-       logger.error "unknown SQS message type: #{msg.message_type.inspect} (message-id: #{msg.message_id})"
+       n_failure = 0
      end
+     insert_handler_wait(n_failure)
    end
-   n_msg
  end
+ private :polling_loop

- def foreach_message(message_class, &block)
+ def insert_handler_wait(n_failure)
+   sec = 2 ** [n_failure, 6].min # max 64s
+   logger.info "queue wait: sleep #{sec}" if n_failure > 0
+   sleep sec
+ end
+ private :insert_handler_wait
+
+ def poll
    result = receive_messages()
    unless result and result.successful?
      logger.error "ReceiveMessage failed: #{result ? result.error.message : '(result=nil)'}"
      return nil
    end
-   logger.info "receive #{result.messages.size} messages" unless result.messages.empty?
-   msgs = message_class.for_sqs_result(result)
-   msgs.each(&block)
-   msgs.size
+   logger.info "receive #{result.messages.size} messages"
+   result
+ end
+
+ class MessageHandler
+   # abstract logger()
+
+   def handle(msg)
+     logger.debug "handling message: #{msg.inspect}" if logger.debug?
+     if handleable?(msg)
+       call_handler_method(msg)
+     else
+       handle_unknown(msg)
+     end
+   end
+
+   def handleable?(msg)
+     respond_to?(handler_method(msg), true)
+   end
+
+   def call_handler_method(msg)
+     __send__(handler_method(msg), msg)
+   end
+
+   def handler_method(msg)
+     "handle_#{msg.message_type}".intern
+   end
+
+   # Unknown message handler.
+   # Feel free to override this method.
+   def handle_unknown(msg)
+     # just ignore unknown message to make app migration easy
+     logger.error "unknown message type: #{msg.message_type.inspect} (message-id: #{msg.message_id})"
+   end
+
+   # Called after each message batch (ReceiveMessage) is processed.
+   # Override this method in subclasses on demand.
+   def after_message_batch
+   end
  end

  #
@@ -122,6 +158,18 @@ module Bricolage
    result
  end

+ def put(msg)
+   send_message(msg)
+ end
+
+ def send_message(msg)
+   client.send_message(
+     queue_url: @url,
+     message_body: { 'Records' => [msg.body] }.to_json,
+     delay_seconds: msg.delay_seconds
+   )
+ end
+
  def delete_message(msg)
    client.delete_message(
      queue_url: @url,
@@ -133,20 +181,18 @@ module Bricolage
    delete_message_buffer.put(msg)
  end

- def delete_message_buffer
-   @delete_message_buffer ||= DeleteMessageBuffer.new(client, @url, logger)
+ def process_async_delete(now = Time.now)
+   delete_message_buffer.flush(now)
  end

- def put(msg)
-   send_message(msg)
+ def process_async_delete_force
+   delete_message_buffer.flush_force
  end

- def send_message(msg)
-   client.send_message(
-     queue_url: @url,
-     message_body: { 'Records' => [msg.body] }.to_json,
-     delay_seconds: msg.delay_seconds
-   )
+ private
+
+ def delete_message_buffer
+   @delete_message_buffer ||= DeleteMessageBuffer.new(client, @url, logger)
  end

  class DeleteMessageBuffer
@@ -256,6 +302,36 @@ module Bricolage
  end # class SQSDataSource


+ class SQSClientWrapper
+
+   def initialize(sqs, logger:)
+     @sqs = sqs
+     @logger = logger
+   end
+
+   def receive_message(**args)
+     @logger.debug "receive_message(#{args.inspect})"
+     @sqs.receive_message(**args)
+   end
+
+   def send_message(**args)
+     @logger.debug "send_message(#{args.inspect})"
+     @sqs.send_message(**args)
+   end
+
+   def delete_message(**args)
+     @logger.debug "delete_message(#{args.inspect})"
+     @sqs.delete_message(**args)
+   end
+
+   def delete_message_batch(**args)
+     @logger.debug "delete_message_batch(#{args.inspect})"
+     @sqs.delete_message_batch(**args)
+   end
+
+ end # class SQSClientWrapper
+
+
  class SQSMessage

    SQS_EVENT_SOURCE = 'bricolage:system'
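
Taken together, this release reverses the old calling convention: instead of passing an object with ad-hoc handle_* methods to main_handler_loop, callers now subclass SQSDataSource::MessageHandler and hand an instance to handle_messages. A minimal sketch of the new contract (the MyEvent message class and the "data" message type are hypothetical):

    class MyHandler < Bricolage::SQSDataSource::MessageHandler
      def initialize(logger)
        @logger = logger
      end

      # MessageHandler's template methods call logger()
      attr_reader :logger

      # dispatched via handler_method for messages whose message_type is "data"
      def handle_data(msg)
        logger.info "data message: #{msg.message_id}"
      end
    end

    # queue.handle_messages(handler: MyHandler.new(logger), message_class: MyEvent)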
lib/bricolage/sqsmock.rb ADDED
@@ -0,0 +1,194 @@
+ require 'bricolage/sqsdatasource'
+ require 'bricolage/nulllogger'
+ require 'json'
+
+ module Bricolage
+
+   def SQSDataSource.new_mock(**args)
+     SQSDataSource.new(
+       url: 'http://sqs/000000000000/queue-name',
+       access_key_id: 'access_key_id_1',
+       secret_access_key: 'secret_access_key_1',
+       visibility_timeout: 30
+     ).tap {|ds|
+       logger = NullLogger.new
+       #logger = Bricolage::Logger.default
+       ds.__send__(:initialize_base, 'name', nil, logger)
+       ds.instance_variable_set(:@client, SQSMock::Client.new(**args))
+     }
+   end
+
+   module SQSMock
+
+     class Client
+       def initialize(queue: [], receive_message: nil, send_message: nil, delete_message: nil, delete_message_batch: nil)
+         @queue = queue   # [[record]]
+         @call_history = []
+
+         @receive_message = receive_message || lambda {|**args|
+           msgs = @queue.shift or break ReceiveMessageResponse.successful([])
+           ReceiveMessageResponse.successful(msgs)
+         }
+
+         @send_message = send_message || lambda {|**args|
+           SendMessageResponse.successful
+         }
+
+         @delete_message = delete_message || lambda {|**args|
+           Response.successful
+         }
+
+         @delete_message_batch = delete_message_batch || lambda {|queue_url:, entries:|
+           # Returns success for all requests by default.
+           DeleteMessageBatchResponse.new.tap {|res|
+             entries.each do |ent|
+               res.add_success_for(ent)
+             end
+           }
+         }
+       end
+
+       # Feel free to modify this array's contents
+       attr_reader :call_history
+
+       def self.def_mock_method(name)
+         define_method(name) {|**args|
+           @call_history.push CallHistory.new(name.intern, args)
+           instance_variable_get("@#{name}").(**args)
+         }
+       end
+
+       def_mock_method :receive_message
+       def_mock_method :send_message
+       def_mock_method :delete_message
+       def_mock_method :delete_message_batch
+     end
+
+     CallHistory = Struct.new(:name, :args)
+
+     # success/failure-only result
+     class Response
+       def Response.successful
+         new(successful: true)
+       end
+
+       def initialize(successful:)
+         @successful = successful
+       end
+
+       def successful?
+         @successful
+       end
+     end
+
+     class ReceiveMessageResponse < Response
+       def ReceiveMessageResponse.successful(msgs)
+         new(successful: true, messages: msgs)
+       end
+
+       def initialize(successful:, messages:)
+         super(successful: successful)
+         @messages = messages
+       end
+
+       attr_reader :messages
+     end
+
+     class SendMessageResponse < Response
+       def SendMessageResponse.successful
+         new(successful: true, message_id: "sqs-sent-message-id-#{Message.new_seq}")
+       end
+
+       def initialize(successful:, message_id:)
+         super(successful: successful)
+         @message_id = message_id
+       end
+
+       attr_reader :message_id
+     end
+
+     class DeleteMessageBatchResponse
+       def initialize(successful: [], failed: [])
+         @successful = successful
+         @failed = failed
+       end
+
+       attr_reader :successful
+       attr_reader :failed
+
+       Success = Struct.new(:id)
+       Failure = Struct.new(:id, :sender_fault, :code, :message)
+
+       def add_success_for(ent)
+         @successful.push Success.new(ent[:id])
+       end
+
+       def add_failure_for(ent)
+         @failed.push Failure.new(ent[:id], true, '400', 'some reason')
+       end
+     end
+
+     class Message
+       def Message.s3_object_created_event(url)
+         raise "not an S3 URL: #{url.inspect}" unless %r<\As3://\w> =~ url
+         bucket, key = url.sub(%r<s3://>, '').split('/', 2)
+         with_body({
+           eventVersion: '2.0',
+           eventSource: 'aws:s3',
+           awsRegion: 'ap-northeast-1',
+           eventTime: Time.now.iso8601,
+           eventName: 'ObjectCreated:Put',
+           s3: {
+             s3SchemaVersion: '1.0',
+             configurationId: 'TestConfig',
+             bucket: {
+               name: bucket,
+               arn: "arn:aws:s3:::#{bucket}"
+             },
+             object: {
+               key: key,
+               size: 1024
+             }
+           }
+         })
+       end
+
+       @seq = 0
+
+       def Message.new_seq
+         @seq += 1
+         @seq
+       end
+
+       def Message.with_body(body)
+         seq = new_seq
+         new(
+           message_id: "sqs-message-id-#{seq}",
+           receipt_handle: "sqs-receipt-handle-#{seq}",
+           body: body
+         )
+       end
+
+       def initialize(message_id: nil, receipt_handle: nil, body: nil)
+         @message_id = message_id
+         @receipt_handle = receipt_handle
+         @body = body
+         @body_json = { Records: [body] }.to_json
+       end
+
+       attr_reader :message_id
+       attr_reader :receipt_handle
+
+       def body
+         @body_json
+       end
+
+       # for debugging
+       def body_object
+         @body
+       end
+     end
+
+   end # module SQSMock
+
+ end # module Bricolage
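
As a rough usage sketch: the queue: argument is an array of message batches, one batch per ReceiveMessage call, and every client call is recorded in call_history (the bucket and key below are made up):

    ds = Bricolage::SQSDataSource.new_mock(queue: [
      [Bricolage::SQSMock::Message.s3_object_created_event('s3://a-bucket/dir/file.json.gz')]
    ])
    res = ds.client.receive_message           # consumes the first (and only) batch
    res.successful?                           #=> true
    res.messages.size                         #=> 1
    ds.client.receive_message.messages.size   #=> 0, queue drained
    ds.client.call_history.map(&:name)        #=> [:receive_message, :receive_message]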
lib/bricolage/streamingload/dispatcher.rb CHANGED
@@ -1,3 +1,4 @@
+ require 'bricolage/context'
  require 'bricolage/exception'
  require 'bricolage/version'
  require 'bricolage/sqsdatasource'
@@ -15,7 +16,7 @@ module Bricolage

  module StreamingLoad

- class Dispatcher
+ class Dispatcher < SQSDataSource::MessageHandler

    def Dispatcher.main
      opts = DispatcherOptions.new(ARGV)
@@ -54,7 +55,6 @@ module Bricolage

      Process.daemon(true) if opts.daemon?
      create_pid_file opts.pid_file_path if opts.pid_file_path
-     dispatcher.set_dispatch_timer
      dispatcher.event_loop
    end

@@ -82,10 +82,24 @@ module Bricolage
    @dispatch_interval = dispatch_interval
    @dispatch_message_id = nil
    @logger = logger
+   @checkpoint_requested = false
  end

+ attr_reader :logger
+
  def event_loop
-   @event_queue.main_handler_loop(handlers: self, message_class: Event)
+   set_dispatch_timer
+   @event_queue.handle_messages(handler: self, message_class: Event)
+   @event_queue.process_async_delete_force
+   logger.info "shutdown gracefully"
+ end
+
+ # override
+ def after_message_batch
+   @event_queue.process_async_delete
+   if @checkpoint_requested
+     create_checkpoint
+   end
  end

  def handle_shutdown(e)
@@ -94,6 +108,29 @@ module Bricolage
    @event_queue.delete_message(e)
  end

+ def handle_checkpoint(e)
+   # Delay creating the CHECKPOINT until after the current message batch,
+   # because other extra events may already have been received.
+   @checkpoint_requested = true
+   # Delete this event immediately
+   @event_queue.delete_message(e)
+ end
+
+ def create_checkpoint
+   logger.info "*** Creating checkpoint requested ***"
+   logger.info "Force-flushing all objects..."
+   flush_all_tasks_immediately
+   logger.info "All objects flushed; shutting down..."
+   @event_queue.initiate_terminate
+ end
+
+ def flush_all_tasks_immediately
+   tasks = @object_buffer.flush_tasks_force
+   tasks.each do |task|
+     @task_queue.put task
+   end
+ end
+
  def handle_data(e)
    unless e.created?
      @event_queue.delete_message_async(e)
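
For reference, a checkpoint is requested by putting the new event (defined in event.rb below) on the dispatcher's event queue; a sketch, where event_queue stands for the dispatcher's event-queue SQSDataSource:

    # Asks a running dispatcher to flush every buffered object and shut down cleanly.
    event_queue.put Bricolage::StreamingLoad::CheckPointEvent.create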
lib/bricolage/streamingload/event.rb CHANGED
@@ -11,6 +11,7 @@ module Bricolage
  when rec['eventName'] == 'shutdown' then ShutdownEvent
  when rec['eventName'] == 'dispatch' then DispatchEvent
  when rec['eventName'] == 'flush' then FlushEvent
+ when rec['eventName'] == 'checkpoint' then CheckPointEvent
  when rec['eventSource'] == 'aws:s3'
    S3ObjectEvent
  else
@@ -41,7 +42,26 @@ module Bricolage

    alias message_type name

-   def init_message
+   def init_message(dummy: nil)
+   end
+
+ end
+
+
+ # Flushes all tables and shuts down
+ class CheckPointEvent < Event
+
+   def CheckPointEvent.create
+     super name: 'checkpoint'
+   end
+
+   def CheckPointEvent.parse_sqs_record(msg, rec)
+     {}
+   end
+
+   alias message_type name
+
+   def init_message(dummy: nil)
    end

  end
@@ -75,6 +95,7 @@ module Bricolage

  end

+
  class DispatchEvent < Event

    def DispatchEvent.create(delay_seconds:)
@@ -83,8 +104,9 @@ module Bricolage

    alias message_type name

-   def init_message(dummy)
+   def init_message(dummy: nil)
    end
+
  end


lib/bricolage/streamingload/loaderservice.rb CHANGED
@@ -11,7 +11,7 @@ module Bricolage

  module StreamingLoad

- class LoaderService
+ class LoaderService < SQSDataSource::MessageHandler

    def LoaderService.main
      opts = LoaderServiceOptions.new(ARGV)
@@ -76,7 +76,8 @@ module Bricolage
  end

  def event_loop
-   @task_queue.main_handler_loop(handlers: self, message_class: Task)
+   @task_queue.handle_messages(handler: self, message_class: Task)
+   @logger.info "shutdown gracefully"
  end

  def execute_task_by_id(task_id)
@@ -87,6 +88,7 @@ module Bricolage
    @ctl_ds.open {|conn| LoadTask.load(conn, task_id, force: force) }
  end

+ # message handler
  def handle_streaming_load_v3(task)
    # 1. Load task detail from table
    # 2. Skip disabled (sqs message should not have disabled state since it will never be executed)
lib/bricolage/streamingload/objectbuffer.rb CHANGED
@@ -39,6 +39,7 @@ module Bricolage

  end

+
  class ObjectBuffer

    include SQLUtils
@@ -55,7 +56,7 @@ module Bricolage
  end

  def flush_tasks
-   task_ids = []
+   task_ids = nil
    @ctl_ds.open {|conn|
      conn.transaction {|txn|
        task_ids = insert_tasks(conn)
@@ -65,145 +66,192 @@ module Bricolage
    return task_ids.map {|id| LoadTask.create(task_id: id) }
  end

+ # Flushes all objects of all tables immediately, with no
+ # additional conditions, to create a "stream checkpoint".
+ def flush_tasks_force
+   task_ids = []
+   @ctl_ds.open {|conn|
+     conn.transaction {|txn|
+       # insert_task_object_mappings may not consume all saved objects
+       # (e.g. when there are too many objects for one table), so we must
+       # create tasks repeatedly until there are no unassigned objects.
+       until (ids = insert_tasks_force(conn)).empty?
+         insert_task_object_mappings(conn)
+         task_ids.concat ids
+       end
+     }
+   }
+   return task_ids.map {|id| LoadTask.create(task_id: id) }
+ end
+
  private

  def insert_object(conn, obj)
-   #HACK - suppress log per object
-   log_level = @logger.level
-   @logger.level = Logger::ERROR
-   conn.update(<<-EndSQL)
-     insert into strload_objects
-     (object_url
-     , object_size
-     , data_source_id
-     , message_id
-     , event_time
+   suppress_sql_logging {
+     conn.update(<<-EndSQL)
+       insert into strload_objects
+       ( object_url
+       , object_size
+       , data_source_id
+       , message_id
+       , event_time
+       , submit_time
+       )
+       select
+         #{s obj.url}
+         , #{obj.size}
+         , #{s obj.data_source_id}
+         , #{s obj.message_id}
+         , '#{obj.event_time}' AT TIME ZONE 'JST'
+         , current_timestamp
+       from
+         strload_tables
+       where
+         data_source_id = #{s obj.data_source_id}
+       ;
+     EndSQL
+   }
+ end
+
+ def insert_tasks_force(conn)
+   insert_tasks(conn, force: true)
+ end
+
+ def insert_tasks(conn, force: false)
+   task_ids = conn.query_values(<<-EndSQL)
+     insert into strload_tasks
+     ( task_class
+     , schema_name
+     , table_name
      , submit_time
      )
      select
-       #{s obj.url}
-       , #{obj.size}
-       , #{s obj.data_source_id}
-       , #{s obj.message_id}
-       , '#{obj.event_time}' AT TIME ZONE 'JST'
+       'streaming_load_v3'
+       , tbl.schema_name
+       , tbl.table_name
        , current_timestamp
      from
-       strload_tables
+       strload_tables tbl
+
+       -- number of objects not assigned to a task for each schema_name.table_name (> 0)
+       inner join (
+         select
+           data_source_id
+           , count(*) as object_count
+         from
+           (
+             select
+               min(object_id) as object_id
+               , object_url
+             from
+               strload_objects
+             group by
+               object_url
+           ) uniq_objects
+           inner join strload_objects using (object_id)
+           left outer join strload_task_objects using (object_id)
+         where
+           task_id is null -- not assigned to a task
+         group by
+           data_source_id
+       ) obj
+       using (data_source_id)
+
+       -- preceding task's submit time
+       left outer join (
+         select
+           schema_name
+           , table_name
+           , max(submit_time) as latest_submit_time
+         from
+           strload_tasks
+         group by
+           schema_name, table_name
+       ) task
+       using (schema_name, table_name)
      where
-       data_source_id = #{s obj.data_source_id}
+       not tbl.disabled -- not disabled
+       and (
+         #{force ? "true or" : ""} -- Creates tasks with no conditions if forced
+         obj.object_count > tbl.load_batch_size -- batch_size exceeded?
+         or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
+         or latest_submit_time is null -- no previous tasks?
+       )
+     returning task_id
      ;
    EndSQL
-   @logger.level = log_level
- end

- def insert_tasks(conn)
-   vals = conn.query_values(<<-EndSQL)
-     insert into
-       strload_tasks (task_class, schema_name, table_name, submit_time)
-     select
-       'streaming_load_v3'
-       , tbl.schema_name
-       , tbl.table_name
-       , current_timestamp
-     from
-       strload_tables tbl
-       inner join (
-         select
-           data_source_id
-           , count(*) as object_count
-         from (
-           select
-             min(object_id) as object_id
-             , object_url
-           from
-             strload_objects
-           group by
-             object_url
-         ) uniq_objects
-         inner join strload_objects
-         using(object_id)
-         left outer join strload_task_objects
-         using(object_id)
-         where
-           task_id is null -- not assigned to a task
-         group by
-           data_source_id
-       ) obj -- number of objects not assigned to a task per schema_name.table_name (won't return zero)
-       using (data_source_id)
-       left outer join (
-         select
-           schema_name
-           , table_name
-           , max(submit_time) as latest_submit_time
-         from
-           strload_tasks
-         group by
-           schema_name, table_name
-       ) task -- preceeding task's submit time
-       using(schema_name, table_name)
-     where
-       not tbl.disabled -- not disabled
-       and (
-         obj.object_count > tbl.load_batch_size -- batch_size exceeded?
-         or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
-         or latest_submit_time is null -- no last task
-       )
-     returning task_id
-     ;
-   EndSQL
-   @logger.info "Number of task created: #{vals.size}"
-   vals
+   @logger.info "Number of tasks created: #{task_ids.size}"
+   task_ids
  end

  def insert_task_object_mappings(conn)
    conn.update(<<-EndSQL)
-     insert into
-       strload_task_objects
-     select
-       task_id
-       , object_id
-     from (
-       select
-         row_number() over(partition by task.task_id order by obj.object_id) as object_count
-         , task.task_id
-         , obj.object_id
-         , load_batch_size
-       from (
-         select
-           min(object_id) as object_id
-           , object_url
-           , data_source_id
-         from
-           strload_objects
-         group by
-           2, 3
-       ) obj
-       inner join (
-         select
-           min(task_id) as task_id -- oldest task
-           , tbl.data_source_id
-           , max(load_batch_size) as load_batch_size
-         from
-           strload_tasks
-           inner join strload_tables tbl
-           using(schema_name, table_name)
-         where
-           task_id not in (select distinct task_id from strload_task_objects) -- no assigned objects
-         group by
-           2
-       ) task -- tasks without objects
-       using(data_source_id)
-       left outer join strload_task_objects task_obj
-       using(object_id)
-       where
-         task_obj.object_id is null -- not assigned to a task
-     ) as t
-     where
-       object_count <= load_batch_size -- limit number of objects assigned to single task
-     ;
+     insert into strload_task_objects
+     ( task_id
+     , object_id
+     )
+     select
+       task_id
+       , object_id
+     from (
+       select
+         row_number() over (partition by task.task_id order by obj.object_id) as object_count
+         , task.task_id
+         , obj.object_id
+         , load_batch_size
+       from
+         (
+           select
+             data_source_id
+             , object_url
+             , min(object_id) as object_id
+           from
+             strload_objects
+           group by
+             1, 2
+         ) obj
+
+         -- tasks without objects
+         inner join (
+           select
+             tbl.data_source_id
+             , min(task_id) as task_id -- pick up oldest task
+             , max(load_batch_size) as load_batch_size
+           from
+             strload_tasks
+             inner join strload_tables tbl
+             using (schema_name, table_name)
+           where
+             -- unassigned objects
+             task_id not in (select task_id from strload_task_objects)
+           group by
+             1
+         ) task
+         using (data_source_id)
+
+         left outer join strload_task_objects task_obj
+         using (object_id)
+       where
+         task_obj.object_id is null -- unassigned to a task
+     ) as t
+     where
+       object_count <= load_batch_size -- limit number of objects assigned to single task
+     ;
    EndSQL
  end

+ def suppress_sql_logging
+   # KLUDGE: temporarily raise the log level to drop per-object SQL logs
+   orig = @logger.level
+   begin
+     @logger.level = Logger::ERROR
+     yield
+   ensure
+     @logger.level = orig
+   end
+ end
+
  end

  end
lib/bricolage/streamingload/version.rb CHANGED
@@ -1,5 +1,5 @@
  module Bricolage
    module StreamingLoad
-     VERSION = '0.3.0'
+     VERSION = '0.4.0'
    end
  end
test/streamingload/test_dispatcher.rb ADDED
@@ -0,0 +1,111 @@
+ require 'test/unit'
+ require 'bricolage/context'
+ require 'bricolage/sqsdatasource'
+ require 'bricolage/sqsmock'
+ require 'bricolage/streamingload/dispatcher'
+
+ module Bricolage
+   module StreamingLoad
+
+     class TestDispatcher < Test::Unit::TestCase
+
+       test "checkpoint event" do
+         ctx = Context.for_application('.', environment: 'test', logger: NullLogger.new)
+         ctl_ds = ctx.get_data_source('sql', 'dwhctl')
+
+         event_queue = SQSDataSource.new_mock(queue: [
+           # 1st ReceiveMessage
+           [
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0001.json.gz'),
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0002.json.gz'),
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0003.json.gz'),
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0004.json.gz'),
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0005.json.gz')
+           ],
+           # 2nd ReceiveMessage
+           [
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0006.json.gz'),
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0007.json.gz'),
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0008.json.gz'),
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0009.json.gz'),
+             SQSMock::Message.new(body: {eventSource: 'bricolage:system', eventName: 'checkpoint'}),
+             SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0010.json.gz')
+           ]
+         ])
+
+         task_queue = SQSDataSource.new_mock
+
+         object_buffer = ObjectBuffer.new(
+           control_data_source: ctl_ds,
+           logger: ctx.logger
+         )
+
+         url_patterns = URLPatterns.for_config([
+           {
+             "url" => %r<\As3://test-bucket/testschema\.desttable/datafile-\d{4}\.json\.gz>.source,
+             "schema" => 'testschema',
+             "table" => 'desttable'
+           }
+         ])
+
+         dispatcher = Dispatcher.new(
+           event_queue: event_queue,
+           task_queue: task_queue,
+           object_buffer: object_buffer,
+           url_patterns: url_patterns,
+           dispatch_interval: 600,
+           logger: ctx.logger
+         )
+
+         # FIXME: database cleaner
+         ctl_ds.open {|conn|
+           conn.update("truncate strload_tables")
+           conn.update("truncate strload_objects")
+           conn.update("truncate strload_task_objects")
+           conn.update("truncate strload_tasks")
+           conn.update("insert into strload_tables values ('testschema', 'desttable', 'testschema.desttable', 100, 1800, false)")
+         }
+         dispatcher.event_loop
+
+         # Event Queue Call Sequence
+         hst = event_queue.client.call_history
+         assert_equal :send_message, hst[0].name   # start flush timer
+         assert_equal :receive_message, hst[1].name
+         assert_equal :delete_message_batch, hst[2].name
+         assert_equal :receive_message, hst[3].name
+         assert_equal :delete_message, hst[4].name   # delete checkpoint
+         assert_equal :delete_message_batch, hst[5].name
+
+         # Task Queue Call Sequence
+         hst = task_queue.client.call_history
+         assert_equal :send_message, hst[0].name
+         assert(/streaming_load_v3/ =~ hst[0].args[:message_body])
+         task_id = JSON.load(hst[0].args[:message_body])['Records'][0]['taskId'].to_i
+         assert_not_equal 0, task_id
+
+         # Object Buffer
+         assert_equal [], unassigned_objects(ctl_ds)
+         task = ctl_ds.open {|conn| LoadTask.load(conn, task_id) }
+         assert_equal 'testschema', task.schema
+         assert_equal 'desttable', task.table
+         assert_equal 10, task.object_urls.size
+       end
+
+       def unassigned_objects(ctl_ds)
+         ctl_ds.open {|conn|
+           conn.query_values(<<-EndSQL)
+             select
+               object_url
+             from
+               strload_objects
+             where
+               object_id not in (select object_id from strload_task_objects)
+             ;
+           EndSQL
+         }
+       end
+
+     end
+
+   end
+ end
test/test_sqsdatasource.rb CHANGED
@@ -1,111 +1,54 @@
  require 'test/unit'
  require 'bricolage/streamingload/event'
+ require 'bricolage/sqsmock'
  require 'bricolage/logger'

  module Bricolage

    class TestSQSDataSource < Test::Unit::TestCase

-     def new_sqs_ds(mock_client = nil)
-       SQSDataSource.new(
-         url: 'http://sqs/000000000000/queue-name',
-         access_key_id: 'access_key_id_1',
-         secret_access_key: 'secret_access_key_1',
-         visibility_timeout: 30
-       ).tap {|ds|
-         logger = NullLogger.new
-         #logger = Bricolage::Logger.default
-         ds.__send__(:initialize_base, 'name', nil, logger)
-         ds.instance_variable_set(:@client, mock_client) if mock_client
-       }
-     end
-
-     class MockSQSClient
-       def initialize(&block)
-         @handler = block
-       end
-
-       def delete_message_batch(**args)
-         @handler.call(args)
-       end
-     end
-
-     class NullLogger
-       def debug(*args) end
-       def info(*args) end
-       def warn(*args) end
-       def error(*args) end
-       def exception(*args) end
-       def with_elapsed_time(*args) yield end
-       def elapsed_time(*args) yield end
-     end
-
-     def sqs_message(seq)
-       MockSQSMessage.new("message_id_#{seq}", "receipt_handle_#{seq}")
-     end
-
-     MockSQSMessage = Struct.new(:message_id, :receipt_handle)
-
-     class MockSQSResponse
-       def initialize(successful: [], failed: [])
-         @successful = successful
-         @failed = failed
-       end
-
-       attr_reader :successful
-       attr_reader :failed
-
-       Success = Struct.new(:id)
-       Failure = Struct.new(:id, :sender_fault, :code, :message)
-
-       def add_success_for(ent)
-         @successful.push Success.new(ent[:id])
-       end
-
-       def add_failure_for(ent)
-         @failed.push Failure.new(ent[:id], true, '400', 'some reason')
-       end
-     end
-
      test "#delete_message_async" do
-       messages = [sqs_message(0), sqs_message(1), sqs_message(2)]
-       mock = MockSQSClient.new {|args|
-         entries = args[:entries]
-         if entries.size == 3
-           # first time
-           assert_equal messages[0].receipt_handle, entries[0][:receipt_handle]
-           assert_equal messages[1].receipt_handle, entries[1][:receipt_handle]
-           assert_equal messages[2].receipt_handle, entries[2][:receipt_handle]
-           MockSQSResponse.new.tap {|res|
-             res.add_success_for(entries[0])
-             res.add_failure_for(entries[1])
-             res.add_success_for(entries[2])
-           }
-         else
-           # second time
-           MockSQSResponse.new.tap {|res|
-             res.add_success_for(entries[0])
-           }
-         end
+       messages = (0..2).map {|seq|
+         SQSMock::Message.new(message_id: "message_id_#{seq}", receipt_handle: "receipt_handle_#{seq}")
        }
-       ds = new_sqs_ds(mock)
+       ds = SQSDataSource.new_mock(
+         delete_message_batch: -> (queue_url:, entries:) {
+           if entries.size == 3
+             # first time
+             assert_equal messages[0].receipt_handle, entries[0][:receipt_handle]
+             assert_equal messages[1].receipt_handle, entries[1][:receipt_handle]
+             assert_equal messages[2].receipt_handle, entries[2][:receipt_handle]
+             SQSMock::DeleteMessageBatchResponse.new.tap {|res|
+               res.add_success_for(entries[0])
+               res.add_failure_for(entries[1])
+               res.add_success_for(entries[2])
+             }
+           else
+             # second time
+             SQSMock::DeleteMessageBatchResponse.new.tap {|res|
+               res.add_success_for(entries[0])
+             }
+           end
+         }
+       )
+
        ds.delete_message_async(messages[0])
        ds.delete_message_async(messages[1])
        ds.delete_message_async(messages[2])

        # first flush
        flush_time = Time.now
-       ds.delete_message_buffer.flush(flush_time)
-       assert_equal 1, ds.delete_message_buffer.size
-       bufent = ds.delete_message_buffer.instance_variable_get(:@buf).values.first
+       ds.process_async_delete(flush_time)
+       delete_buf = ds.__send__(:delete_message_buffer)
+       bufent = delete_buf.instance_variable_get(:@buf).values.first
        assert_equal 'receipt_handle_1', bufent.message.receipt_handle
        assert_equal 1, bufent.n_failure
        assert_false bufent.issuable?(flush_time)
        assert_true bufent.issuable?(flush_time + 180)

        # second flush
-       ds.delete_message_buffer.flush(flush_time + 180)
-       assert_true ds.delete_message_buffer.empty?
+       ds.process_async_delete(flush_time + 180)
+       assert_true delete_buf.empty?
      end

    end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: bricolage-streamingload
  version: !ruby/object:Gem::Version
-   version: 0.3.0
+   version: 0.4.0
  platform: ruby
  authors:
  - Minero Aoki
@@ -107,9 +107,10 @@ files:
  - README.md
  - bin/bricolage-streaming-dispatcher
  - bin/bricolage-streaming-loader
+ - lib/bricolage/nulllogger.rb
  - lib/bricolage/snsdatasource.rb
  - lib/bricolage/sqsdatasource.rb
- - lib/bricolage/sqswrapper.rb
+ - lib/bricolage/sqsmock.rb
  - lib/bricolage/streamingload/alertinglogger.rb
  - lib/bricolage/streamingload/dispatcher.rb
  - lib/bricolage/streamingload/event.rb
@@ -122,6 +123,7 @@ files:
  - lib/bricolage/streamingload/urlpatterns.rb
  - lib/bricolage/streamingload/version.rb
  - test/all.rb
+ - test/streamingload/test_dispatcher.rb
  - test/streamingload/test_event.rb
  - test/test_sqsdatasource.rb
  homepage: https://github.com/aamine/bricolage-streamingload
lib/bricolage/sqswrapper.rb DELETED
@@ -1,77 +0,0 @@
- require 'json'
-
- module Bricolage
-
-   class SQSClientWrapper
-     def initialize(sqs, logger:)
-       @sqs = sqs
-       @logger = logger
-     end
-
-     def receive_message(**args)
-       @logger.debug "receive_message(#{args.inspect})"
-       @sqs.receive_message(**args)
-     end
-
-     def send_message(**args)
-       @logger.debug "send_message(#{args.inspect})"
-       @sqs.send_message(**args)
-     end
-
-     def delete_message(**args)
-       @logger.debug "delete_message(#{args.inspect})"
-       @sqs.delete_message(**args)
-     end
-
-     def delete_message_batch(**args)
-       @logger.debug "delete_message_batch(#{args.inspect})"
-       @sqs.delete_message_batch(**args)
-     end
-   end
-
-
-   class DummySQSClient
-     def initialize(queue = [])
-       @queue = queue
-     end
-
-     def receive_message(**args)
-       msg_recs = @queue.shift or return EMPTY_RESULT
-       msgs = msg_recs.map {|recs| Message.new({'Records' => recs}.to_json) }
-       Result.new(true, msgs)
-     end
-
-     def send_message(**args)
-       SUCCESS_RESULT
-     end
-
-     def delete_message(**args)
-       SUCCESS_RESULT
-     end
-
-     class Result
-       def initialize(successful, messages = nil)
-         @successful = successful
-         @messages = messages
-       end
-
-       def successful?
-         @successful
-       end
-
-       attr_reader :messages
-     end
-
-     SUCCESS_RESULT = Result.new(true)
-     EMPTY_RESULT = Result.new(true, [])
-
-     class Message
-       def initialize(body)
-         @body = body
-       end
-
-       attr_reader :body
-     end
-   end
-
- end # module Bricolage