bricolage-streamingload 0.3.0 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d81ff86cb9addccb7ca9db4d240218679b1f72f9
-  data.tar.gz: f2ec045c994f1c6b619695f74a13aebcc9318722
+  metadata.gz: 848b815669c6580505119917a72a4e97833064c2
+  data.tar.gz: 1809410699822e2a60a21407cad1b5814551adcd
 SHA512:
-  metadata.gz: a78c3b8f35f8d10cbc3da301667ea811874ccadb7653b02388990d63afc2183ee3d0c4e357dccc7799721e1dcd8e33b88e9b3d66147903c32ff109595bced0f6
-  data.tar.gz: a3632f13d3ea039aa690deca646659e0d89ad636b2af55abf5f7011f54cfecddcef53d57152de513bcbaf5c03671af7a512a10781e7846efc60af9c752b8c364
+  metadata.gz: 84ecbe1e548953cc1f4889eb07e9289101e1ade28e89370dae39462b61d193521ceda25f59d8a0b0f1760b8555cf254abab50cf7ed3d11e70d81c422af2f8b82
+  data.tar.gz: b367bb4faa24e9755bee5dbeaaaf937a317826e8e562466e3264a19563bbfcc877ddba9801ffda4875d21e39feddfcc8e87df81a6433b9cd1c3a2cabcefc0754
lib/bricolage/nulllogger.rb ADDED
@@ -0,0 +1,20 @@
+require 'logger'
+
+module Bricolage
+  # FIXME: should be defined in the Bricolage package
+  class NullLogger
+    def debug(*args) end
+    def debug?() false end
+    def info(*args) end
+    def info?() false end
+    def warn(*args) end
+    def warn?() false end
+    def error(*args) end
+    def error?() false end
+    def exception(*args) end
+    def with_elapsed_time(*args) yield end
+    def elapsed_time(*args) yield end
+    def level() Logger::ERROR end
+    def level=(l) l end
+  end
+end
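Note (not part of the diff): NullLogger responds to the usual Logger calls but silently discards everything; the SQS mock introduced below relies on it to keep test output quiet. A minimal sketch:

    require 'bricolage/nulllogger'

    logger = Bricolage::NullLogger.new
    logger.info "this line is silently dropped"
    logger.debug?   # => false
    logger.level    # => Logger::ERROR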
lib/bricolage/sqsdatasource.rb CHANGED
@@ -1,5 +1,4 @@
 require 'bricolage/datasource'
-require 'bricolage/sqswrapper'
 require 'securerandom'
 require 'aws-sdk'
 require 'json'
@@ -28,6 +27,10 @@ module Bricolage
     attr_reader :access_key_id
     attr_reader :secret_access_key
 
+    attr_reader :visibility_timeout
+    attr_reader :max_number_of_messages
+    attr_reader :wait_time_seconds
+
     def client
       @client ||= begin
         c = @noop ? DummySQSClient.new : Aws::SQS::Client.new(region: @region, access_key_id: @access_key_id, secret_access_key: @secret_access_key)
@@ -39,22 +42,18 @@ module Bricolage
     # High-Level Polling Interface
     #
 
-    def main_handler_loop(handlers:, message_class:)
+    def handle_messages(handler:, message_class:)
       trap_signals
-
-      n_zero = 0
-      until terminating?
-        insert_handler_wait(n_zero)
-        n_msg = handle_messages(handlers: handlers, message_class: message_class)
-        if n_msg == 0
-          n_zero += 1
-        else
-          n_zero = 0
+      polling_loop do
+        result = poll or next true
+        msgs = message_class.for_sqs_result(result)
+        msgs.each do |msg|
+          handler.handle(msg)
         end
-        delete_message_buffer.flush
+        handler.after_message_batch
+        break if terminating?
+        msgs.empty?
       end
-      delete_message_buffer.flush_force
-      logger.info "shutdown gracefully"
     end
 
     def trap_signals
@@ -66,6 +65,7 @@ module Bricolage
         initiate_terminate
       }
     end
+    private :trap_signals
 
     def initiate_terminate
       # No I/O allowed in this method
@@ -76,36 +76,72 @@ module Bricolage
       @terminating
     end
 
-    def insert_handler_wait(n_zero)
-      sec = 2 ** [n_zero, 6].min # max 64s
-      logger.info "queue wait: sleep #{sec}" if n_zero > 0
-      sleep sec
-    end
-
-    def handle_messages(handlers:, message_class:)
-      n_msg = foreach_message(message_class) do |msg|
-        logger.debug "handling message: #{msg.inspect}" if logger.debug?
-        mid = "handle_#{msg.message_type}"
-        # just ignore unknown event to make app migration easy
-        if handlers.respond_to?(mid, true)
-          handlers.__send__(mid, msg)
+    def polling_loop
+      n_failure = 0
+      while true
+        failed = yield
+        if failed
+          n_failure += 1
         else
-          logger.error "unknown SQS message type: #{msg.message_type.inspect} (message-id: #{msg.message_id})"
+          n_failure = 0
         end
+        insert_handler_wait(n_failure)
       end
-      n_msg
     end
+    private :polling_loop
 
-    def foreach_message(message_class, &block)
+    def insert_handler_wait(n_failure)
+      sec = 2 ** [n_failure, 6].min # max 64s
+      logger.info "queue wait: sleep #{sec}" if n_failure > 0
+      sleep sec
+    end
+    private :insert_handler_wait
+
+    def poll
       result = receive_messages()
       unless result and result.successful?
         logger.error "ReceiveMessage failed: #{result ? result.error.message : '(result=nil)'}"
         return nil
       end
-      logger.info "receive #{result.messages.size} messages" unless result.messages.empty?
+      logger.info "receive #{result.messages.size} messages"
-      msgs = message_class.for_sqs_result(result)
-      msgs.each(&block)
+      result
-      msgs.size
+    end
+
+    class MessageHandler
+      # abstract logger()
+
+      def handle(msg)
+        logger.debug "handling message: #{msg.inspect}" if logger.debug?
+        if handleable?(msg)
+          call_handler_method(msg)
+        else
+          handle_unknown(msg)
+        end
+      end
+
+      def handleable?(msg)
+        respond_to?(handler_method(msg), true)
+      end
+
+      def call_handler_method(msg)
+        __send__(handler_method(msg), msg)
+      end
+
+      def handler_method(msg)
+        "handle_#{msg.message_type}".intern
+      end
+
+      # Unknown message handler.
+      # Feel free to override this method.
+      def handle_unknown(msg)
+        # just ignore unknown message to make app migration easy
+        logger.error "unknown message type: #{msg.message_type.inspect} (message-id: #{msg.message_id})"
+      end
+
+      # Called after each message batch (ReceiveMessage) is processed.
+      # Override this method in subclasses on demand.
+      def after_message_batch
+      end
     end
 
     #
@@ -122,6 +158,18 @@ module Bricolage
       result
     end
 
+    def put(msg)
+      send_message(msg)
+    end
+
+    def send_message(msg)
+      client.send_message(
+        queue_url: @url,
+        message_body: { 'Records' => [msg.body] }.to_json,
+        delay_seconds: msg.delay_seconds
+      )
+    end
+
     def delete_message(msg)
       client.delete_message(
         queue_url: @url,
@@ -133,20 +181,18 @@ module Bricolage
       delete_message_buffer.put(msg)
     end
 
-    def delete_message_buffer
-      @delete_message_buffer ||= DeleteMessageBuffer.new(client, @url, logger)
+    def process_async_delete(now = Time.now)
+      delete_message_buffer.flush(now)
     end
 
-    def put(msg)
-      send_message(msg)
+    def process_async_delete_force
+      delete_message_buffer.flush_force
     end
 
-    def send_message(msg)
-      client.send_message(
-        queue_url: @url,
-        message_body: { 'Records' => [msg.body] }.to_json,
-        delay_seconds: msg.delay_seconds
-      )
+    private
+
+    def delete_message_buffer
+      @delete_message_buffer ||= DeleteMessageBuffer.new(client, @url, logger)
     end
 
     class DeleteMessageBuffer
@@ -256,6 +302,36 @@ module Bricolage
   end # class SQSDataSource
 
 
+  class SQSClientWrapper
+
+    def initialize(sqs, logger:)
+      @sqs = sqs
+      @logger = logger
+    end
+
+    def receive_message(**args)
+      @logger.debug "receive_message(#{args.inspect})"
+      @sqs.receive_message(**args)
+    end
+
+    def send_message(**args)
+      @logger.debug "send_message(#{args.inspect})"
+      @sqs.send_message(**args)
+    end
+
+    def delete_message(**args)
+      @logger.debug "delete_message(#{args.inspect})"
+      @sqs.delete_message(**args)
+    end
+
+    def delete_message_batch(**args)
+      @logger.debug "delete_message_batch(#{args.inspect})"
+      @sqs.delete_message_batch(**args)
+    end
+
+  end # class SQSClientWrapper
+
+
   class SQSMessage
 
     SQS_EVENT_SOURCE = 'bricolage:system'
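Note (illustrative sketch, not part of the diff): the refactored polling interface replaces main_handler_loop(handlers:, ...) with handle_messages(handler:, ...) plus the new SQSDataSource::MessageHandler base class. A handler subclass defines handle_<message_type> methods; unknown types fall through to handle_unknown, and after_message_batch runs once per ReceiveMessage batch. The MyHandler and MyEvent names below are hypothetical:

    class MyHandler < Bricolage::SQSDataSource::MessageHandler
      def initialize(logger)
        @logger = logger
      end

      attr_reader :logger   # MessageHandler expects a #logger method

      # invoked for messages whose message_type is "data"
      def handle_data(msg)
        logger.info "handling: #{msg.inspect}"
      end

      # invoked once after each ReceiveMessage batch
      def after_message_batch
        # e.g. flush asynchronously buffered deletes here
      end
    end

    # ds is an SQSDataSource; MyEvent must implement .for_sqs_result(result)
    ds.handle_messages(handler: MyHandler.new(ds.logger), message_class: MyEvent)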
lib/bricolage/sqsmock.rb ADDED
@@ -0,0 +1,194 @@
+require 'bricolage/sqsdatasource'
+require 'bricolage/nulllogger'
+require 'json'
+
+module Bricolage
+
+  def SQSDataSource.new_mock(**args)
+    SQSDataSource.new(
+      url: 'http://sqs/000000000000/queue-name',
+      access_key_id: 'access_key_id_1',
+      secret_access_key: 'secret_access_key_1',
+      visibility_timeout: 30
+    ).tap {|ds|
+      logger = NullLogger.new
+      #logger = Bricolage::Logger.default
+      ds.__send__(:initialize_base, 'name', nil, logger)
+      ds.instance_variable_set(:@client, SQSMock::Client.new(**args))
+    }
+  end
+
+  module SQSMock
+
+    class Client
+      def initialize(queue: [], receive_message: nil, send_message: nil, delete_message: nil, delete_message_batch: nil)
+        @queue = queue # [[record]]
+        @call_history = []
+
+        @receive_message = receive_message || lambda {|**args|
+          msgs = @queue.shift or break ReceiveMessageResponse.successful([])
+          ReceiveMessageResponse.successful(msgs)
+        }
+
+        @send_message = send_message || lambda {|**args|
+          SendMessageResponse.successful
+        }
+
+        @delete_message = delete_message || lambda {|**args|
+          Response.successful
+        }
+
+        @delete_message_batch = delete_message_batch || lambda {|queue_url:, entries:|
+          # Returns success for all requests by default.
+          DeleteMessageBatchResponse.new.tap {|res|
+            entries.each do |ent|
+              res.add_success_for(ent)
+            end
+          }
+        }
+      end
+
+      # Free free to modify this array contents
+      attr_reader :call_history
+
+      def self.def_mock_method(name)
+        define_method(name) {|**args|
+          @call_history.push CallHistory.new(name.intern, args)
+          instance_variable_get("@#{name}").(**args)
+        }
+      end
+
+      def_mock_method :receive_message
+      def_mock_method :send_message
+      def_mock_method :delete_message
+      def_mock_method :delete_message_batch
+    end
+
+    CallHistory = Struct.new(:name, :args)
+
+    # success/failure only result
+    class Response
+      def Response.successful
+        new(successful: true)
+      end
+
+      def initialize(successful:)
+        @successful = successful
+      end
+
+      def successful?
+        @successful
+      end
+    end
+
+    class ReceiveMessageResponse < Response
+      def ReceiveMessageResponse.successful(msgs)
+        new(successful: true, messages: msgs)
+      end
+
+      def initialize(successful:, messages:)
+        super(successful: successful)
+        @messages = messages
+      end
+
+      attr_reader :messages
+    end
+
+    class SendMessageResponse < Response
+      def SendMessageResponse.successful
+        new(successful: true, message_id: "sqs-sent-message-id-#{Message.new_seq}")
+      end
+
+      def initialize(successful:, message_id:)
+        super(successful: successful)
+        @message_id = message_id
+      end
+
+      attr_reader :message_id
+    end
+
+    class DeleteMessageBatchResponse
+      def initialize(successful: [], failed: [])
+        @successful = successful
+        @failed = failed
+      end
+
+      attr_reader :successful
+      attr_reader :failed
+
+      Success = Struct.new(:id)
+      Failure = Struct.new(:id, :sender_fault, :code, :message)
+
+      def add_success_for(ent)
+        @successful.push Success.new(ent[:id])
+      end
+
+      def add_failure_for(ent)
+        @failed.push Failure.new(ent[:id], true, '400', 'some reason')
+      end
+    end
+
+    class Message
+      def Message.s3_object_created_event(url)
+        raise "is not a S3 URL: #{url.inspect}" unless %r<\As3://\w> =~ url
+        bucket, key = url.sub(%r<s3://>, '').split('/', 2)
+        with_body({
+          eventVersion: '2.0',
+          eventSource: 'aws:s3',
+          awsRegion: 'ap-northeast-1',
+          eventTime: Time.now.iso8601,
+          eventName: 'ObjectCreated:Put',
+          s3: {
+            s3SchemaVersion: '1.0',
+            configurationId: 'TestConfig',
+            bucket: {
+              name: bucket,
+              arn: "arn:aws:s3:::#{bucket}"
+            },
+            object: {
+              key: key,
+              size: 1024
+            }
+          }
+        })
+      end
+
+      @seq = 0
+
+      def Message.new_seq
+        @seq += 1
+        @seq
+      end
+
+      def Message.with_body(body)
+        seq = new_seq
+        new(
+          message_id: "sqs-message-id-#{seq}",
+          receipt_handle: "sqs-receipt-handle-#{seq}",
+          body: body
+        )
+      end
+
+      def initialize(message_id: nil, receipt_handle: nil, body: nil)
+        @message_id = message_id
+        @receipt_handle = receipt_handle
+        @body = body
+        @body_json = { Records: [body] }.to_json
+      end
+
+      attr_reader :message_id
+      attr_reader :receipt_handle
+
+      def body
+        @body_json
+      end
+
+      # for debug
+      def body_object
+        @body
+      end
+    end
+
+  end # module SQSMock
+
+end # module Bricolage
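Note (usage sketch, not part of the diff; the bucket and key are made up): SQSDataSource.new_mock builds a data source whose client is an SQSMock::Client, so a test can enqueue canned ReceiveMessage batches and then inspect call_history:

    require 'time'                 # Time#iso8601, used by the S3 event helper
    require 'bricolage/sqsmock'

    ds = Bricolage::SQSDataSource.new_mock(queue: [
      # one ReceiveMessage batch with a single S3 "ObjectCreated" event
      [Bricolage::SQSMock::Message.s3_object_created_event('s3://example-bucket/some/key.json.gz')]
    ])
    res = ds.client.receive_message(queue_url: 'dummy')
    res.successful?                  # => true
    res.messages.size                # => 1
    ds.client.call_history[0].name   # => :receive_message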
lib/bricolage/streamingload/dispatcher.rb CHANGED
@@ -1,3 +1,4 @@
+require 'bricolage/context'
 require 'bricolage/exception'
 require 'bricolage/version'
 require 'bricolage/sqsdatasource'
@@ -15,7 +16,7 @@ module Bricolage
 
   module StreamingLoad
 
-    class Dispatcher
+    class Dispatcher < SQSDataSource::MessageHandler
 
      def Dispatcher.main
        opts = DispatcherOptions.new(ARGV)
@@ -54,7 +55,6 @@ module Bricolage
 
        Process.daemon(true) if opts.daemon?
        create_pid_file opts.pid_file_path if opts.pid_file_path
-        dispatcher.set_dispatch_timer
        dispatcher.event_loop
      end
 
@@ -82,10 +82,24 @@ module Bricolage
        @dispatch_interval = dispatch_interval
        @dispatch_message_id = nil
        @logger = logger
+        @checkpoint_requested = false
      end
 
+      attr_reader :logger
+
      def event_loop
-        @event_queue.main_handler_loop(handlers: self, message_class: Event)
+        set_dispatch_timer
+        @event_queue.handle_messages(handler: self, message_class: Event)
+        @event_queue.process_async_delete_force
+        logger.info "shutdown gracefully"
+      end
+
+      # override
+      def after_message_batch
+        @event_queue.process_async_delete
+        if @checkpoint_requested
+          create_checkpoint
+        end
      end
 
      def handle_shutdown(e)
@@ -94,6 +108,29 @@ module Bricolage
        @event_queue.delete_message(e)
      end
 
+      def handle_checkpoint(e)
+        # Delay creating CHECKPOINT after the current message batch,
+        # because any other extra events are already received.
+        @checkpoint_requested = true
+        # Delete this event immediately
+        @event_queue.delete_message(e)
+      end
+
+      def create_checkpoint
+        logger.info "*** Creating checkpoint requested ***"
+        logger.info "Force-flushing all objects..."
+        flush_all_tasks_immediately
+        logger.info "All objects flushed; shutting down..."
+        @event_queue.initiate_terminate
+      end
+
+      def flush_all_tasks_immediately
+        tasks = @object_buffer.flush_tasks_force
+        tasks.each do |task|
+          @task_queue.put task
+        end
+      end
+
      def handle_data(e)
        unless e.created?
          @event_queue.delete_message_async(e)
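Note (for reference, not part of the diff): the new checkpoint flow is driven by an SQS message whose record has eventSource "bricolage:system" and eventName "checkpoint" (see the CheckPointEvent added to event.rb below). handle_checkpoint only marks @checkpoint_requested and deletes the message; after_message_batch then calls create_checkpoint, which force-flushes every buffered object into load tasks and initiates shutdown. The on-the-wire body, using the same 'Records' envelope that SQSDataSource#send_message produces, looks roughly like this hypothetical snippet:

    require 'json'

    # The record the dispatcher reacts to, wrapped in the usual "Records" envelope.
    checkpoint_body = {
      'Records' => [
        { 'eventSource' => 'bricolage:system', 'eventName' => 'checkpoint' }
      ]
    }.to_json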
lib/bricolage/streamingload/event.rb CHANGED
@@ -11,6 +11,7 @@ module Bricolage
        when rec['eventName'] == 'shutdown' then ShutdownEvent
        when rec['eventName'] == 'dispatch' then DispatchEvent
        when rec['eventName'] == 'flush' then FlushEvent
+        when rec['eventName'] == 'checkpoint' then CheckPointEvent
        when rec['eventSource'] == 'aws:s3'
          S3ObjectEvent
        else
@@ -41,7 +42,26 @@ module Bricolage
 
      alias message_type name
 
-      def init_message
+      def init_message(dummy: nil)
+      end
+
+    end
+
+
+    # Flushes all tables and shutdown
+    class CheckPointEvent < Event
+
+      def CheckPointEvent.create
+        super name: 'checkpoint'
+      end
+
+      def CheckPointEvent.parse_sqs_record(msg, rec)
+        {}
+      end
+
+      alias message_type name
+
+      def init_message(dummy: nil)
      end
 
    end
@@ -75,6 +95,7 @@ module Bricolage
 
    end
 
+
    class DispatchEvent < Event
 
      def DispatchEvent.create(delay_seconds:)
@@ -83,8 +104,9 @@ module Bricolage
 
      alias message_type name
 
-      def init_message(dummy)
+      def init_message(dummy: nil)
      end
+
    end
 
 
lib/bricolage/streamingload/loaderservice.rb CHANGED
@@ -11,7 +11,7 @@ module Bricolage
 
  module StreamingLoad
 
-    class LoaderService
+    class LoaderService < SQSDataSource::MessageHandler
 
      def LoaderService.main
        opts = LoaderServiceOptions.new(ARGV)
@@ -76,7 +76,8 @@ module Bricolage
      end
 
      def event_loop
-        @task_queue.main_handler_loop(handlers: self, message_class: Task)
+        @task_queue.handle_messages(handler: self, message_class: Task)
+        @logger.info "shutdown gracefully"
      end
 
      def execute_task_by_id(task_id)
@@ -87,6 +88,7 @@ module Bricolage
        @ctl_ds.open {|conn| LoadTask.load(conn, task_id, force: force) }
      end
 
+      # message handler
      def handle_streaming_load_v3(task)
        # 1. Load task detail from table
        # 2. Skip disabled (sqs message should not have disabled state since it will never be exectuted)
lib/bricolage/streamingload/objectbuffer.rb CHANGED
@@ -39,6 +39,7 @@ module Bricolage
 
    end
 
+
    class ObjectBuffer
 
      include SQLUtils
@@ -55,7 +56,7 @@ module Bricolage
      end
 
      def flush_tasks
-        task_ids = []
+        task_ids = nil
        @ctl_ds.open {|conn|
          conn.transaction {|txn|
            task_ids = insert_tasks(conn)
@@ -65,145 +66,192 @@ module Bricolage
        return task_ids.map {|id| LoadTask.create(task_id: id) }
      end
 
+      # Flushes all objects of all tables immediately with no
+      # additional conditions, to create "stream checkpoint".
+      def flush_tasks_force
+        task_ids = []
+        @ctl_ds.open {|conn|
+          conn.transaction {|txn|
+            # insert_task_object_mappings may not consume all saved objects
+            # (e.g. there are too many objects for one table), we must create
+            # tasks repeatedly until there are no unassigned objects.
+            until (ids = insert_tasks_force(conn)).empty?
+              insert_task_object_mappings(conn)
+              task_ids.concat ids
+            end
+          }
+        }
+        return task_ids.map {|id| LoadTask.create(task_id: id) }
+      end
+
      private
 
      def insert_object(conn, obj)
-        #HACK - suppress log per object
-        log_level = @logger.level
-        @logger.level = Logger::ERROR
-        conn.update(<<-EndSQL)
-          insert into strload_objects
-              (object_url
-              , object_size
-              , data_source_id
-              , message_id
-              , event_time
+        suppress_sql_logging {
+          conn.update(<<-EndSQL)
+            insert into strload_objects
+                ( object_url
+                , object_size
+                , data_source_id
+                , message_id
+                , event_time
+                , submit_time
+                )
+            select
+                #{s obj.url}
+                , #{obj.size}
+                , #{s obj.data_source_id}
+                , #{s obj.message_id}
+                , '#{obj.event_time}' AT TIME ZONE 'JST'
+                , current_timestamp
+            from
+                strload_tables
+            where
+                data_source_id = #{s obj.data_source_id}
+            ;
+          EndSQL
+        }
+      end
+
+      def insert_tasks_force(conn)
+        insert_tasks(conn, force: true)
+      end
+
+      def insert_tasks(conn, force: false)
+        task_ids = conn.query_values(<<-EndSQL)
+          insert into strload_tasks
+              ( task_class
+              , schema_name
+              , table_name
              , submit_time
              )
          select
-              #{s obj.url}
-              , #{obj.size}
-              , #{s obj.data_source_id}
-              , #{s obj.message_id}
-              , '#{obj.event_time}' AT TIME ZONE 'JST'
+              'streaming_load_v3'
+              , tbl.schema_name
+              , tbl.table_name
              , current_timestamp
          from
-              strload_tables
+              strload_tables tbl
+
+              -- number of objects not assigned to a task for each schema_name.table_name (> 0)
+              inner join (
+                  select
+                      data_source_id
+                      , count(*) as object_count
+                  from
+                      (
+                          select
+                              min(object_id) as object_id
+                              , object_url
+                          from
+                              strload_objects
+                          group by
+                              object_url
+                      ) uniq_objects
+                      inner join strload_objects using (object_id)
+                      left outer join strload_task_objects using (object_id)
+                  where
+                      task_id is null -- not assigned to a task
+                  group by
+                      data_source_id
+              ) obj
+              using (data_source_id)
+
+              -- preceeding task's submit time
+              left outer join (
+                  select
+                      schema_name
+                      , table_name
+                      , max(submit_time) as latest_submit_time
+                  from
+                      strload_tasks
+                  group by
+                      schema_name, table_name
+              ) task
+              using (schema_name, table_name)
          where
-              data_source_id = #{s obj.data_source_id}
+              not tbl.disabled -- not disabled
+              and (
+                #{force ? "true or" : ""} -- Creates tasks with no conditions if forced
+                obj.object_count > tbl.load_batch_size -- batch_size exceeded?
+                or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
+                or latest_submit_time is null -- no previous tasks?
+              )
+          returning task_id
          ;
          EndSQL
-        @logger.level = log_level
-      end
 
-      def insert_tasks(conn)
-        vals = conn.query_values(<<-EndSQL)
-          insert into
-              strload_tasks (task_class, schema_name, table_name, submit_time)
-          select
-              'streaming_load_v3'
-              , tbl.schema_name
-              , tbl.table_name
-              , current_timestamp
-          from
-              strload_tables tbl
-              inner join (
-                select
-                    data_source_id
-                    , count(*) as object_count
-                from (
-                  select
-                      min(object_id) as object_id
-                      , object_url
-                  from
-                      strload_objects
-                  group by
-                      object_url
-                ) uniq_objects
-                inner join strload_objects
-                using(object_id)
-                left outer join strload_task_objects
-                using(object_id)
-                where
-                    task_id is null -- not assigned to a task
-                group by
-                    data_source_id
-              ) obj -- number of objects not assigned to a task per schema_name.table_name (won't return zero)
-              using (data_source_id)
-              left outer join (
-                select
-                    schema_name
-                    , table_name
-                    , max(submit_time) as latest_submit_time
-                from
-                    strload_tasks
-                group by
-                    schema_name, table_name
-              ) task -- preceeding task's submit time
-              using(schema_name, table_name)
-          where
-              not tbl.disabled -- not disabled
-              and (
-                obj.object_count > tbl.load_batch_size -- batch_size exceeded?
-                or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
-                or latest_submit_time is null -- no last task
-              )
-          returning task_id
-          ;
-          EndSQL
-        @logger.info "Number of task created: #{vals.size}"
-        vals
+        @logger.info "Number of task created: #{task_ids.size}"
+        task_ids
      end
 
      def insert_task_object_mappings(conn)
        conn.update(<<-EndSQL)
-          insert into
-              strload_task_objects
-          select
-              task_id
-              , object_id
-          from (
-            select
-                row_number() over(partition by task.task_id order by obj.object_id) as object_count
-                , task.task_id
-                , obj.object_id
-                , load_batch_size
-            from (
-              select
-                  min(object_id) as object_id
-                  , object_url
-                  , data_source_id
-              from
-                  strload_objects
-              group by
-                  2, 3
-            ) obj
-            inner join (
-              select
-                  min(task_id) as task_id -- oldest task
-                  , tbl.data_source_id
-                  , max(load_batch_size) as load_batch_size
-              from
-                  strload_tasks
-                  inner join strload_tables tbl
-                  using(schema_name, table_name)
-              where
-                  task_id not in (select distinct task_id from strload_task_objects) -- no assigned objects
-              group by
-                  2
-            ) task -- tasks without objects
-            using(data_source_id)
-            left outer join strload_task_objects task_obj
-            using(object_id)
-            where
-              task_obj.object_id is null -- not assigned to a task
-          ) as t
-          where
-              object_count <= load_batch_size -- limit number of objects assigned to single task
-          ;
+          insert into strload_task_objects
+              ( task_id
+              , object_id
+              )
+          select
+              task_id
+              , object_id
+          from (
+            select
+                row_number() over (partition by task.task_id order by obj.object_id) as object_count
+                , task.task_id
+                , obj.object_id
+                , load_batch_size
+            from
+              (
+                select
+                    data_source_id
+                    , object_url
+                    , min(object_id) as object_id
+                from
+                    strload_objects
+                group by
+                    1, 2
+              ) obj
+
+              -- tasks without objects
+              inner join (
+                select
+                    tbl.data_source_id
+                    , min(task_id) as task_id -- pick up oldest task
+                    , max(load_batch_size) as load_batch_size
+                from
+                    strload_tasks
+                    inner join strload_tables tbl
+                    using (schema_name, table_name)
+                where
+                    -- unassigned objects
+                    task_id not in (select task_id from strload_task_objects)
+                group by
+                    1
+              ) task
+              using (data_source_id)
+
+              left outer join strload_task_objects task_obj
+              using (object_id)
+          where
+              task_obj.object_id is null -- unassigned to a task
+          ) as t
+          where
+              object_count <= load_batch_size -- limit number of objects assigned to single task
+          ;
        EndSQL
      end
 
+      def suppress_sql_logging
+        # CLUDGE
+        orig = @logger.level
+        begin
+          @logger.level = Logger::ERROR
+          yield
+        ensure
+          @logger.level = orig
+        end
+      end
+
    end
 
  end
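Note (illustrative, not part of the diff): flush_tasks creates tasks only when the load_batch_size or load_interval conditions are met, while flush_tasks_force, used by the dispatcher's checkpoint path, keeps creating tasks until no unassigned object remains. The dispatcher then forwards every returned task to the task queue:

    # Sketch of the checkpoint path through ObjectBuffer (names as in the diff).
    tasks = object_buffer.flush_tasks_force     # assign all buffered objects to tasks
    tasks.each {|task| task_queue.put task }    # enqueue a load task per table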
lib/bricolage/streamingload/version.rb CHANGED
@@ -1,5 +1,5 @@
 module Bricolage
   module StreamingLoad
-    VERSION = '0.3.0'
+    VERSION = '0.4.0'
   end
 end
test/streamingload/test_dispatcher.rb ADDED
@@ -0,0 +1,111 @@
+require 'test/unit'
+require 'bricolage/context'
+require 'bricolage/sqsdatasource'
+require 'bricolage/sqsmock'
+require 'bricolage/streamingload/dispatcher'
+
+module Bricolage
+  module StreamingLoad
+
+    class TestDispatcher < Test::Unit::TestCase
+
+      test "checkpoint event" do
+        ctx = Context.for_application('.', environment: 'test', logger: NullLogger.new)
+        ctl_ds = ctx.get_data_source('sql', 'dwhctl')
+
+        event_queue = SQSDataSource.new_mock(queue: [
+          # 1st ReceiveMessage
+          [
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0001.json.gz'),
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0002.json.gz'),
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0003.json.gz'),
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0004.json.gz'),
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0005.json.gz')
+          ],
+          # 2nd ReceiveMessage
+          [
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0006.json.gz'),
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0007.json.gz'),
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0008.json.gz'),
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0009.json.gz'),
+            SQSMock::Message.new(body: {eventSource: 'bricolage:system', eventName: 'checkpoint'}),
+            SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.desttable/datafile-0010.json.gz')
+          ]
+        ])
+
+        task_queue = SQSDataSource.new_mock
+
+        object_buffer = ObjectBuffer.new(
+          control_data_source: ctl_ds,
+          logger: ctx.logger
+        )
+
+        url_patterns = URLPatterns.for_config([
+          {
+            "url" => %r<\As3://test-bucket/testschema\.desttable/datafile-\d{4}\.json\.gz>.source,
+            "schema" => 'testschema',
+            "table" => 'desttable'
+          }
+        ])
+
+        dispatcher = Dispatcher.new(
+          event_queue: event_queue,
+          task_queue: task_queue,
+          object_buffer: object_buffer,
+          url_patterns: url_patterns,
+          dispatch_interval: 600,
+          logger: ctx.logger
+        )
+
+        # FIXME: database cleaner
+        ctl_ds.open {|conn|
+          conn.update("truncate strload_tables")
+          conn.update("truncate strload_objects")
+          conn.update("truncate strload_task_objects")
+          conn.update("truncate strload_tasks")
+          conn.update("insert into strload_tables values ('testschema', 'desttable', 'testschema.desttable', 100, 1800, false)")
+        }
+        dispatcher.event_loop
+
+        # Event Queue Call Sequence
+        hst = event_queue.client.call_history
+        assert_equal :send_message, hst[0].name # start flush timer
+        assert_equal :receive_message, hst[1].name
+        assert_equal :delete_message_batch, hst[2].name
+        assert_equal :receive_message, hst[3].name
+        assert_equal :delete_message, hst[4].name # delete checkpoint
+        assert_equal :delete_message_batch, hst[5].name
+
+        # Task Queue Call Sequence
+        hst = task_queue.client.call_history
+        assert_equal :send_message, hst[0].name
+        assert(/streaming_load_v3/ =~ hst[0].args[:message_body])
+        task_id = JSON.load(hst[0].args[:message_body])['Records'][0]['taskId'].to_i
+        assert_not_equal 0, task_id
+
+        # Object Buffer
+        assert_equal [], unassigned_objects(ctl_ds)
+        task = ctl_ds.open {|conn| LoadTask.load(conn, task_id) }
+        assert_equal 'testschema', task.schema
+        assert_equal 'desttable', task.table
+        assert_equal 10, task.object_urls.size
+      end
+
+      def unassigned_objects(ctl_ds)
+        ctl_ds.open {|conn|
+          conn.query_values(<<-EndSQL)
+            select
+                object_url
+            from
+                strload_objects
+            where
+                object_id not in (select object_id from strload_task_objects)
+            ;
+          EndSQL
+        }
+      end
+
+    end
+
+  end
+end
test/test_sqsdatasource.rb CHANGED
@@ -1,111 +1,54 @@
 require 'test/unit'
 require 'bricolage/streamingload/event'
+require 'bricolage/sqsmock'
 require 'bricolage/logger'
 
 module Bricolage
 
   class TestSQSDataSource < Test::Unit::TestCase
 
-    def new_sqs_ds(mock_client = nil)
-      SQSDataSource.new(
-        url: 'http://sqs/000000000000/queue-name',
-        access_key_id: 'access_key_id_1',
-        secret_access_key: 'secret_access_key_1',
-        visibility_timeout: 30
-      ).tap {|ds|
-        logger = NullLogger.new
-        #logger = Bricolage::Logger.default
-        ds.__send__(:initialize_base, 'name', nil, logger)
-        ds.instance_variable_set(:@client, mock_client) if mock_client
-      }
-    end
-
-    class MockSQSClient
-      def initialize(&block)
-        @handler = block
-      end
-
-      def delete_message_batch(**args)
-        @handler.call(args)
-      end
-    end
-
-    class NullLogger
-      def debug(*args) end
-      def info(*args) end
-      def warn(*args) end
-      def error(*args) end
-      def exception(*args) end
-      def with_elapsed_time(*args) yield end
-      def elapsed_time(*args) yield end
-    end
-
-    def sqs_message(seq)
-      MockSQSMessage.new("message_id_#{seq}", "receipt_handle_#{seq}")
-    end
-
-    MockSQSMessage = Struct.new(:message_id, :receipt_handle)
-
-    class MockSQSResponse
-      def initialize(successful: [], failed: [])
-        @successful = successful
-        @failed = failed
-      end
-
-      attr_reader :successful
-      attr_reader :failed
-
-      Success = Struct.new(:id)
-      Failure = Struct.new(:id, :sender_fault, :code, :message)
-
-      def add_success_for(ent)
-        @successful.push Success.new(ent[:id])
-      end
-
-      def add_failure_for(ent)
-        @failed.push Failure.new(ent[:id], true, '400', 'some reason')
-      end
-    end
-
     test "#delete_message_async" do
-      messages = [sqs_message(0), sqs_message(1), sqs_message(2)]
-      mock = MockSQSClient.new {|args|
-        entries = args[:entries]
-        if entries.size == 3
-          # first time
-          assert_equal messages[0].receipt_handle, entries[0][:receipt_handle]
-          assert_equal messages[1].receipt_handle, entries[1][:receipt_handle]
-          assert_equal messages[2].receipt_handle, entries[2][:receipt_handle]
-          MockSQSResponse.new.tap {|res|
-            res.add_success_for(entries[0])
-            res.add_failure_for(entries[1])
-            res.add_success_for(entries[2])
-          }
-        else
-          # second time
-          MockSQSResponse.new.tap {|res|
-            res.add_success_for(entries[0])
-          }
-        end
+      messages = (0..2).map {|seq|
+        SQSMock::Message.new(message_id: "message_id_#{seq}", receipt_handle: "receipt_handle_#{seq}")
      }
-      ds = new_sqs_ds(mock)
+      ds = SQSDataSource.new_mock(
+        delete_message_batch: -> (queue_url:, entries:) {
+          if entries.size == 3
+            # first time
+            assert_equal messages[0].receipt_handle, entries[0][:receipt_handle]
+            assert_equal messages[1].receipt_handle, entries[1][:receipt_handle]
+            assert_equal messages[2].receipt_handle, entries[2][:receipt_handle]
+            SQSMock::DeleteMessageBatchResponse.new.tap {|res|
+              res.add_success_for(entries[0])
+              res.add_failure_for(entries[1])
+              res.add_success_for(entries[2])
+            }
+          else
+            # second time
+            SQSMock::DeleteMessageBatchResponse.new.tap {|res|
+              res.add_success_for(entries[0])
+            }
+          end
+        }
+      )
+
      ds.delete_message_async(messages[0])
      ds.delete_message_async(messages[1])
      ds.delete_message_async(messages[2])
 
      # first flush
      flush_time = Time.now
-      ds.delete_message_buffer.flush(flush_time)
-      assert_equal 1, ds.delete_message_buffer.size
-      bufent = ds.delete_message_buffer.instance_variable_get(:@buf).values.first
+      ds.process_async_delete(flush_time)
+      delete_buf = ds.__send__(:delete_message_buffer)
+      bufent = delete_buf.instance_variable_get(:@buf).values.first
      assert_equal 'receipt_handle_1', bufent.message.receipt_handle
      assert_equal 1, bufent.n_failure
      assert_false bufent.issuable?(flush_time)
      assert_true bufent.issuable?(flush_time + 180)
 
      # second flush
-      ds.delete_message_buffer.flush(flush_time + 180)
-      assert_true ds.delete_message_buffer.empty?
+      ds.process_async_delete(flush_time + 180)
+      assert_true delete_buf.empty?
    end
 
  end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -107,9 +107,10 @@ files:
 - README.md
 - bin/bricolage-streaming-dispatcher
 - bin/bricolage-streaming-loader
+- lib/bricolage/nulllogger.rb
 - lib/bricolage/snsdatasource.rb
 - lib/bricolage/sqsdatasource.rb
-- lib/bricolage/sqswrapper.rb
+- lib/bricolage/sqsmock.rb
 - lib/bricolage/streamingload/alertinglogger.rb
 - lib/bricolage/streamingload/dispatcher.rb
 - lib/bricolage/streamingload/event.rb
@@ -122,6 +123,7 @@ files:
 - lib/bricolage/streamingload/urlpatterns.rb
 - lib/bricolage/streamingload/version.rb
 - test/all.rb
+- test/streamingload/test_dispatcher.rb
 - test/streamingload/test_event.rb
 - test/test_sqsdatasource.rb
 homepage: https://github.com/aamine/bricolage-streamingload
lib/bricolage/sqswrapper.rb DELETED
@@ -1,77 +0,0 @@
-require 'json'
-
-module Bricolage
-
-  class SQSClientWrapper
-    def initialize(sqs, logger:)
-      @sqs = sqs
-      @logger = logger
-    end
-
-    def receive_message(**args)
-      @logger.debug "receive_message(#{args.inspect})"
-      @sqs.receive_message(**args)
-    end
-
-    def send_message(**args)
-      @logger.debug "send_message(#{args.inspect})"
-      @sqs.send_message(**args)
-    end
-
-    def delete_message(**args)
-      @logger.debug "delete_message(#{args.inspect})"
-      @sqs.delete_message(**args)
-    end
-
-    def delete_message_batch(**args)
-      @logger.debug "delete_message_batch(#{args.inspect})"
-      @sqs.delete_message_batch(**args)
-    end
-  end
-
-
-  class DummySQSClient
-    def initialize(queue = [])
-      @queue = queue
-    end
-
-    def receive_message(**args)
-      msg_recs = @queue.shift or return EMPTY_RESULT
-      msgs = msg_recs.map {|recs| Message.new({'Records' => recs}.to_json) }
-      Result.new(true, msgs)
-    end
-
-    def send_message(**args)
-      SUCCESS_RESULT
-    end
-
-    def delete_message(**args)
-      SUCCESS_RESULT
-    end
-
-    class Result
-      def initialize(successful, messages = nil)
-        @successful = successful
-        @messages = messages
-      end
-
-      def successful?
-        @successful
-      end
-
-      attr_reader :messages
-    end
-
-    SUCCESS_RESULT = Result.new(true)
-    EMPTY_RESULT = Result.new(true, [])
-
-    class Message
-      def initialize(body)
-        @body = body
-      end
-
-      attr_reader :body
-    end
-  end
-
-end # module Bricolage