bricolage-streamingload 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 848b815669c6580505119917a72a4e97833064c2
4
- data.tar.gz: 1809410699822e2a60a21407cad1b5814551adcd
3
+ metadata.gz: 41d99ce0dce7affca77b305ce527c5ec53e733b1
4
+ data.tar.gz: bd2254dab228018edf50debcf4627deef3bbdbd0
5
5
  SHA512:
6
- metadata.gz: 84ecbe1e548953cc1f4889eb07e9289101e1ade28e89370dae39462b61d193521ceda25f59d8a0b0f1760b8555cf254abab50cf7ed3d11e70d81c422af2f8b82
7
- data.tar.gz: b367bb4faa24e9755bee5dbeaaaf937a317826e8e562466e3264a19563bbfcc877ddba9801ffda4875d21e39feddfcc8e87df81a6433b9cd1c3a2cabcefc0754
6
+ metadata.gz: 66855f5839f86deb3c6af0500090bed718bd6860b50a83e6a11f1c79e822b81b3d7888ff5417391f4f1b0a669242ed5de3d9a9f4c3fe4e500d88ad77acfbe7fa
7
+ data.tar.gz: c18b88f15e4cd75095f5cf72863921a43ee467bb1b03f5b3481bf63a09e1a0e319f3afcd4e79e5902bdb6f62695528edd962ed561c29bfc5f6dc173ed029eb89
@@ -9,11 +9,14 @@ module Bricolage
9
9
  @sns_logger.level = Kernel.const_get("Logger").const_get(alert_level.upcase)
10
10
  end
11
11
 
12
+ def_delegators '@logger', :level, :level=, :debug?, :info?, :warn?, :error?, :fatal?, :unknown?
13
+
12
14
  %w(log debug info warn error fatal unknown).each do |m|
13
15
  define_method(m) do |*args|
14
16
  [@logger, @sns_logger].map {|t| t.send(m, *args) }
15
17
  end
16
18
  end
19
+
17
20
  end
18
21
  end
19
22
  end
@@ -56,6 +56,9 @@ module Bricolage
56
56
  Process.daemon(true) if opts.daemon?
57
57
  create_pid_file opts.pid_file_path if opts.pid_file_path
58
58
  dispatcher.event_loop
59
+ rescue Exception => e
60
+ alert_logger.error e.message
61
+ raise
59
62
  end
60
63
 
61
64
  def Dispatcher.new_logger(path, config)
@@ -119,18 +122,12 @@ module Bricolage
119
122
  def create_checkpoint
120
123
  logger.info "*** Creating checkpoint requested ***"
121
124
  logger.info "Force-flushing all objects..."
122
- flush_all_tasks_immediately
125
+ tasks = @object_buffer.flush_tasks_force
126
+ send_tasks tasks
123
127
  logger.info "All objects flushed; shutting down..."
124
128
  @event_queue.initiate_terminate
125
129
  end
126
130
 
127
- def flush_all_tasks_immediately
128
- tasks = @object_buffer.flush_tasks_force
129
- tasks.each do |task|
130
- @task_queue.put task
131
- end
132
- end
133
-
134
131
  def handle_data(e)
135
132
  unless e.created?
136
133
  @event_queue.delete_message_async(e)
@@ -144,7 +141,7 @@ module Bricolage
144
141
  def handle_dispatch(e)
145
142
  if @dispatch_message_id == e.message_id
146
143
  tasks = @object_buffer.flush_tasks
147
- tasks.each {|task| @task_queue.put task }
144
+ send_tasks tasks
148
145
  set_dispatch_timer
149
146
  end
150
147
  # Delete this event immediately
@@ -156,6 +153,20 @@ module Bricolage
156
153
  @dispatch_message_id = res.message_id
157
154
  end
158
155
 
156
+ def handle_flushtable(e)
157
+ logger.info "flushing #{e.table_name} requested"
158
+ tasks = @object_buffer.flush_table_force(e.table_name)
159
+ send_tasks tasks
160
+ # Delete this event immediately
161
+ @event_queue.delete_message(e)
162
+ end
163
+
164
+ def send_tasks(tasks)
165
+ tasks.each do |task|
166
+ @task_queue.put task
167
+ end
168
+ end
169
+
159
170
  end
160
171
 
161
172
 
@@ -10,7 +10,7 @@ module Bricolage
10
10
  case
11
11
  when rec['eventName'] == 'shutdown' then ShutdownEvent
12
12
  when rec['eventName'] == 'dispatch' then DispatchEvent
13
- when rec['eventName'] == 'flush' then FlushEvent
13
+ when rec['eventName'] == 'flushtable' then FlushTableEvent
14
14
  when rec['eventName'] == 'checkpoint' then CheckPointEvent
15
15
  when rec['eventSource'] == 'aws:s3'
16
16
  S3ObjectEvent
@@ -67,13 +67,13 @@ module Bricolage
67
67
  end
68
68
 
69
69
 
70
- class FlushEvent < Event
70
+ class FlushTableEvent < Event
71
71
 
72
- def FlushEvent.create(delay_seconds:, table_name:)
73
- super name: 'flush', delay_seconds: delay_seconds, table_name: table_name
72
+ def FlushTableEvent.create(table_name:)
73
+ super name: 'flushtable', table_name: table_name
74
74
  end
75
75
 
76
- def FlushEvent.parse_sqs_record(msg, rec)
76
+ def FlushTableEvent.parse_sqs_record(msg, rec)
77
77
  {
78
78
  table_name: rec['tableName']
79
79
  }
@@ -120,6 +120,7 @@ module Bricolage
120
120
 
121
121
  def write_job_error(status, message)
122
122
  @end_time = Time.now
123
+ @logger.warn message.lines.first
123
124
  write_job_result status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
124
125
  end
125
126
 
@@ -49,6 +49,9 @@ module Bricolage
49
49
  create_pid_file opts.pid_file_path if opts.pid_file_path
50
50
  service.event_loop
51
51
  end
52
+ rescue Exception => e
53
+ alert_logger.error e.message
54
+ raise
52
55
  end
53
56
 
54
57
  def LoaderService.new_logger(path, config)
@@ -75,6 +78,8 @@ module Bricolage
75
78
  @logger = logger
76
79
  end
77
80
 
81
+ attr_reader :logger
82
+
78
83
  def event_loop
79
84
  @task_queue.handle_messages(handler: self, message_class: Task)
80
85
  @logger.info "shutdown gracefully"
@@ -55,6 +55,7 @@ module Bricolage
55
55
  }
56
56
  end
57
57
 
58
+ # Flushes multiple tables periodically
58
59
  def flush_tasks
59
60
  task_ids = nil
60
61
  @ctl_ds.open {|conn|
@@ -84,6 +85,24 @@ module Bricolage
84
85
  return task_ids.map {|id| LoadTask.create(task_id: id) }
85
86
  end
86
87
 
88
+ # Flushes all the objects of the specified table immediately
89
+ # with no additional conditions, to create "table checkpoint".
90
+ def flush_table_force(table_name)
91
+ task_ids = []
92
+ @ctl_ds.open {|conn|
93
+ conn.transaction {|txn|
94
+ # insert_task_object_mappings may not consume all saved objects
95
+ # (e.g. there are too many objects for one table), so we must create
96
+ # tasks repeatedly until there are no unassigned objects.
97
+ until (ids = insert_table_task_force(conn, table_name)).empty?
98
+ insert_task_object_mappings(conn)
99
+ task_ids.concat ids
100
+ end
101
+ }
102
+ }
103
+ return task_ids.map {|id| LoadTask.create(task_id: id) }
104
+ end
105
+
87
106
  private
88
107
 
89
108
  def insert_object(conn, obj)
@@ -185,6 +204,60 @@ module Bricolage
185
204
  task_ids
186
205
  end
187
206
 
207
+ def insert_table_task_force(conn, table_name)
208
+ task_ids = conn.query_values(<<-EndSQL)
209
+ insert into strload_tasks
210
+ ( task_class
211
+ , schema_name
212
+ , table_name
213
+ , submit_time
214
+ )
215
+ select
216
+ 'streaming_load_v3'
217
+ , tbl.schema_name
218
+ , tbl.table_name
219
+ , current_timestamp
220
+ from
221
+ strload_tables tbl
222
+
223
+ -- The number of objects for each tables, which is not assigned to any task (> 0).
224
+ -- This subquery is covered by the index.
225
+ inner join (
226
+ select
227
+ data_source_id
228
+ , count(*) as object_count
229
+ from
230
+ (
231
+ select
232
+ min(object_id) as object_id
233
+ , object_url
234
+ from
235
+ strload_objects
236
+ where
237
+ data_source_id = #{s table_name}
238
+ group by
239
+ object_url
240
+ ) uniq_objects
241
+ inner join strload_objects using (object_id)
242
+ left outer join strload_task_objects using (object_id)
243
+ where
244
+ task_id is null -- not assigned to a task
245
+ group by
246
+ data_source_id
247
+ ) obj
248
+ using (data_source_id)
249
+ where
250
+ -- does not check disabled
251
+ data_source_id = #{s table_name}
252
+ returning task_id
253
+ ;
254
+ EndSQL
255
+
256
+ # It must be 1
257
+ @logger.info "Number of task created: #{task_ids.size}"
258
+ task_ids
259
+ end
260
+
188
261
  def insert_task_object_mappings(conn)
189
262
  conn.update(<<-EndSQL)
190
263
  insert into strload_task_objects
@@ -1,5 +1,5 @@
1
1
  module Bricolage
2
2
  module StreamingLoad
3
- VERSION = '0.4.0'
3
+ VERSION = '0.5.0'
4
4
  end
5
5
  end
@@ -69,6 +69,7 @@ module Bricolage
69
69
 
70
70
  # Event Queue Call Sequence
71
71
  hst = event_queue.client.call_history
72
+ assert_equal 6, hst.size
72
73
  assert_equal :send_message, hst[0].name # start flush timer
73
74
  assert_equal :receive_message, hst[1].name
74
75
  assert_equal :delete_message_batch, hst[2].name
@@ -78,6 +79,7 @@ module Bricolage
78
79
 
79
80
  # Task Queue Call Sequence
80
81
  hst = task_queue.client.call_history
82
+ assert_equal 1, hst.size
81
83
  assert_equal :send_message, hst[0].name
82
84
  assert(/streaming_load_v3/ =~ hst[0].args[:message_body])
83
85
  task_id = JSON.load(hst[0].args[:message_body])['Records'][0]['taskId'].to_i
@@ -105,6 +107,100 @@ module Bricolage
105
107
  }
106
108
  end
107
109
 
110
+ test "flushtable event" do
111
+ ctx = Context.for_application('.', environment: 'test', logger: NullLogger.new)
112
+ ctl_ds = ctx.get_data_source('sql', 'dwhctl')
113
+
114
+ event_queue = SQSDataSource.new_mock(queue: [
115
+ # 1st ReceiveMessage
116
+ [
117
+ SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.aaa/datafile-0001.json.gz'),
118
+ SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.bbb/datafile-0001.json.gz'),
119
+ SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.ccc/datafile-0002.json.gz'),
120
+ SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.aaa/datafile-0002.json.gz'),
121
+ SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.bbb/datafile-0003.json.gz'),
122
+ SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.ccc/datafile-0003.json.gz'),
123
+ SQSMock::Message.new(body: {eventSource: 'bricolage:system', eventName: 'flushtable', tableName: 'testschema.bbb'}),
124
+ SQSMock::Message.new(body: {eventSource: 'bricolage:system', eventName: 'shutdown'})
125
+ ]
126
+ ])
127
+
128
+ task_queue = SQSDataSource.new_mock
129
+
130
+ object_buffer = ObjectBuffer.new(
131
+ control_data_source: ctl_ds,
132
+ logger: ctx.logger
133
+ )
134
+
135
+ url_patterns = URLPatterns.for_config([
136
+ {
137
+ "url" => %r<\As3://test-bucket/testschema\.(?<table>\w+)/datafile-\d{4}\.json\.gz>.source,
138
+ "schema" => 'testschema',
139
+ "table" => '%table'
140
+ }
141
+ ])
142
+
143
+ dispatcher = Dispatcher.new(
144
+ event_queue: event_queue,
145
+ task_queue: task_queue,
146
+ object_buffer: object_buffer,
147
+ url_patterns: url_patterns,
148
+ dispatch_interval: 600,
149
+ logger: ctx.logger
150
+ )
151
+
152
+ # FIXME: database cleaner
153
+ ctl_ds.open {|conn|
154
+ conn.update("truncate strload_tables")
155
+ conn.update("truncate strload_objects")
156
+ conn.update("truncate strload_task_objects")
157
+ conn.update("truncate strload_tasks")
158
+ conn.update("insert into strload_tables values ('testschema', 'aaa', 'testschema.aaa', 100, 1800, false)")
159
+ conn.update("insert into strload_tables values ('testschema', 'bbb', 'testschema.bbb', 100, 1800, false)")
160
+ conn.update("insert into strload_tables values ('testschema', 'ccc', 'testschema.ccc', 100, 1800, false)")
161
+ }
162
+ dispatcher.event_loop
163
+
164
+ # Event Queue Call Sequence
165
+ hst = event_queue.client.call_history
166
+ assert_equal 5, hst.size
167
+ assert_equal :send_message, hst[0].name # start dispatch timer
168
+ assert_equal :receive_message, hst[1].name
169
+ assert_equal :delete_message, hst[2].name # delete flushtable event
170
+ assert_equal :delete_message, hst[3].name # delete shutdown event
171
+ assert_equal :delete_message_batch, hst[4].name
172
+
173
+ # Task Queue Call Sequence
174
+ hst = task_queue.client.call_history
175
+ assert_equal 1, hst.size
176
+ assert_equal :send_message, hst[0].name
177
+ assert(/streaming_load_v3/ =~ hst[0].args[:message_body])
178
+ task_id = JSON.load(hst[0].args[:message_body])['Records'][0]['taskId'].to_i
179
+ assert_not_equal 0, task_id
180
+
181
+ # Object Buffer
182
+ assert_equal [], unassigned_table_objects(ctl_ds, 'testschema.bbb')
183
+ task = ctl_ds.open {|conn| LoadTask.load(conn, task_id) }
184
+ assert_equal 'testschema', task.schema
185
+ assert_equal 'bbb', task.table
186
+ assert_equal 2, task.object_urls.size
187
+ end
188
+
189
+ def unassigned_table_objects(ctl_ds, table_name)
190
+ ctl_ds.open {|conn|
191
+ conn.query_values(<<-EndSQL)
192
+ select
193
+ object_url
194
+ from
195
+ strload_objects
196
+ where
197
+ data_source_id = '#{table_name}'
198
+ and object_id not in (select object_id from strload_task_objects)
199
+ ;
200
+ EndSQL
201
+ }
202
+ end
203
+
108
204
  end
109
205
 
110
206
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bricolage-streamingload
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Minero Aoki
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-08-12 00:00:00.000000000 Z
12
+ date: 2016-08-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bricolage