bricolage-streamingload 0.4.0 → 0.5.0
- checksums.yaml +4 -4
- data/lib/bricolage/streamingload/alertinglogger.rb +3 -0
- data/lib/bricolage/streamingload/dispatcher.rb +20 -9
- data/lib/bricolage/streamingload/event.rb +5 -5
- data/lib/bricolage/streamingload/loader.rb +1 -0
- data/lib/bricolage/streamingload/loaderservice.rb +5 -0
- data/lib/bricolage/streamingload/objectbuffer.rb +73 -0
- data/lib/bricolage/streamingload/version.rb +1 -1
- data/test/streamingload/test_dispatcher.rb +96 -0
- metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 41d99ce0dce7affca77b305ce527c5ec53e733b1
+  data.tar.gz: bd2254dab228018edf50debcf4627deef3bbdbd0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 66855f5839f86deb3c6af0500090bed718bd6860b50a83e6a11f1c79e822b81b3d7888ff5417391f4f1b0a669242ed5de3d9a9f4c3fe4e500d88ad77acfbe7fa
+  data.tar.gz: c18b88f15e4cd75095f5cf72863921a43ee467bb1b03f5b3481bf63a09e1a0e319f3afcd4e79e5902bdb6f62695528edd962ed561c29bfc5f6dc173ed029eb89
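These checksums cover the two archives packed inside the .gem file. As a quick sanity check, a minimal Ruby sketch (assuming the archive has already been unpacked so metadata.gz, data.tar.gz, and checksums.yaml are local files) could compare them:

require 'digest'
require 'yaml'

# Assumes the gem archive was unpacked first, e.g.:
#   tar xf bricolage-streamingload-0.5.0.gem && gunzip checksums.yaml.gz
sums = YAML.load_file('checksums.yaml')

%w[metadata.gz data.tar.gz].each do |part|
  ok = Digest::SHA512.file(part).hexdigest == sums['SHA512'][part]
  puts "#{part}: #{ok ? 'OK' : 'MISMATCH'}"
end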
data/lib/bricolage/streamingload/alertinglogger.rb CHANGED
@@ -9,11 +9,14 @@ module Bricolage
       @sns_logger.level = Kernel.const_get("Logger").const_get(alert_level.upcase)
     end
 
+    def_delegators '@logger', :level, :level=, :debug?, :info?, :warn?, :error?, :fatal?, :unknown?
+
     %w(log debug info warn error fatal unknown).each do |m|
       define_method(m) do |*args|
         [@logger, @sns_logger].map {|t| t.send(m, *args) }
       end
     end
+
     end
   end
 end
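The net effect: level queries (level, info?, and friends) are now answered by the primary logger alone, while every actual logging call still fans out to both the file logger and the SNS alert logger. A minimal self-contained sketch of the same pattern, with illustrative names rather than the gem's actual class:

require 'forwardable'
require 'logger'

class FanoutLogger
  extend Forwardable

  def initialize(logger:, alert_logger:)
    @logger = logger
    @sns_logger = alert_logger
  end

  # Level queries are answered by the primary logger alone.
  def_delegators '@logger', :level, :level=, :debug?, :info?, :warn?, :error?, :fatal?, :unknown?

  # Every log call is forwarded to both loggers.
  %w(log debug info warn error fatal unknown).each do |m|
    define_method(m) do |*args|
      [@logger, @sns_logger].map {|t| t.send(m, *args) }
    end
  end
end

log = FanoutLogger.new(logger: Logger.new($stdout), alert_logger: Logger.new($stderr))
log.error "load failed"   # written to both outputs
log.info?                 # answered by the stdout logger only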
data/lib/bricolage/streamingload/dispatcher.rb CHANGED
@@ -56,6 +56,9 @@ module Bricolage
       Process.daemon(true) if opts.daemon?
       create_pid_file opts.pid_file_path if opts.pid_file_path
       dispatcher.event_loop
+    rescue Exception => e
+      alert_logger.error e.message
+      raise
     end
 
     def Dispatcher.new_logger(path, config)
@@ -119,18 +122,12 @@ module Bricolage
     def create_checkpoint
       logger.info "*** Creating checkpoint requested ***"
       logger.info "Force-flushing all objects..."
-      flush_all_tasks_immediately
+      tasks = @object_buffer.flush_tasks_force
+      send_tasks tasks
       logger.info "All objects flushed; shutting down..."
       @event_queue.initiate_terminate
     end
 
-    def flush_all_tasks_immediately
-      tasks = @object_buffer.flush_tasks_force
-      tasks.each do |task|
-        @task_queue.put task
-      end
-    end
-
     def handle_data(e)
       unless e.created?
         @event_queue.delete_message_async(e)
@@ -144,7 +141,7 @@ module Bricolage
     def handle_dispatch(e)
       if @dispatch_message_id == e.message_id
         tasks = @object_buffer.flush_tasks
-        tasks
+        send_tasks tasks
         set_dispatch_timer
       end
       # Delete this event immediately
@@ -156,6 +153,20 @@ module Bricolage
       @dispatch_message_id = res.message_id
     end
 
+    def handle_flushtable(e)
+      logger.info "flushing #{e.table_name} requested"
+      tasks = @object_buffer.flush_table_force(e.table_name)
+      send_tasks tasks
+      # Delete this event immediately
+      @event_queue.delete_message(e)
+    end
+
+    def send_tasks(tasks)
+      tasks.each do |task|
+        @task_queue.put task
+      end
+    end
+
   end
 
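The new handle_flushtable path is driven by a 'flushtable' message on the event queue. Going by the message body used in the test further down, a client could request a table flush roughly like this (the queue URL is a placeholder, and the Records envelope is an assumption based on how the dispatcher parses S3-style events):

require 'aws-sdk'   # aws-sdk v2
require 'json'

sqs = Aws::SQS::Client.new(region: 'ap-northeast-1')
sqs.send_message(
  queue_url: 'https://sqs.ap-northeast-1.amazonaws.com/123456789012/bricolage-events',  # placeholder
  message_body: {
    Records: [
      {eventSource: 'bricolage:system', eventName: 'flushtable', tableName: 'testschema.bbb'}
    ]
  }.to_json
)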
data/lib/bricolage/streamingload/event.rb CHANGED
@@ -10,7 +10,7 @@ module Bricolage
       case
       when rec['eventName'] == 'shutdown' then ShutdownEvent
       when rec['eventName'] == 'dispatch' then DispatchEvent
-      when rec['eventName'] == '
+      when rec['eventName'] == 'flushtable' then FlushTableEvent
       when rec['eventName'] == 'checkpoint' then CheckPointEvent
       when rec['eventSource'] == 'aws:s3'
         S3ObjectEvent
@@ -67,13 +67,13 @@ module Bricolage
     end
 
 
-    class
+    class FlushTableEvent < Event
 
-      def
-        super name: '
+      def FlushTableEvent.create(table_name:)
+        super name: 'flushtable', table_name: table_name
       end
 
-      def
+      def FlushTableEvent.parse_sqs_record(msg, rec)
         {
           table_name: rec['tableName']
         }
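For reference, the new event can also be built directly. A short usage sketch, assuming the Event base class exposes its keyword arguments as readers (the dispatcher's e.table_name call suggests it does):

event = FlushTableEvent.create(table_name: 'testschema.bbb')
event.name        # => "flushtable"
event.table_name  # => "testschema.bbb"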
data/lib/bricolage/streamingload/loaderservice.rb CHANGED
@@ -49,6 +49,9 @@ module Bricolage
       create_pid_file opts.pid_file_path if opts.pid_file_path
       service.event_loop
     end
+    rescue Exception => e
+      alert_logger.error e.message
+      raise
     end
 
     def LoaderService.new_logger(path, config)
@@ -75,6 +78,8 @@ module Bricolage
       @logger = logger
     end
 
+    attr_reader :logger
+
     def event_loop
       @task_queue.handle_messages(handler: self, message_class: Task)
       @logger.info "shutdown gracefully"
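Both the dispatcher and the loader service now use the same crash-alert pattern: report the exception through the alerting logger, then re-raise so the process still exits nonzero. A generic sketch of that wrapper (names illustrative, not the gem's code):

def run_service(service, alert_logger)
  service.event_loop
rescue Exception => e
  # Alert first so operators hear about the crash, then re-raise
  # so the supervisor (daemontools, systemd, ...) sees the failure.
  alert_logger.error e.message
  raise
end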
data/lib/bricolage/streamingload/objectbuffer.rb CHANGED
@@ -55,6 +55,7 @@ module Bricolage
       }
     end
 
+    # Flushes multiple tables periodically
     def flush_tasks
       task_ids = nil
       @ctl_ds.open {|conn|
@@ -84,6 +85,24 @@ module Bricolage
       return task_ids.map {|id| LoadTask.create(task_id: id) }
     end
 
+    # Flushes all objects of the specified table immediately,
+    # with no additional conditions, to create a "table checkpoint".
+    def flush_table_force(table_name)
+      task_ids = []
+      @ctl_ds.open {|conn|
+        conn.transaction {|txn|
+          # insert_task_object_mappings may not consume all saved objects
+          # (e.g. when there are too many objects for one table), so we must
+          # create tasks repeatedly until there are no unassigned objects.
+          until (ids = insert_table_task_force(conn, table_name)).empty?
+            insert_task_object_mappings(conn)
+            task_ids.concat ids
+          end
+        }
+      }
+      return task_ids.map {|id| LoadTask.create(task_id: id) }
+    end
+
     private
 
     def insert_object(conn, obj)
@@ -185,6 +204,60 @@ module Bricolage
       task_ids
     end
 
+    def insert_table_task_force(conn, table_name)
+      task_ids = conn.query_values(<<-EndSQL)
+        insert into strload_tasks
+            ( task_class
+            , schema_name
+            , table_name
+            , submit_time
+            )
+        select
+            'streaming_load_v3'
+            , tbl.schema_name
+            , tbl.table_name
+            , current_timestamp
+        from
+            strload_tables tbl
+
+            -- The number of objects for each table, which is not assigned to any task (> 0).
+            -- This subquery is covered by the index.
+            inner join (
+                select
+                    data_source_id
+                    , count(*) as object_count
+                from
+                    (
+                        select
+                            min(object_id) as object_id
+                            , object_url
+                        from
+                            strload_objects
+                        where
+                            data_source_id = #{s table_name}
+                        group by
+                            object_url
+                    ) uniq_objects
+                    inner join strload_objects using (object_id)
+                    left outer join strload_task_objects using (object_id)
+                where
+                    task_id is null   -- not assigned to a task
+                group by
+                    data_source_id
+            ) obj
+            using (data_source_id)
+        where
+            -- does not check disabled
+            data_source_id = #{s table_name}
+        returning task_id
+        ;
+      EndSQL
+
+      # It must be 1
+      @logger.info "Number of tasks created: #{task_ids.size}"
+      task_ids
+    end
+
     def insert_task_object_mappings(conn)
       conn.update(<<-EndSQL)
         insert into strload_task_objects
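The until loop in flush_table_force exists because a single insert_task_object_mappings pass may assign only part of a table's unassigned objects to one task, so a forced flush can require several tasks. A schematic sketch of that control flow with stubbed helpers (illustrative numbers, not the gem's SQL):

# Illustrative stubs: pretend there are 250 unassigned objects and each
# task can absorb at most 100 of them.
objects_left = 250

insert_table_task_force = lambda do |_table|
  objects_left > 0 ? ["task-#{rand(1000)}"] : []
end
insert_task_object_mappings = lambda do
  objects_left = [objects_left - 100, 0].max
end

task_ids = []
until (ids = insert_table_task_force.call('testschema.bbb')).empty?
  insert_task_object_mappings.call
  task_ids.concat ids
end
task_ids.size   # => 3: the loop runs until no unassigned objects remain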
data/test/streamingload/test_dispatcher.rb CHANGED
@@ -69,6 +69,7 @@ module Bricolage
 
     # Event Queue Call Sequence
     hst = event_queue.client.call_history
+    assert_equal 6, hst.size
     assert_equal :send_message, hst[0].name   # start flush timer
     assert_equal :receive_message, hst[1].name
     assert_equal :delete_message_batch, hst[2].name
@@ -78,6 +79,7 @@ module Bricolage
 
     # Task Queue Call Sequence
     hst = task_queue.client.call_history
+    assert_equal 1, hst.size
     assert_equal :send_message, hst[0].name
     assert(/streaming_load_v3/ =~ hst[0].args[:message_body])
     task_id = JSON.load(hst[0].args[:message_body])['Records'][0]['taskId'].to_i
@@ -105,6 +107,100 @@ module Bricolage
     }
   end
 
+  test "flushtable event" do
+    ctx = Context.for_application('.', environment: 'test', logger: NullLogger.new)
+    ctl_ds = ctx.get_data_source('sql', 'dwhctl')
+
+    event_queue = SQSDataSource.new_mock(queue: [
+      # 1st ReceiveMessage
+      [
+        SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.aaa/datafile-0001.json.gz'),
+        SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.bbb/datafile-0001.json.gz'),
+        SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.ccc/datafile-0002.json.gz'),
+        SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.aaa/datafile-0002.json.gz'),
+        SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.bbb/datafile-0003.json.gz'),
+        SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.ccc/datafile-0003.json.gz'),
+        SQSMock::Message.new(body: {eventSource: 'bricolage:system', eventName: 'flushtable', tableName: 'testschema.bbb'}),
+        SQSMock::Message.new(body: {eventSource: 'bricolage:system', eventName: 'shutdown'})
+      ]
+    ])
+
+    task_queue = SQSDataSource.new_mock
+
+    object_buffer = ObjectBuffer.new(
+      control_data_source: ctl_ds,
+      logger: ctx.logger
+    )
+
+    url_patterns = URLPatterns.for_config([
+      {
+        "url" => %r<\As3://test-bucket/testschema\.(?<table>\w+)/datafile-\d{4}\.json\.gz>.source,
+        "schema" => 'testschema',
+        "table" => '%table'
+      }
+    ])
+
+    dispatcher = Dispatcher.new(
+      event_queue: event_queue,
+      task_queue: task_queue,
+      object_buffer: object_buffer,
+      url_patterns: url_patterns,
+      dispatch_interval: 600,
+      logger: ctx.logger
+    )
+
+    # FIXME: database cleaner
+    ctl_ds.open {|conn|
+      conn.update("truncate strload_tables")
+      conn.update("truncate strload_objects")
+      conn.update("truncate strload_task_objects")
+      conn.update("truncate strload_tasks")
+      conn.update("insert into strload_tables values ('testschema', 'aaa', 'testschema.aaa', 100, 1800, false)")
+      conn.update("insert into strload_tables values ('testschema', 'bbb', 'testschema.bbb', 100, 1800, false)")
+      conn.update("insert into strload_tables values ('testschema', 'ccc', 'testschema.ccc', 100, 1800, false)")
+    }
+    dispatcher.event_loop
+
+    # Event Queue Call Sequence
+    hst = event_queue.client.call_history
+    assert_equal 5, hst.size
+    assert_equal :send_message, hst[0].name   # start dispatch timer
+    assert_equal :receive_message, hst[1].name
+    assert_equal :delete_message, hst[2].name   # delete flushtable event
+    assert_equal :delete_message, hst[3].name   # delete shutdown event
+    assert_equal :delete_message_batch, hst[4].name
+
+    # Task Queue Call Sequence
+    hst = task_queue.client.call_history
+    assert_equal 1, hst.size
+    assert_equal :send_message, hst[0].name
+    assert(/streaming_load_v3/ =~ hst[0].args[:message_body])
+    task_id = JSON.load(hst[0].args[:message_body])['Records'][0]['taskId'].to_i
+    assert_not_equal 0, task_id
+
+    # Object Buffer
+    assert_equal [], unassigned_table_objects(ctl_ds, 'testschema.bbb')
+    task = ctl_ds.open {|conn| LoadTask.load(conn, task_id) }
+    assert_equal 'testschema', task.schema
+    assert_equal 'bbb', task.table
+    assert_equal 2, task.object_urls.size
+  end
+
+  def unassigned_table_objects(ctl_ds, table_name)
+    ctl_ds.open {|conn|
+      conn.query_values(<<-EndSQL)
+        select
+            object_url
+        from
+            strload_objects
+        where
+            data_source_id = '#{table_name}'
+            and object_id not in (select object_id from strload_task_objects)
+        ;
+      EndSQL
+    }
+  end
+
 end
 
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-08-
+date: 2016-08-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage