bricolage-streamingload 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bricolage/streamingload/alertinglogger.rb +3 -0
- data/lib/bricolage/streamingload/dispatcher.rb +20 -9
- data/lib/bricolage/streamingload/event.rb +5 -5
- data/lib/bricolage/streamingload/loader.rb +1 -0
- data/lib/bricolage/streamingload/loaderservice.rb +5 -0
- data/lib/bricolage/streamingload/objectbuffer.rb +73 -0
- data/lib/bricolage/streamingload/version.rb +1 -1
- data/test/streamingload/test_dispatcher.rb +96 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 41d99ce0dce7affca77b305ce527c5ec53e733b1
+  data.tar.gz: bd2254dab228018edf50debcf4627deef3bbdbd0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 66855f5839f86deb3c6af0500090bed718bd6860b50a83e6a11f1c79e822b81b3d7888ff5417391f4f1b0a669242ed5de3d9a9f4c3fe4e500d88ad77acfbe7fa
+  data.tar.gz: c18b88f15e4cd75095f5cf72863921a43ee467bb1b03f5b3481bf63a09e1a0e319f3afcd4e79e5902bdb6f62695528edd962ed561c29bfc5f6dc173ed029eb89
data/lib/bricolage/streamingload/alertinglogger.rb
CHANGED

@@ -9,11 +9,14 @@ module Bricolage
         @sns_logger.level = Kernel.const_get("Logger").const_get(alert_level.upcase)
       end
 
+      def_delegators '@logger', :level, :level=, :debug?, :info?, :warn?, :error?, :fatal?, :unknown?
+
       %w(log debug info warn error fatal unknown).each do |m|
         define_method(m) do |*args|
           [@logger, @sns_logger].map {|t| t.send(m, *args) }
         end
       end
+
     end
   end
 end
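Note on the new delegator line: `def_delegators` is provided by Ruby's standard Forwardable module, so the class must `extend Forwardable` somewhere above this hunk. A minimal sketch of the resulting split, with level queries answered by the primary logger alone and writes broadcast to both loggers (the class shape here is assumed; the gem's real class also constructs the SNS-backed logger itself):

    require 'forwardable'
    require 'logger'

    # Minimal sketch, not the gem's actual class definition.
    class AlertingLogger
      extend Forwardable   # provides def_delegators

      def initialize(logger:, sns_logger:)
        @logger = logger          # primary (file/stderr) logger
        @sns_logger = sns_logger  # alert channel
      end

      # Level queries and changes go to the primary logger only.
      def_delegators '@logger', :level, :level=, :debug?, :info?, :warn?, :error?, :fatal?, :unknown?

      # Log writes are broadcast to both loggers.
      %w(log debug info warn error fatal unknown).each do |m|
        define_method(m) do |*args|
          [@logger, @sns_logger].map {|t| t.send(m, *args) }
        end
      end
    end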
data/lib/bricolage/streamingload/dispatcher.rb
CHANGED

@@ -56,6 +56,9 @@ module Bricolage
         Process.daemon(true) if opts.daemon?
         create_pid_file opts.pid_file_path if opts.pid_file_path
         dispatcher.event_loop
+      rescue Exception => e
+        alert_logger.error e.message
+        raise
       end
 
       def Dispatcher.new_logger(path, config)

@@ -119,18 +122,12 @@ module Bricolage
       def create_checkpoint
         logger.info "*** Creating checkpoint requested ***"
         logger.info "Force-flushing all objects..."
-
+        tasks = @object_buffer.flush_tasks_force
+        send_tasks tasks
         logger.info "All objects flushed; shutting down..."
         @event_queue.initiate_terminate
       end
 
-      def flush_all_tasks_immediately
-        tasks = @object_buffer.flush_tasks_force
-        tasks.each do |task|
-          @task_queue.put task
-        end
-      end
-
       def handle_data(e)
         unless e.created?
           @event_queue.delete_message_async(e)

@@ -144,7 +141,7 @@ module Bricolage
       def handle_dispatch(e)
         if @dispatch_message_id == e.message_id
           tasks = @object_buffer.flush_tasks
-          tasks
+          send_tasks tasks
           set_dispatch_timer
         end
         # Delete this event immediately

@@ -156,6 +153,20 @@ module Bricolage
         @dispatch_message_id = res.message_id
       end
 
+      def handle_flushtable(e)
+        logger.info "flushing #{e.table_name} requested"
+        tasks = @object_buffer.flush_table_force(e.table_name)
+        send_tasks tasks
+        # Delete this event immediately
+        @event_queue.delete_message(e)
+      end
+
+      def send_tasks(tasks)
+        tasks.each do |task|
+          @task_queue.put task
+        end
+      end
+
     end
 
 
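The new handle_flushtable lets an operator flush a single table's buffered objects on demand, without the full checkpoint shutdown. Going by the message format exercised in the test below, a flush could be requested by hand roughly like this; the queue URL and region are placeholders, and wrapping the event in an S3-style Records array is an assumption inferred from the task messages seen in the tests:

    require 'aws-sdk-sqs'
    require 'json'

    sqs = Aws::SQS::Client.new(region: 'ap-northeast-1')
    sqs.send_message(
      # Placeholder queue URL; use the dispatcher's real event queue.
      queue_url: 'https://sqs.ap-northeast-1.amazonaws.com/123456789012/bricolage-event-queue',
      message_body: {
        Records: [
          {eventSource: 'bricolage:system', eventName: 'flushtable', tableName: 'testschema.bbb'}
        ]
      }.to_json
    )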
data/lib/bricolage/streamingload/event.rb
CHANGED

@@ -10,7 +10,7 @@ module Bricolage
         case
         when rec['eventName'] == 'shutdown' then ShutdownEvent
         when rec['eventName'] == 'dispatch' then DispatchEvent
-        when rec['eventName'] == '
+        when rec['eventName'] == 'flushtable' then FlushTableEvent
         when rec['eventName'] == 'checkpoint' then CheckPointEvent
         when rec['eventSource'] == 'aws:s3'
           S3ObjectEvent

@@ -67,13 +67,13 @@ module Bricolage
     end
 
 
-    class
+    class FlushTableEvent < Event
 
-      def
-        super name: '
+      def FlushTableEvent.create(table_name:)
+        super name: 'flushtable', table_name: table_name
       end
 
-      def
+      def FlushTableEvent.parse_sqs_record(msg, rec)
         {
           table_name: rec['tableName']
         }
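So the event factory's case expression now maps a third 'bricolage:system' event name. For reference, the system-event record shapes it distinguishes after JSON parsing, written as plain hashes (the table name is illustrative):

    shutdown   = {'eventSource' => 'bricolage:system', 'eventName' => 'shutdown'}
    dispatch   = {'eventSource' => 'bricolage:system', 'eventName' => 'dispatch'}
    flushtable = {'eventSource' => 'bricolage:system', 'eventName' => 'flushtable',
                  'tableName'   => 'testschema.bbb'}
    # FlushTableEvent.parse_sqs_record keeps only the table name:
    #   {table_name: 'testschema.bbb'}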
data/lib/bricolage/streamingload/loaderservice.rb
CHANGED

@@ -49,6 +49,9 @@ module Bricolage
           create_pid_file opts.pid_file_path if opts.pid_file_path
           service.event_loop
         end
+      rescue Exception => e
+        alert_logger.error e.message
+        raise
       end
 
       def LoaderService.new_logger(path, config)

@@ -75,6 +78,8 @@ module Bricolage
         @logger = logger
       end
 
+      attr_reader :logger
+
       def event_loop
         @task_queue.handle_messages(handler: self, message_class: Task)
         @logger.info "shutdown gracefully"
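Both daemons (Dispatcher.main above and LoaderService.main here) now share the same failure pattern: report the fatal error through the alerting logger, which fans out to SNS, then re-raise so the process still dies with a non-zero status. A runnable sketch of the pattern, with a plain Logger standing in for the alerting logger:

    require 'logger'

    alert_logger = Logger.new($stderr)   # stand-in for the SNS-backed AlertingLogger

    begin
      raise "event loop crashed"         # stand-in for service.event_loop failing
    rescue Exception => e
      alert_logger.error e.message       # page operators before dying
      raise                              # preserve the original crash/exit behavior
    end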
data/lib/bricolage/streamingload/objectbuffer.rb
CHANGED

@@ -55,6 +55,7 @@ module Bricolage
         }
       end
 
+      # Flushes multiple tables periodically
       def flush_tasks
         task_ids = nil
         @ctl_ds.open {|conn|

@@ -84,6 +85,24 @@ module Bricolage
         return task_ids.map {|id| LoadTask.create(task_id: id) }
       end
 
+      # Flushes the all objects of the specified table immediately
+      # with no additional conditions, to create "table checkpoint".
+      def flush_table_force(table_name)
+        task_ids = []
+        @ctl_ds.open {|conn|
+          conn.transaction {|txn|
+            # insert_task_object_mappings may not consume all saved objects
+            # (e.g. there are too many objects for one table), we must create
+            # tasks repeatedly until there are no unassigned objects.
+            until (ids = insert_table_task_force(conn, table_name)).empty?
+              insert_task_object_mappings(conn)
+              task_ids.concat ids
+            end
+          }
+        }
+        return task_ids.map {|id| LoadTask.create(task_id: id) }
+      end
+
       private
 
       def insert_object(conn, obj)

@@ -185,6 +204,60 @@ module Bricolage
         task_ids
       end
 
+      def insert_table_task_force(conn, table_name)
+        task_ids = conn.query_values(<<-EndSQL)
+          insert into strload_tasks
+              ( task_class
+              , schema_name
+              , table_name
+              , submit_time
+              )
+          select
+              'streaming_load_v3'
+              , tbl.schema_name
+              , tbl.table_name
+              , current_timestamp
+          from
+              strload_tables tbl
+
+              -- The number of objects for each tables, which is not assigned to any task (> 0).
+              -- This subquery is covered by the index.
+              inner join (
+                  select
+                      data_source_id
+                      , count(*) as object_count
+                  from
+                      (
+                          select
+                              min(object_id) as object_id
+                              , object_url
+                          from
+                              strload_objects
+                          where
+                              data_source_id = #{s table_name}
+                          group by
+                              object_url
+                      ) uniq_objects
+                      inner join strload_objects using (object_id)
+                      left outer join strload_task_objects using (object_id)
+                  where
+                      task_id is null -- not assigned to a task
+                  group by
+                      data_source_id
+              ) obj
+              using (data_source_id)
+          where
+              -- does not check disabled
+              data_source_id = #{s table_name}
+          returning task_id
+          ;
+        EndSQL
+
+        # It must be 1
+        @logger.info "Number of task created: #{task_ids.size}"
+        task_ids
+      end
+
       def insert_task_object_mappings(conn)
         conn.update(<<-EndSQL)
           insert into strload_task_objects
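The loop in flush_table_force deserves a note: each call to insert_table_task_force creates one task row for the table while unassigned objects remain (hence the "It must be 1" comment), and insert_task_object_mappings then assigns objects to it; as the source comment says, one mapping pass may not consume everything, so the next iteration creates another task until the backlog drains in task-sized chunks. A toy model of that control flow (the numbers and the per-pass cap are hypothetical):

    # Toy model of the flush_table_force loop; not the gem's code.
    pending_objects = 250   # unassigned objects buffered for one table (hypothetical)
    per_pass_cap    = 100   # objects one mapping pass can assign (hypothetical)

    task_ids = []
    until pending_objects.zero?                 # insert_table_task_force returns [] once drained
      task_ids << "task-#{task_ids.size + 1}"   # one new strload_tasks row per pass
      pending_objects -= [pending_objects, per_pass_cap].min   # insert_task_object_mappings
    end
    p task_ids   #=> ["task-1", "task-2", "task-3"]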
data/test/streamingload/test_dispatcher.rb
CHANGED

@@ -69,6 +69,7 @@ module Bricolage
 
       # Event Queue Call Sequence
       hst = event_queue.client.call_history
+      assert_equal 6, hst.size
       assert_equal :send_message, hst[0].name    # start flush timer
       assert_equal :receive_message, hst[1].name
       assert_equal :delete_message_batch, hst[2].name

@@ -78,6 +79,7 @@ module Bricolage
 
       # Task Queue Call Sequence
       hst = task_queue.client.call_history
+      assert_equal 1, hst.size
       assert_equal :send_message, hst[0].name
       assert(/streaming_load_v3/ =~ hst[0].args[:message_body])
       task_id = JSON.load(hst[0].args[:message_body])['Records'][0]['taskId'].to_i

@@ -105,6 +107,100 @@ module Bricolage
       }
     end
 
+    test "flushtable event" do
+      ctx = Context.for_application('.', environment: 'test', logger: NullLogger.new)
+      ctl_ds = ctx.get_data_source('sql', 'dwhctl')
+
+      event_queue = SQSDataSource.new_mock(queue: [
+        # 1st ReceiveMessage
+        [
+          SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.aaa/datafile-0001.json.gz'),
+          SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.bbb/datafile-0001.json.gz'),
+          SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.ccc/datafile-0002.json.gz'),
+          SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.aaa/datafile-0002.json.gz'),
+          SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.bbb/datafile-0003.json.gz'),
+          SQSMock::Message.s3_object_created_event('s3://test-bucket/testschema.ccc/datafile-0003.json.gz'),
+          SQSMock::Message.new(body: {eventSource: 'bricolage:system', eventName: 'flushtable', tableName: 'testschema.bbb'}),
+          SQSMock::Message.new(body: {eventSource: 'bricolage:system', eventName: 'shutdown'})
+        ]
+      ])
+
+      task_queue = SQSDataSource.new_mock
+
+      object_buffer = ObjectBuffer.new(
+        control_data_source: ctl_ds,
+        logger: ctx.logger
+      )
+
+      url_patterns = URLPatterns.for_config([
+        {
+          "url" => %r<\As3://test-bucket/testschema\.(?<table>\w+)/datafile-\d{4}\.json\.gz>.source,
+          "schema" => 'testschema',
+          "table" => '%table'
+        }
+      ])
+
+      dispatcher = Dispatcher.new(
+        event_queue: event_queue,
+        task_queue: task_queue,
+        object_buffer: object_buffer,
+        url_patterns: url_patterns,
+        dispatch_interval: 600,
+        logger: ctx.logger
+      )
+
+      # FIXME: database cleaner
+      ctl_ds.open {|conn|
+        conn.update("truncate strload_tables")
+        conn.update("truncate strload_objects")
+        conn.update("truncate strload_task_objects")
+        conn.update("truncate strload_tasks")
+        conn.update("insert into strload_tables values ('testschema', 'aaa', 'testschema.aaa', 100, 1800, false)")
+        conn.update("insert into strload_tables values ('testschema', 'bbb', 'testschema.bbb', 100, 1800, false)")
+        conn.update("insert into strload_tables values ('testschema', 'ccc', 'testschema.ccc', 100, 1800, false)")
+      }
+      dispatcher.event_loop
+
+      # Event Queue Call Sequence
+      hst = event_queue.client.call_history
+      assert_equal 5, hst.size
+      assert_equal :send_message, hst[0].name          # start dispatch timer
+      assert_equal :receive_message, hst[1].name
+      assert_equal :delete_message, hst[2].name        # delete flushtable event
+      assert_equal :delete_message, hst[3].name        # delete shutdown event
+      assert_equal :delete_message_batch, hst[4].name
+
+      # Task Queue Call Sequence
+      hst = task_queue.client.call_history
+      assert_equal 1, hst.size
+      assert_equal :send_message, hst[0].name
+      assert(/streaming_load_v3/ =~ hst[0].args[:message_body])
+      task_id = JSON.load(hst[0].args[:message_body])['Records'][0]['taskId'].to_i
+      assert_not_equal 0, task_id
+
+      # Object Buffer
+      assert_equal [], unassigned_table_objects(ctl_ds, 'testschema.bbb')
+      task = ctl_ds.open {|conn| LoadTask.load(conn, task_id) }
+      assert_equal 'testschema', task.schema
+      assert_equal 'bbb', task.table
+      assert_equal 2, task.object_urls.size
+    end
+
+    def unassigned_table_objects(ctl_ds, table_name)
+      ctl_ds.open {|conn|
+        conn.query_values(<<-EndSQL)
+          select
+              object_url
+          from
+              strload_objects
+          where
+              data_source_id = '#{table_name}'
+              and object_id not in (select object_id from strload_task_objects)
+          ;
+        EndSQL
+      }
+    end
+
 end
 
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - Minero Aoki

@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-08-
+date: 2016-08-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage