bricolage-streamingload 0.12.0 → 0.13.0
- checksums.yaml +4 -4
- data/lib/bricolage/streamingload/chunk.rb +19 -0
- data/lib/bricolage/streamingload/{objectbuffer.rb → chunkbuffer.rb} +81 -100
- data/lib/bricolage/streamingload/chunkrouter.rb +57 -0
- data/lib/bricolage/streamingload/dispatcher.rb +50 -46
- data/lib/bricolage/streamingload/{event.rb → dispatchermessage.rb} +24 -23
- data/lib/bricolage/streamingload/incomingchunk.rb +35 -0
- data/lib/bricolage/streamingload/{task.rb → loadermessage.rb} +10 -6
- data/lib/bricolage/streamingload/loadtask.rb +17 -0
- data/lib/bricolage/streamingload/taskhandler.rb +2 -2
- data/lib/bricolage/streamingload/version.rb +1 -1
- data/test/streamingload/test_dispatcher.rb +10 -8
- data/test/streamingload/{test_event.rb → test_dispatchermessage.rb} +5 -5
- data/test/test_sqsdatasource.rb +0 -1
- metadata +10 -7
- data/lib/bricolage/streamingload/urlpatterns.rb +0 -59
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1aacc6209260b4c74e823fc9ea903bdd811c4d2b
+  data.tar.gz: c0b869e51e67a22708dd36b22afefb275e9a3169
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b6721bcb5c9dd506c801e71970e450e6a9318eac5171df099906ef91236e516a14f155cdd952240a534b80ab82771a88e22db49b309c8ab9cb210e1ae0747f1a
+  data.tar.gz: 95469bdc023a41ab08f4b0d6df776b95ef733949a87fb5e053f9bf0b1d74c75f35cde9b89335fe12ea486a214364cddcb9c263c7340bbdbdbd5f16f541f66653
data/lib/bricolage/streamingload/{objectbuffer.rb → chunkbuffer.rb}
RENAMED
@@ -1,43 +1,12 @@
-require 'bricolage/streamingload/
+require 'bricolage/streamingload/loadtask'
+require 'bricolage/streamingload/chunk'
 require 'bricolage/sqlutils'
-require 'forwardable'
 
 module Bricolage
 
   module StreamingLoad
 
-    class
-
-      extend Forwardable
-
-      def initialize(event, components)
-        @event = event
-        @components = components
-      end
-
-      attr_reader :event
-
-      def_delegator '@event', :url
-      def_delegator '@event', :size
-      def_delegator '@event', :message_id
-      def_delegator '@event', :receipt_handle
-      def_delegator '@components', :schema_name
-      def_delegator '@components', :table_name
-
-      def data_source_id
-        "#{schema_name}.#{table_name}"
-      end
-
-      alias qualified_name data_source_id
-
-      def event_time
-        @event.time
-      end
-
-    end
-
-
-    class ObjectBuffer
+    class ChunkBuffer
 
       TASK_GENERATION_TIME_LIMIT = 30 #sec
 
@@ -49,74 +18,83 @@ module Bricolage
         @task_generation_time_limit = TASK_GENERATION_TIME_LIMIT
       end
 
-
+      # chunk :: IncomingChunk
+      def save(chunk)
         @ctl_ds.open {|conn|
           suppress_sql_logging {
             conn.transaction {
-              object_id = insert_object(conn,
+              object_id = insert_object(conn, chunk)
               if object_id
                 insert_task_objects(conn, object_id)
               else
-
+                @logger.info "Duplicated object recieved: url=#{chunk.url}"
+                insert_dup_object(conn, chunk)
               end
             }
           }
         }
       end
 
-      # Flushes multiple
-      def
-        task_ids =
-
-
+      # Flushes chunks of multiple streams, which are met conditions.
+      def flush_partial
+        task_ids = nil
+        tasks = nil
+
+        @ctl_ds.open {|conn|
+          warn_slow_task_generation {
            conn.transaction {|txn|
              task_ids = insert_tasks(conn)
-              unless task_ids.empty?
-                update_task_object_mappings(conn, task_ids)
-                log_mapped_object_num(conn, task_ids)
-              end
+              update_task_objects(conn, task_ids) unless task_ids.empty?
            }
          }
+          log_task_ids(task_ids)
+          tasks = load_tasks(conn, task_ids)
        }
-
+        tasks
      end
 
-      # Flushes all
-      #
-      def
-
+      # Flushes all chunks of all stream with no additional conditions,
+      # to create "system checkpoint".
+      def flush_all
+        all_task_ids = []
+        tasks = nil
+
        @ctl_ds.open {|conn|
          conn.transaction {|txn|
-            #
+            # update_task_objects may not consume all saved objects
            # (e.g. there are too many objects for one table), we must create
-            # tasks repeatedly until
-            until (
-
-
-              task_ids.concat ids
+            # tasks repeatedly until all objects are flushed.
+            until (task_ids = insert_tasks(conn, force: true)).empty?
+              update_task_objects(conn, task_ids)
+              all_task_ids.concat task_ids
            end
          }
+          log_task_ids(all_task_ids)
+          tasks = load_tasks(conn, all_task_ids)
        }
-
+        tasks
      end
 
-      # Flushes
-      #
-      def
-
+      # Flushes all chunks of the specified stream with no additional conditions,
+      # to create "stream checkpoint".
+      def flush_stream(stream_name)
+        all_task_ids = []
+        tasks = nil
+
        @ctl_ds.open {|conn|
          conn.transaction {|txn|
-            #
+            # update_task_objects may not consume all saved objects
            # (e.g. there are too many objects for one table), we must create
-            # tasks repeatedly until
-            until (
-
-
-              task_ids.concat ids
+            # tasks repeatedly until all objects are flushed.
+            until (task_ids = insert_tasks_for_stream(conn, stream_name)).empty?
+              update_task_objects(conn, task_ids)
+              all_task_ids.concat task_ids
            end
          }
+          log_task_ids(all_task_ids)
+          tasks = load_tasks(conn, all_task_ids)
        }
-
+        tasks
      end
 
      private
@@ -134,7 +112,7 @@ module Bricolage
             values
             ( #{s obj.url}
             , #{obj.size}
-            , #{s obj.
+            , #{s obj.stream_name}
             , #{s obj.message_id}
             , '#{obj.event_time}' AT TIME ZONE 'JST'
             , current_timestamp
@@ -148,7 +126,6 @@ module Bricolage
       end
 
       def insert_dup_object(conn, obj)
-        @logger.info "Duplicated object recieved: object_url=#{obj.url}"
         conn.update(<<-EndSQL)
             insert into strload_dup_objects
             ( object_url
@@ -161,7 +138,7 @@ module Bricolage
             values
             ( #{s obj.url}
             , #{obj.size}
-            , #{s obj.
+            , #{s obj.stream_name}
             , #{s obj.message_id}
             , '#{obj.event_time}' AT TIME ZONE 'JST'
             , current_timestamp
@@ -184,10 +161,6 @@ module Bricolage
         EndSQL
       end
 
-      def insert_tasks_force(conn)
-        insert_tasks(conn, force: true)
-      end
-
       def insert_tasks(conn, force: false)
         task_ids = conn.query_values(<<-EndSQL)
             insert into strload_tasks
@@ -239,11 +212,10 @@ module Bricolage
             ;
         EndSQL
 
-        log_created_tasks task_ids
         task_ids
       end
 
-      def
+      def insert_tasks_for_stream(conn, stream_name)
         task_ids = conn.query_values(<<-EndSQL)
             insert into strload_tasks
             ( task_class
@@ -273,17 +245,15 @@ module Bricolage
                 using (data_source_id)
             where
                 -- does not check disabled
-                data_source_id = #{s
+                data_source_id = #{s stream_name}
             returning task_id
             ;
         EndSQL
 
-        # It must be 1
-        log_created_tasks(task_ids)
         task_ids
       end
 
-      def
+      def update_task_objects(conn, task_ids)
         conn.update(<<-EndSQL)
             update strload_task_objects dst
             set
@@ -309,25 +279,36 @@ module Bricolage
                 and tsk_obj.object_seq <= tables.load_batch_size
             ;
         EndSQL
+        # UPDATE statement cannot return values
+        nil
       end
 
-      def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      def load_tasks(conn, task_ids)
+        return [] if task_ids.empty?
+
+        records = suppress_sql_logging {
+          conn.query_rows(<<-EndSQL)
+            select
+                t.task_id
+                , t.object_id
+                , o.object_url
+                , o.object_size
+            from
+                strload_task_objects t
+                inner join strload_objects o using (object_id)
+            where
+                task_id in (#{task_ids.join(',')})
+            ;
+          EndSQL
+        }
+
+        records.group_by {|row| row['task_id'] }.map {|task_id, rows|
+          chunks = rows.map {|row|
+            id, url, size = row.values_at('object_id', 'object_url', 'object_size')
+            Chunk.new(id: id, url: url, size: size)
+          }
+          LoadTask.new(id: task_id, chunks: chunks)
+        }
      end
 
      def suppress_sql_logging
@@ -341,7 +322,7 @@ module Bricolage
         end
       end
 
-      def
+      def log_task_ids(task_ids)
         created_task_num = task_ids.size
         @logger.info "Number of task created: #{created_task_num}"
         @logger.info "Created task ids: #{task_ids}" if created_task_num > 0
data/lib/bricolage/streamingload/chunkrouter.rb
ADDED
@@ -0,0 +1,57 @@
+require 'bricolage/streamingload/incomingchunk'
+
+module Bricolage
+
+  module StreamingLoad
+
+    class ChunkRoutingFailed < StandardError; end
+
+
+    class ChunkRouter
+
+      def ChunkRouter.for_config(configs)
+        new(configs.map {|c|
+          Route.new(url: c.fetch('url'), schema: c.fetch('schema'), table: c.fetch('table'))
+        })
+      end
+
+      def initialize(routes)
+        @routes = routes
+      end
+
+      def route(msg)
+        @routes.each do |route|
+          stream_name = route.match(msg.url)
+          return IncomingChunk.new(msg, stream_name) if stream_name
+        end
+        raise ChunkRoutingFailed, "could not detect stream name: #{url.inspect}"
+      end
+
+      class Route
+        def initialize(url:, schema:, table:)
+          @url_pattern = /\A#{url}\z/
+          @schema = schema
+          @table = table
+        end
+
+        def match(url)
+          m = @url_pattern.match(url) or return nil
+          c1 = get_component(m, @schema)
+          c2 = get_component(m, @table)
+          "#{c1}.#{c2}"
+        end
+
+        def get_component(m, label)
+          if /\A%/ =~ label
+            m[label[1..-1]]
+          else
+            label
+          end
+        end
+      end
+
+    end
+
+  end # module StreamingLoad
+
+end # module Bricolage
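
ChunkRouter is the successor of URLPatterns (deleted at the bottom of this diff) and keeps its config contract: each route pairs a url regexp with schema/table labels, and get_component resolves a label starting with % from the pattern's named capture groups. A sketch with a made-up bucket layout, mirroring the config shape used in test_dispatcher.rb below; msg stands for any object responding to #url and #chunk, e.g. an S3ObjectDispatcherMessage:

  require 'bricolage/streamingload/chunkrouter'

  router = Bricolage::StreamingLoad::ChunkRouter.for_config([
    { 'url'    => %r<\As3://example-bucket/(?<schema>\w+)\.(?<table>\w+)/.*\.json\.gz>.source,
      'schema' => '%schema',   # resolved from the (?<schema>...) capture
      'table'  => '%table' }   # resolved from the (?<table>...) capture
  ])

  chunk = router.route(msg)   # wraps msg into an IncomingChunk
  chunk.stream_name           #=> e.g. "testschema.desttable"
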
data/lib/bricolage/streamingload/dispatcher.rb
CHANGED
@@ -3,9 +3,10 @@ require 'bricolage/exception'
 require 'bricolage/version'
 require 'bricolage/sqsdatasource'
 require 'bricolage/logger'
-require 'bricolage/streamingload/
-require 'bricolage/streamingload/
-require 'bricolage/streamingload/
+require 'bricolage/streamingload/dispatchermessage'
+require 'bricolage/streamingload/loadermessage'
+require 'bricolage/streamingload/chunkrouter'
+require 'bricolage/streamingload/chunkbuffer'
 require 'bricolage/streamingload/alertinglogger'
 require 'aws-sdk'
 require 'yaml'
@@ -40,18 +41,18 @@ module Bricolage
        )
      end
 
-
+      chunk_buffer = ChunkBuffer.new(
        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_ctl')),
        logger: logger
      )
 
-
+      chunk_router = ChunkRouter.for_config(config.fetch('url_patterns'))
 
      dispatcher = Dispatcher.new(
        event_queue: event_queue,
        task_queue: task_queue,
-
-
+        chunk_router: chunk_router,
+        chunk_buffer: chunk_buffer,
        dispatch_interval: config.fetch('dispatch-interval', 60),
        logger: logger
      )
@@ -60,6 +61,8 @@ module Bricolage
      create_pid_file opts.pid_file_path if opts.pid_file_path
      Dir.chdir '/'
      dispatcher.event_loop
+    rescue SystemExit
+      ;
    rescue Exception => e
      logger.exception e
      logger.error "dispatcher abort: pid=#{$$}"
@@ -82,11 +85,11 @@ module Bricolage
      # ignore
    end
 
-    def initialize(event_queue:, task_queue:,
+    def initialize(event_queue:, task_queue:, chunk_router:, chunk_buffer:, dispatch_interval:, logger:)
      @event_queue = event_queue
      @task_queue = task_queue
-      @
-      @
+      @chunk_router = chunk_router
+      @chunk_buffer = chunk_buffer
      @dispatch_interval = dispatch_interval
      @dispatch_message_id = nil
      @logger = logger
@@ -99,7 +102,7 @@ module Bricolage
    def event_loop
      logger.info "*** dispatcher started: pid=#{$$}"
      set_dispatch_timer
-      @event_queue.handle_messages(handler: self, message_class:
+      @event_queue.handle_messages(handler: self, message_class: DispatcherMessage)
      @event_queue.process_async_delete_force
      logger.info "*** shutdown gracefully: pid=#{$$}"
    end
@@ -111,86 +114,90 @@ module Bricolage
 
      if @dispatch_requested
        logger.info "*** dispatch requested"
-
+        do_handle_dispatch
        @dispatch_requested = false
      end
 
      if @checkpoint_requested
-
+        do_handle_checkpoint
        @checkpoint_requested = false   # is needless, but reset it just in case
      end
    end
 
-    def handle_unknown(
-      logger.warn "unknown event: #{
-      @event_queue.delete_message_async(
+    def handle_unknown(msg)
+      logger.warn "unknown event: #{msg.message_body}"
+      @event_queue.delete_message_async(msg)
    end
 
-    def handle_shutdown(
+    def handle_shutdown(msg)
      logger.info "*** shutdown requested"
      @event_queue.initiate_terminate
      # Delete this event immediately
-      @event_queue.delete_message(
+      @event_queue.delete_message(msg)
    end
 
-    def handle_checkpoint(
+    def handle_checkpoint(msg)
      # Delay creating CHECKPOINT after the current message batch,
      # because any other extra events are already received.
      @checkpoint_requested = true
      # Delete this event immediately
-      @event_queue.delete_message(
+      @event_queue.delete_message(msg)
    end
 
-    def
+    def do_handle_checkpoint
      logger.info "*** checkpoint requested"
      logger.info "Force-flushing all objects..."
-      tasks = @
-
+      tasks = @chunk_buffer.flush_all
+      dispatch_tasks tasks
      logger.info "All objects flushed; shutting down..."
      @event_queue.initiate_terminate
    end
 
-    def handle_data(
-      unless
-        @event_queue.delete_message_async(
+    def handle_data(msg)
+      unless msg.created_event?
+        @event_queue.delete_message_async(msg)
        return
      end
-
-      @
-      @event_queue.delete_message_async(
+      chunk = @chunk_router.route(msg)
+      @chunk_buffer.save(chunk)
+      @event_queue.delete_message_async(msg)
    end
 
-    def handle_dispatch(
+    def handle_dispatch(msg)
      # Dispatching tasks may takes 10 minutes or more, it can exceeds visibility timeout.
      # To avoid this, delay dispatching until all events of current message batch are processed.
-      if @dispatch_message_id ==
+      if @dispatch_message_id == msg.message_id
        @dispatch_requested = true
      end
-      @event_queue.delete_message_async(
+      @event_queue.delete_message_async(msg)
    end
 
-    def
-      tasks = @
-
+    def do_handle_dispatch
+      tasks = @chunk_buffer.flush_partial
+      dispatch_tasks tasks
      set_dispatch_timer
    end
 
    def set_dispatch_timer
-      res = @event_queue.send_message(
+      res = @event_queue.send_message(DispatchDispatcherMessage.create(delay_seconds: @dispatch_interval))
      @dispatch_message_id = res.message_id
    end
 
-    def handle_flushtable(
-
-
+    def handle_flushtable(msg)
+      # FIXME: badly named attribute. table_name is really stream_name, which is called as data_source_id, too.
+      stream_name = msg.table_name
+
+      logger.info "*** flushtable requested: stream_name=#{stream_name}"
+      tasks = @chunk_buffer.flush_stream(stream_name)
+      dispatch_tasks tasks
      # Delete this event immediately
-      @event_queue.delete_message(
+      @event_queue.delete_message(msg)
    end
 
-    def
+    def dispatch_tasks(tasks)
      tasks.each do |task|
-
+        msg = StreamingLoadV3LoaderMessage.for_load_task(task)
+        @task_queue.put msg
      end
    end
 
@@ -207,9 +214,6 @@ module Bricolage
      @rest_arguments = nil
 
      @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
-      opts.on('--task-id=id', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
-        @task_id = task_id
-      }
      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{Context::DEFAULT_ENV}]") {|env|
        @environment = env
      }
data/lib/bricolage/streamingload/{event.rb → dispatchermessage.rb}
RENAMED
@@ -1,18 +1,19 @@
+require 'bricolage/streamingload/chunk'
 require 'bricolage/sqsdatasource'
 
 module Bricolage
 
   module StreamingLoad
 
-    class
+    class DispatcherMessage < SQSMessage
 
-      def
+      def DispatcherMessage.get_concrete_class(msg, rec)
        case
-        when rec['eventName'] == 'shutdown' then
-        when rec['eventName'] == 'dispatch' then
-        when rec['eventName'] == 'flushtable' then
-        when rec['eventName'] == 'checkpoint' then
-        when !!rec['s3'] then
+        when rec['eventName'] == 'shutdown' then ShutdownDispatcherMessage
+        when rec['eventName'] == 'dispatch' then DispatchDispatcherMessage
+        when rec['eventName'] == 'flushtable' then FlushTableDispatcherMessage
+        when rec['eventName'] == 'checkpoint' then CheckPointDispatcherMessage
+        when !!rec['s3'] then S3ObjectDispatcherMessage
        else UnknownSQSMessage
        end
      end
@@ -28,13 +29,13 @@ module Bricolage
      end
 
 
-    class
+    class ShutdownDispatcherMessage < DispatcherMessage
 
-      def
+      def ShutdownDispatcherMessage.create
        super name: 'shutdown'
      end
 
-      def
+      def ShutdownDispatcherMessage.parse_sqs_record(msg, rec)
        {}
      end
 
@@ -47,13 +48,13 @@ module Bricolage
 
 
    # Flushes all tables and shutdown
-    class
+    class CheckPointDispatcherMessage < DispatcherMessage
 
-      def
+      def CheckPointDispatcherMessage.create
        super name: 'checkpoint'
      end
 
-      def
+      def CheckPointDispatcherMessage.parse_sqs_record(msg, rec)
        {}
      end
 
@@ -65,13 +66,13 @@ module Bricolage
      end
 
 
-    class
+    class FlushTableDispatcherMessage < DispatcherMessage
 
-      def
+      def FlushTableDispatcherMessage.create(table_name:)
        super name: 'flushtable', table_name: table_name
      end
 
-      def
+      def FlushTableDispatcherMessage.parse_sqs_record(msg, rec)
        {
          table_name: rec['tableName']
        }
@@ -94,9 +95,9 @@ module Bricolage
    end
 
 
-    class
+    class DispatchDispatcherMessage < DispatcherMessage
 
-      def
+      def DispatchDispatcherMessage.create(delay_seconds:)
        super name: 'dispatch', delay_seconds: delay_seconds
      end
 
@@ -108,9 +109,9 @@ module Bricolage
    end
 
 
-    class
+    class S3ObjectDispatcherMessage < DispatcherMessage
 
-      def
+      def S3ObjectDispatcherMessage.parse_sqs_record(msg, rec)
        {
          region: rec['awsRegion'],
          bucket: rec['s3']['bucket']['name'],
@@ -144,12 +145,12 @@ module Bricolage
        true
      end
 
-      def
+      def created_event?
        !!(/\AObjectCreated:(?!Copy)/ =~ @name)
      end
 
-      def
-
+      def chunk
+        Chunk.new(id: nil, url: url, size: size)
      end
 
    end
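
get_concrete_class keys purely off the record hash: a control eventName selects one of the four control messages, and a record carrying an s3 key (as S3 ObjectCreated notifications do) becomes an S3ObjectDispatcherMessage. Since the case expression never reads msg, a quick check can pass nil for it; the field values below are illustrative only:

  require 'bricolage/streamingload/dispatchermessage'
  include Bricolage::StreamingLoad

  DispatcherMessage.get_concrete_class(nil, {'eventName' => 'flushtable', 'tableName' => 'testschema.desttable'})
  #=> FlushTableDispatcherMessage

  rec = {'eventName' => 'ObjectCreated:Put',
         's3' => {'bucket' => {'name' => 'example-bucket'}}}
  DispatcherMessage.get_concrete_class(nil, rec)
  #=> S3ObjectDispatcherMessage
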
data/lib/bricolage/streamingload/incomingchunk.rb
ADDED
@@ -0,0 +1,35 @@
+require 'forwardable'
+
+module Bricolage
+
+  module StreamingLoad
+
+    # a Chunk which is not saved yet (received from SQS)
+    class IncomingChunk
+
+      extend Forwardable
+
+      def initialize(message, stream_name)
+        @chunk = message.chunk
+        @message = message
+        @stream_name = stream_name
+      end
+
+      def_delegator '@chunk', :id
+      def_delegator '@chunk', :url
+      def_delegator '@chunk', :size
+
+      def_delegator '@message', :message_id
+      def_delegator '@message', :receipt_handle
+
+      def event_time
+        @message.time
+      end
+
+      attr_reader :stream_name
+
+    end
+
+  end
+
+end
data/lib/bricolage/streamingload/{task.rb → loadermessage.rb}
RENAMED
@@ -4,11 +4,11 @@ module Bricolage
 
   module StreamingLoad
 
-    class
+    class LoaderMessage < SQSMessage
 
-      def
+      def LoaderMessage.get_concrete_class(msg, rec)
        case
-        when rec['eventName'] == 'streaming_load_v3' then
+        when rec['eventName'] == 'streaming_load_v3' then StreamingLoadV3LoaderMessage
        else UnknownSQSMessage
        end
      end
@@ -24,13 +24,17 @@ module Bricolage
      end
 
 
-    class
+    class StreamingLoadV3LoaderMessage < LoaderMessage
 
-      def
+      def StreamingLoadV3LoaderMessage.for_load_task(load_task)
+        create(task_id: load_task.id)
+      end
+
+      def StreamingLoadV3LoaderMessage.create(task_id:, force: false)
        super name: 'streaming_load_v3', task_id: task_id, force: force
      end
 
-      def
+      def StreamingLoadV3LoaderMessage.parse_sqs_record(msg, rec)
        {
          task_id: rec['taskId'],
          force: (rec['force'].to_s == 'true')
data/lib/bricolage/streamingload/taskhandler.rb
CHANGED
@@ -1,6 +1,6 @@
 require 'bricolage/context'
 require 'bricolage/sqsdatasource'
-require 'bricolage/streamingload/
+require 'bricolage/streamingload/loadermessage'
 require 'bricolage/streamingload/job'
 require 'bricolage/streamingload/alertinglogger'
 require 'bricolage/logger'
@@ -113,7 +113,7 @@ module Bricolage
    end
 
    def event_loop
-      @task_queue.handle_messages(handler: self, message_class:
+      @task_queue.handle_messages(handler: self, message_class: LoaderMessage)
    end
 
    # message handler
data/test/streamingload/test_dispatcher.rb
CHANGED
@@ -3,6 +3,8 @@ require 'bricolage/context'
 require 'bricolage/sqsdatasource'
 require 'bricolage/sqsmock'
 require 'bricolage/streamingload/dispatcher'
+require 'bricolage/streamingload/chunkrouter'
+require 'bricolage/streamingload/chunkbuffer'
 
 module Bricolage
   module StreamingLoad
@@ -35,12 +37,12 @@ module Bricolage
 
      task_queue = SQSDataSource.new_mock
 
-
+      chunk_buffer = ChunkBuffer.new(
        control_data_source: ctl_ds,
        logger: ctx.logger
      )
 
-
+      chunk_router = ChunkRouter.for_config([
        {
          "url" => %r<\As3://test-bucket/testschema\.desttable/datafile-\d{4}\.json\.gz>.source,
          "schema" => 'testschema',
@@ -51,8 +53,8 @@ module Bricolage
      dispatcher = Dispatcher.new(
        event_queue: event_queue,
        task_queue: task_queue,
-
-
+        chunk_buffer: chunk_buffer,
+        chunk_router: chunk_router,
        dispatch_interval: 600,
        logger: ctx.logger
      )
@@ -127,12 +129,12 @@ module Bricolage
 
      task_queue = SQSDataSource.new_mock
 
-
+      chunk_buffer = ChunkBuffer.new(
        control_data_source: ctl_ds,
        logger: ctx.logger
      )
 
-
+      chunk_router = ChunkRouter.for_config([
        {
          "url" => %r<\As3://test-bucket/testschema\.(?<table>\w+)/datafile-\d{4}\.json\.gz>.source,
          "schema" => 'testschema',
@@ -143,8 +145,8 @@ module Bricolage
      dispatcher = Dispatcher.new(
        event_queue: event_queue,
        task_queue: task_queue,
-
-
+        chunk_buffer: chunk_buffer,
+        chunk_router: chunk_router,
        dispatch_interval: 600,
        logger: ctx.logger
      )
data/test/streamingload/{test_event.rb → test_dispatchermessage.rb}
RENAMED
@@ -1,12 +1,12 @@
 require 'test/unit'
-require 'bricolage/streamingload/
+require 'bricolage/streamingload/dispatchermessage'
 
 module Bricolage::StreamingLoad
 
-  class
+  class TestDispatcherMessage < Test::Unit::TestCase
 
    def new_s3event(message_id: nil, receipt_handle: nil, name: nil, time: nil, source: nil, region: nil, bucket: nil, key: nil, size: nil)
-
+      S3ObjectDispatcherMessage.new(
        message_id: message_id,
        receipt_handle: receipt_handle,
        name: name,
@@ -21,9 +21,9 @@ module Bricolage::StreamingLoad
 
    test "#created?" do
      e = new_s3event(name: "ObjectCreated:Put")
-      assert_true e.
+      assert_true e.created_event?
      e = new_s3event(name: "ObjectCreated:Copy")
-      assert_false e.
+      assert_false e.created_event?
    end
 
  end
data/test/test_sqsdatasource.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 0.13.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2018-01-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage
@@ -120,19 +120,22 @@ files:
 - lib/bricolage/sqsdatasource.rb
 - lib/bricolage/sqsmock.rb
 - lib/bricolage/streamingload/alertinglogger.rb
+- lib/bricolage/streamingload/chunk.rb
+- lib/bricolage/streamingload/chunkbuffer.rb
+- lib/bricolage/streamingload/chunkrouter.rb
 - lib/bricolage/streamingload/dispatcher.rb
-- lib/bricolage/streamingload/event.rb
+- lib/bricolage/streamingload/dispatchermessage.rb
+- lib/bricolage/streamingload/incomingchunk.rb
 - lib/bricolage/streamingload/job.rb
 - lib/bricolage/streamingload/jobparams.rb
+- lib/bricolage/streamingload/loadermessage.rb
+- lib/bricolage/streamingload/loadtask.rb
 - lib/bricolage/streamingload/manifest.rb
-- lib/bricolage/streamingload/objectbuffer.rb
-- lib/bricolage/streamingload/task.rb
 - lib/bricolage/streamingload/taskhandler.rb
-- lib/bricolage/streamingload/urlpatterns.rb
 - lib/bricolage/streamingload/version.rb
 - test/all.rb
 - test/streamingload/test_dispatcher.rb
-- test/streamingload/test_event.rb
+- test/streamingload/test_dispatchermessage.rb
 - test/streamingload/test_job.rb
 - test/test_sqsdatasource.rb
 homepage: https://github.com/aamine/bricolage-streamingload
data/lib/bricolage/streamingload/urlpatterns.rb
DELETED
@@ -1,59 +0,0 @@
-module Bricolage
-
-  module StreamingLoad
-
-    class URLPatternNotMatched < StandardError; end
-
-
-    class URLPatterns
-
-      def URLPatterns.for_config(configs)
-        new(configs.map {|c|
-          Pattern.new(url: c.fetch('url'), schema: c.fetch('schema'), table: c.fetch('table'))
-        })
-      end
-
-      def initialize(patterns)
-        @patterns = patterns
-      end
-
-      def match(url)
-        @patterns.each do |pat|
-          components = pat.match(url)
-          return components if components
-        end
-        raise URLPatternNotMatched, "no URL pattern matches the object url: #{url.inspect}"
-      end
-
-      class Pattern
-        def initialize(url:, schema:, table:)
-          @url_pattern = /\A#{url}\z/
-          @schema = schema
-          @table = table
-        end
-
-        attr_reader :url_pattern
-        attr_reader :schema
-        attr_reader :table
-
-        def match(url)
-          m = @url_pattern.match(url) or return nil
-          Components.new(get_component(m, @schema), get_component(m, @table))
-        end
-
-        def get_component(m, label)
-          if /\A%/ =~ label
-            m[label[1..-1]]
-          else
-            label
-          end
-        end
-      end
-
-      Components = Struct.new(:schema_name, :table_name)
-
-    end
-
-  end # module StreamingLoad
-
-end # module Bricolage