bricolage-streamingload 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bricolage/streamingload/chunk.rb +19 -0
- data/lib/bricolage/streamingload/{objectbuffer.rb → chunkbuffer.rb} +81 -100
- data/lib/bricolage/streamingload/chunkrouter.rb +57 -0
- data/lib/bricolage/streamingload/dispatcher.rb +50 -46
- data/lib/bricolage/streamingload/{event.rb → dispatchermessage.rb} +24 -23
- data/lib/bricolage/streamingload/incomingchunk.rb +35 -0
- data/lib/bricolage/streamingload/{task.rb → loadermessage.rb} +10 -6
- data/lib/bricolage/streamingload/loadtask.rb +17 -0
- data/lib/bricolage/streamingload/taskhandler.rb +2 -2
- data/lib/bricolage/streamingload/version.rb +1 -1
- data/test/streamingload/test_dispatcher.rb +10 -8
- data/test/streamingload/{test_event.rb → test_dispatchermessage.rb} +5 -5
- data/test/test_sqsdatasource.rb +0 -1
- metadata +10 -7
- data/lib/bricolage/streamingload/urlpatterns.rb +0 -59
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1aacc6209260b4c74e823fc9ea903bdd811c4d2b
+  data.tar.gz: c0b869e51e67a22708dd36b22afefb275e9a3169
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b6721bcb5c9dd506c801e71970e450e6a9318eac5171df099906ef91236e516a14f155cdd952240a534b80ab82771a88e22db49b309c8ab9cb210e1ae0747f1a
+  data.tar.gz: 95469bdc023a41ab08f4b0d6df776b95ef733949a87fb5e053f9bf0b1d74c75f35cde9b89335fe12ea486a214364cddcb9c263c7340bbdbdbd5f16f541f66653
data/lib/bricolage/streamingload/{objectbuffer.rb → chunkbuffer.rb}
RENAMED
@@ -1,43 +1,12 @@
-require 'bricolage/streamingload/
+require 'bricolage/streamingload/loadtask'
+require 'bricolage/streamingload/chunk'
 require 'bricolage/sqlutils'
-require 'forwardable'
 
 module Bricolage
 
   module StreamingLoad
 
-    class
-
-      extend Forwardable
-
-      def initialize(event, components)
-        @event = event
-        @components = components
-      end
-
-      attr_reader :event
-
-      def_delegator '@event', :url
-      def_delegator '@event', :size
-      def_delegator '@event', :message_id
-      def_delegator '@event', :receipt_handle
-      def_delegator '@components', :schema_name
-      def_delegator '@components', :table_name
-
-      def data_source_id
-        "#{schema_name}.#{table_name}"
-      end
-
-      alias qualified_name data_source_id
-
-      def event_time
-        @event.time
-      end
-
-    end
-
-
-    class ObjectBuffer
+    class ChunkBuffer
 
       TASK_GENERATION_TIME_LIMIT = 30 #sec
 
@@ -49,74 +18,83 @@ module Bricolage
        @task_generation_time_limit = TASK_GENERATION_TIME_LIMIT
      end
 
-
+      # chunk :: IncomingChunk
+      def save(chunk)
        @ctl_ds.open {|conn|
          suppress_sql_logging {
            conn.transaction {
-              object_id = insert_object(conn,
+              object_id = insert_object(conn, chunk)
              if object_id
                insert_task_objects(conn, object_id)
              else
-
+                @logger.info "Duplicated object recieved: url=#{chunk.url}"
+                insert_dup_object(conn, chunk)
              end
            }
          }
        }
      end
 
-      # Flushes multiple
-      def
-        task_ids =
-
-
+      # Flushes chunks of multiple streams, which are met conditions.
+      def flush_partial
+        task_ids = nil
+        tasks = nil
+
+        @ctl_ds.open {|conn|
+          warn_slow_task_generation {
            conn.transaction {|txn|
              task_ids = insert_tasks(conn)
-              unless task_ids.empty?
-                update_task_object_mappings(conn, task_ids)
-                log_mapped_object_num(conn, task_ids)
-              end
+              update_task_objects(conn, task_ids) unless task_ids.empty?
            }
          }
+          log_task_ids(task_ids)
+          tasks = load_tasks(conn, task_ids)
        }
-
+        tasks
      end
 
-      # Flushes all
-      #
-      def
-
+      # Flushes all chunks of all stream with no additional conditions,
+      # to create "system checkpoint".
+      def flush_all
+        all_task_ids = []
+        tasks = nil
+
        @ctl_ds.open {|conn|
          conn.transaction {|txn|
-            #
+            # update_task_objects may not consume all saved objects
            # (e.g. there are too many objects for one table), we must create
-            # tasks repeatedly until
-            until (
-
-
-              task_ids.concat ids
+            # tasks repeatedly until all objects are flushed.
+            until (task_ids = insert_tasks(conn, force: true)).empty?
+              update_task_objects(conn, task_ids)
+              all_task_ids.concat task_ids
            end
          }
+          log_task_ids(all_task_ids)
+          tasks = load_tasks(conn, all_task_ids)
        }
-
+        tasks
      end
 
-      # Flushes
-      #
-      def
-
+      # Flushes all chunks of the specified stream with no additional conditions,
+      # to create "stream checkpoint".
+      def flush_stream(stream_name)
+        all_task_ids = []
+        tasks = nil
+
        @ctl_ds.open {|conn|
          conn.transaction {|txn|
-            #
+            # update_task_objects may not consume all saved objects
            # (e.g. there are too many objects for one table), we must create
-            # tasks repeatedly until
-            until (
-
-
-              task_ids.concat ids
+            # tasks repeatedly until all objects are flushed.
+            until (task_ids = insert_tasks_for_stream(conn, stream_name)).empty?
+              update_task_objects(conn, task_ids)
+              all_task_ids.concat task_ids
            end
          }
+          log_task_ids(all_task_ids)
+          tasks = load_tasks(conn, all_task_ids)
        }
-
+        tasks
      end
 
      private
@@ -134,7 +112,7 @@ module Bricolage
          values
              ( #{s obj.url}
              , #{obj.size}
-              , #{s obj.
+              , #{s obj.stream_name}
              , #{s obj.message_id}
              , '#{obj.event_time}' AT TIME ZONE 'JST'
              , current_timestamp
@@ -148,7 +126,6 @@ module Bricolage
      end
 
      def insert_dup_object(conn, obj)
-        @logger.info "Duplicated object recieved: object_url=#{obj.url}"
        conn.update(<<-EndSQL)
          insert into strload_dup_objects
              ( object_url
@@ -161,7 +138,7 @@ module Bricolage
          values
              ( #{s obj.url}
              , #{obj.size}
-              , #{s obj.
+              , #{s obj.stream_name}
              , #{s obj.message_id}
              , '#{obj.event_time}' AT TIME ZONE 'JST'
              , current_timestamp
@@ -184,10 +161,6 @@ module Bricolage
        EndSQL
      end
 
-      def insert_tasks_force(conn)
-        insert_tasks(conn, force: true)
-      end
-
      def insert_tasks(conn, force: false)
        task_ids = conn.query_values(<<-EndSQL)
          insert into strload_tasks
@@ -239,11 +212,10 @@ module Bricolage
          ;
        EndSQL
 
-        log_created_tasks task_ids
        task_ids
      end
 
-      def
+      def insert_tasks_for_stream(conn, stream_name)
        task_ids = conn.query_values(<<-EndSQL)
          insert into strload_tasks
              ( task_class
@@ -273,17 +245,15 @@ module Bricolage
              using (data_source_id)
          where
              -- does not check disabled
-              data_source_id = #{s
+              data_source_id = #{s stream_name}
          returning task_id
          ;
        EndSQL
 
-        # It must be 1
-        log_created_tasks(task_ids)
        task_ids
      end
 
-      def
+      def update_task_objects(conn, task_ids)
        conn.update(<<-EndSQL)
          update strload_task_objects dst
          set
@@ -309,25 +279,36 @@ module Bricolage
              and tsk_obj.object_seq <= tables.load_batch_size
          ;
        EndSQL
+        # UPDATE statement cannot return values
+        nil
      end
 
-      def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      def load_tasks(conn, task_ids)
+        return [] if task_ids.empty?
+
+        records = suppress_sql_logging {
+          conn.query_rows(<<-EndSQL)
+            select
+                t.task_id
+                , t.object_id
+                , o.object_url
+                , o.object_size
+            from
+                strload_task_objects t
+                inner join strload_objects o using (object_id)
+            where
+                task_id in (#{task_ids.join(',')})
+            ;
+          EndSQL
+        }
+
+        records.group_by {|row| row['task_id'] }.map {|task_id, rows|
+          chunks = rows.map {|row|
+            id, url, size = row.values_at('object_id', 'object_url', 'object_size')
+            Chunk.new(id: id, url: url, size: size)
+          }
+          LoadTask.new(id: task_id, chunks: chunks)
+        }
      end
 
      def suppress_sql_logging
@@ -341,7 +322,7 @@ module Bricolage
        end
      end
 
-      def
+      def log_task_ids(task_ids)
        created_task_num = task_ids.size
        @logger.info "Number of task created: #{created_task_num}"
        @logger.info "Created task ids: #{task_ids}" if created_task_num > 0
data/lib/bricolage/streamingload/chunkrouter.rb
ADDED
@@ -0,0 +1,57 @@
+require 'bricolage/streamingload/incomingchunk'
+
+module Bricolage
+
+  module StreamingLoad
+
+    class ChunkRoutingFailed < StandardError; end
+
+
+    class ChunkRouter
+
+      def ChunkRouter.for_config(configs)
+        new(configs.map {|c|
+          Route.new(url: c.fetch('url'), schema: c.fetch('schema'), table: c.fetch('table'))
+        })
+      end
+
+      def initialize(routes)
+        @routes = routes
+      end
+
+      def route(msg)
+        @routes.each do |route|
+          stream_name = route.match(msg.url)
+          return IncomingChunk.new(msg, stream_name) if stream_name
+        end
+        raise ChunkRoutingFailed, "could not detect stream name: #{url.inspect}"
+      end
+
+      class Route
+        def initialize(url:, schema:, table:)
+          @url_pattern = /\A#{url}\z/
+          @schema = schema
+          @table = table
+        end
+
+        def match(url)
+          m = @url_pattern.match(url) or return nil
+          c1 = get_component(m, @schema)
+          c2 = get_component(m, @table)
+          "#{c1}.#{c2}"
+        end
+
+        def get_component(m, label)
+          if /\A%/ =~ label
+            m[label[1..-1]]
+          else
+            label
+          end
+        end
+      end
+
+    end
+
+  end # module StreamingLoad
+
+end # module Bricolage
data/lib/bricolage/streamingload/dispatcher.rb
CHANGED
@@ -3,9 +3,10 @@ require 'bricolage/exception'
 require 'bricolage/version'
 require 'bricolage/sqsdatasource'
 require 'bricolage/logger'
-require 'bricolage/streamingload/
-require 'bricolage/streamingload/
-require 'bricolage/streamingload/
+require 'bricolage/streamingload/dispatchermessage'
+require 'bricolage/streamingload/loadermessage'
+require 'bricolage/streamingload/chunkrouter'
+require 'bricolage/streamingload/chunkbuffer'
 require 'bricolage/streamingload/alertinglogger'
 require 'aws-sdk'
 require 'yaml'
@@ -40,18 +41,18 @@ module Bricolage
      )
    end
 
-
+      chunk_buffer = ChunkBuffer.new(
        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_ctl')),
        logger: logger
      )
 
-
+      chunk_router = ChunkRouter.for_config(config.fetch('url_patterns'))
 
      dispatcher = Dispatcher.new(
        event_queue: event_queue,
        task_queue: task_queue,
-
-
+        chunk_router: chunk_router,
+        chunk_buffer: chunk_buffer,
        dispatch_interval: config.fetch('dispatch-interval', 60),
        logger: logger
      )
@@ -60,6 +61,8 @@ module Bricolage
      create_pid_file opts.pid_file_path if opts.pid_file_path
      Dir.chdir '/'
      dispatcher.event_loop
+    rescue SystemExit
+      ;
    rescue Exception => e
      logger.exception e
      logger.error "dispatcher abort: pid=#{$$}"
@@ -82,11 +85,11 @@ module Bricolage
      # ignore
    end
 
-    def initialize(event_queue:, task_queue:,
+    def initialize(event_queue:, task_queue:, chunk_router:, chunk_buffer:, dispatch_interval:, logger:)
      @event_queue = event_queue
      @task_queue = task_queue
-      @
-      @
+      @chunk_router = chunk_router
+      @chunk_buffer = chunk_buffer
      @dispatch_interval = dispatch_interval
      @dispatch_message_id = nil
      @logger = logger
@@ -99,7 +102,7 @@ module Bricolage
    def event_loop
      logger.info "*** dispatcher started: pid=#{$$}"
      set_dispatch_timer
-      @event_queue.handle_messages(handler: self, message_class:
+      @event_queue.handle_messages(handler: self, message_class: DispatcherMessage)
      @event_queue.process_async_delete_force
      logger.info "*** shutdown gracefully: pid=#{$$}"
    end
@@ -111,86 +114,90 @@ module Bricolage
 
      if @dispatch_requested
        logger.info "*** dispatch requested"
-
+        do_handle_dispatch
        @dispatch_requested = false
      end
 
      if @checkpoint_requested
-
+        do_handle_checkpoint
        @checkpoint_requested = false # is needless, but reset it just in case
      end
    end
 
-    def handle_unknown(
-      logger.warn "unknown event: #{
-      @event_queue.delete_message_async(
+    def handle_unknown(msg)
+      logger.warn "unknown event: #{msg.message_body}"
+      @event_queue.delete_message_async(msg)
    end
 
-    def handle_shutdown(
+    def handle_shutdown(msg)
      logger.info "*** shutdown requested"
      @event_queue.initiate_terminate
      # Delete this event immediately
-      @event_queue.delete_message(
+      @event_queue.delete_message(msg)
    end
 
-    def handle_checkpoint(
+    def handle_checkpoint(msg)
      # Delay creating CHECKPOINT after the current message batch,
      # because any other extra events are already received.
      @checkpoint_requested = true
      # Delete this event immediately
-      @event_queue.delete_message(
+      @event_queue.delete_message(msg)
    end
 
-    def
+    def do_handle_checkpoint
      logger.info "*** checkpoint requested"
      logger.info "Force-flushing all objects..."
-      tasks = @
-
+      tasks = @chunk_buffer.flush_all
+      dispatch_tasks tasks
      logger.info "All objects flushed; shutting down..."
      @event_queue.initiate_terminate
    end
 
-    def handle_data(
-      unless
-      @event_queue.delete_message_async(
+    def handle_data(msg)
+      unless msg.created_event?
+        @event_queue.delete_message_async(msg)
        return
      end
-
-      @
-      @event_queue.delete_message_async(
+      chunk = @chunk_router.route(msg)
+      @chunk_buffer.save(chunk)
+      @event_queue.delete_message_async(msg)
    end
 
-    def handle_dispatch(
+    def handle_dispatch(msg)
      # Dispatching tasks may takes 10 minutes or more, it can exceeds visibility timeout.
      # To avoid this, delay dispatching until all events of current message batch are processed.
-      if @dispatch_message_id ==
+      if @dispatch_message_id == msg.message_id
        @dispatch_requested = true
      end
-      @event_queue.delete_message_async(
+      @event_queue.delete_message_async(msg)
    end
 
-    def
-      tasks = @
-
+    def do_handle_dispatch
+      tasks = @chunk_buffer.flush_partial
+      dispatch_tasks tasks
      set_dispatch_timer
    end
 
    def set_dispatch_timer
-      res = @event_queue.send_message(
+      res = @event_queue.send_message(DispatchDispatcherMessage.create(delay_seconds: @dispatch_interval))
      @dispatch_message_id = res.message_id
    end
 
-    def handle_flushtable(
-
-
+    def handle_flushtable(msg)
+      # FIXME: badly named attribute. table_name is really stream_name, which is called as data_source_id, too.
+      stream_name = msg.table_name
+
+      logger.info "*** flushtable requested: stream_name=#{stream_name}"
+      tasks = @chunk_buffer.flush_stream(stream_name)
+      dispatch_tasks tasks
      # Delete this event immediately
-      @event_queue.delete_message(
+      @event_queue.delete_message(msg)
    end
 
-    def
+    def dispatch_tasks(tasks)
      tasks.each do |task|
-
+        msg = StreamingLoadV3LoaderMessage.for_load_task(task)
+        @task_queue.put msg
      end
    end
 
@@ -207,9 +214,6 @@ module Bricolage
      @rest_arguments = nil
 
      @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
-      opts.on('--task-id=id', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
-        @task_id = task_id
-      }
      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{Context::DEFAULT_ENV}]") {|env|
        @environment = env
      }
data/lib/bricolage/streamingload/{event.rb → dispatchermessage.rb}
RENAMED
@@ -1,18 +1,19 @@
+require 'bricolage/streamingload/chunk'
 require 'bricolage/sqsdatasource'
 
 module Bricolage
 
   module StreamingLoad
 
-    class
+    class DispatcherMessage < SQSMessage
 
-      def
+      def DispatcherMessage.get_concrete_class(msg, rec)
        case
-        when rec['eventName'] == 'shutdown' then
-        when rec['eventName'] == 'dispatch' then
-        when rec['eventName'] == 'flushtable' then
-        when rec['eventName'] == 'checkpoint' then
-        when !!rec['s3'] then
+        when rec['eventName'] == 'shutdown' then ShutdownDispatcherMessage
+        when rec['eventName'] == 'dispatch' then DispatchDispatcherMessage
+        when rec['eventName'] == 'flushtable' then FlushTableDispatcherMessage
+        when rec['eventName'] == 'checkpoint' then CheckPointDispatcherMessage
+        when !!rec['s3'] then S3ObjectDispatcherMessage
        else UnknownSQSMessage
        end
      end
@@ -28,13 +29,13 @@ module Bricolage
    end
 
 
-    class
+    class ShutdownDispatcherMessage < DispatcherMessage
 
-      def
+      def ShutdownDispatcherMessage.create
        super name: 'shutdown'
      end
 
-      def
+      def ShutdownDispatcherMessage.parse_sqs_record(msg, rec)
        {}
      end
 
@@ -47,13 +48,13 @@ module Bricolage
 
 
    # Flushes all tables and shutdown
-    class
+    class CheckPointDispatcherMessage < DispatcherMessage
 
-      def
+      def CheckPointDispatcherMessage.create
        super name: 'checkpoint'
      end
 
-      def
+      def CheckPointDispatcherMessage.parse_sqs_record(msg, rec)
        {}
      end
 
@@ -65,13 +66,13 @@ module Bricolage
    end
 
 
-    class
+    class FlushTableDispatcherMessage < DispatcherMessage
 
-      def
+      def FlushTableDispatcherMessage.create(table_name:)
        super name: 'flushtable', table_name: table_name
      end
 
-      def
+      def FlushTableDispatcherMessage.parse_sqs_record(msg, rec)
        {
          table_name: rec['tableName']
        }
@@ -94,9 +95,9 @@ module Bricolage
    end
 
 
-    class
+    class DispatchDispatcherMessage < DispatcherMessage
 
-      def
+      def DispatchDispatcherMessage.create(delay_seconds:)
        super name: 'dispatch', delay_seconds: delay_seconds
      end
 
@@ -108,9 +109,9 @@ module Bricolage
    end
 
 
-    class
+    class S3ObjectDispatcherMessage < DispatcherMessage
 
-      def
+      def S3ObjectDispatcherMessage.parse_sqs_record(msg, rec)
        {
          region: rec['awsRegion'],
          bucket: rec['s3']['bucket']['name'],
@@ -144,12 +145,12 @@ module Bricolage
        true
      end
 
-      def
+      def created_event?
        !!(/\AObjectCreated:(?!Copy)/ =~ @name)
      end
 
-      def
-
+      def chunk
+        Chunk.new(id: nil, url: url, size: size)
      end
 
    end
data/lib/bricolage/streamingload/incomingchunk.rb
ADDED
@@ -0,0 +1,35 @@
+require 'forwardable'
+
+module Bricolage
+
+  module StreamingLoad
+
+    # a Chunk which is not saved yet (received from SQS)
+    class IncomingChunk
+
+      extend Forwardable
+
+      def initialize(message, stream_name)
+        @chunk = message.chunk
+        @message = message
+        @stream_name = stream_name
+      end
+
+      def_delegator '@chunk', :id
+      def_delegator '@chunk', :url
+      def_delegator '@chunk', :size
+
+      def_delegator '@message', :message_id
+      def_delegator '@message', :receipt_handle
+
+      def event_time
+        @message.time
+      end
+
+      attr_reader :stream_name
+
+    end
+
+  end
+
+end
data/lib/bricolage/streamingload/{task.rb → loadermessage.rb}
RENAMED
@@ -4,11 +4,11 @@ module Bricolage
 
   module StreamingLoad
 
-    class
+    class LoaderMessage < SQSMessage
 
-      def
+      def LoaderMessage.get_concrete_class(msg, rec)
        case
-        when rec['eventName'] == 'streaming_load_v3' then
+        when rec['eventName'] == 'streaming_load_v3' then StreamingLoadV3LoaderMessage
        else UnknownSQSMessage
        end
      end
@@ -24,13 +24,17 @@ module Bricolage
    end
 
 
-    class
+    class StreamingLoadV3LoaderMessage < LoaderMessage
 
-      def
+      def StreamingLoadV3LoaderMessage.for_load_task(load_task)
+        create(task_id: load_task.id)
+      end
+
+      def StreamingLoadV3LoaderMessage.create(task_id:, force: false)
        super name: 'streaming_load_v3', task_id: task_id, force: force
      end
 
-      def
+      def StreamingLoadV3LoaderMessage.parse_sqs_record(msg, rec)
        {
          task_id: rec['taskId'],
          force: (rec['force'].to_s == 'true')
data/lib/bricolage/streamingload/taskhandler.rb
CHANGED
@@ -1,6 +1,6 @@
 require 'bricolage/context'
 require 'bricolage/sqsdatasource'
-require 'bricolage/streamingload/task'
+require 'bricolage/streamingload/loadermessage'
 require 'bricolage/streamingload/job'
 require 'bricolage/streamingload/alertinglogger'
 require 'bricolage/logger'
@@ -113,7 +113,7 @@ module Bricolage
    end
 
    def event_loop
-      @task_queue.handle_messages(handler: self, message_class:
+      @task_queue.handle_messages(handler: self, message_class: LoaderMessage)
    end
 
    # message handler
data/test/streamingload/test_dispatcher.rb
CHANGED
@@ -3,6 +3,8 @@ require 'bricolage/context'
 require 'bricolage/sqsdatasource'
 require 'bricolage/sqsmock'
 require 'bricolage/streamingload/dispatcher'
+require 'bricolage/streamingload/chunkrouter'
+require 'bricolage/streamingload/chunkbuffer'
 
 module Bricolage
   module StreamingLoad
@@ -35,12 +37,12 @@ module Bricolage
 
      task_queue = SQSDataSource.new_mock
 
-
+      chunk_buffer = ChunkBuffer.new(
        control_data_source: ctl_ds,
        logger: ctx.logger
      )
 
-
+      chunk_router = ChunkRouter.for_config([
        {
          "url" => %r<\As3://test-bucket/testschema\.desttable/datafile-\d{4}\.json\.gz>.source,
          "schema" => 'testschema',
@@ -51,8 +53,8 @@ module Bricolage
      dispatcher = Dispatcher.new(
        event_queue: event_queue,
        task_queue: task_queue,
-
-
+        chunk_buffer: chunk_buffer,
+        chunk_router: chunk_router,
        dispatch_interval: 600,
        logger: ctx.logger
      )
@@ -127,12 +129,12 @@ module Bricolage
 
      task_queue = SQSDataSource.new_mock
 
-
+      chunk_buffer = ChunkBuffer.new(
        control_data_source: ctl_ds,
        logger: ctx.logger
      )
 
-
+      chunk_router = ChunkRouter.for_config([
        {
          "url" => %r<\As3://test-bucket/testschema\.(?<table>\w+)/datafile-\d{4}\.json\.gz>.source,
          "schema" => 'testschema',
@@ -143,8 +145,8 @@ module Bricolage
      dispatcher = Dispatcher.new(
        event_queue: event_queue,
        task_queue: task_queue,
-
-
+        chunk_buffer: chunk_buffer,
+        chunk_router: chunk_router,
        dispatch_interval: 600,
        logger: ctx.logger
      )
data/test/streamingload/{test_event.rb → test_dispatchermessage.rb}
RENAMED
@@ -1,12 +1,12 @@
 require 'test/unit'
-require 'bricolage/streamingload/event'
+require 'bricolage/streamingload/dispatchermessage'
 
 module Bricolage::StreamingLoad
 
-  class
+  class TestDispatcherMessage < Test::Unit::TestCase
 
    def new_s3event(message_id: nil, receipt_handle: nil, name: nil, time: nil, source: nil, region: nil, bucket: nil, key: nil, size: nil)
-
+      S3ObjectDispatcherMessage.new(
        message_id: message_id,
        receipt_handle: receipt_handle,
        name: name,
@@ -21,9 +21,9 @@ module Bricolage::StreamingLoad
 
    test "#created?" do
      e = new_s3event(name: "ObjectCreated:Put")
-      assert_true e.
+      assert_true e.created_event?
      e = new_s3event(name: "ObjectCreated:Copy")
-      assert_false e.
+      assert_false e.created_event?
    end
 
  end
data/test/test_sqsdatasource.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 0.13.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2018-01-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage
@@ -120,19 +120,22 @@ files:
 - lib/bricolage/sqsdatasource.rb
 - lib/bricolage/sqsmock.rb
 - lib/bricolage/streamingload/alertinglogger.rb
+- lib/bricolage/streamingload/chunk.rb
+- lib/bricolage/streamingload/chunkbuffer.rb
+- lib/bricolage/streamingload/chunkrouter.rb
 - lib/bricolage/streamingload/dispatcher.rb
-- lib/bricolage/streamingload/event.rb
+- lib/bricolage/streamingload/dispatchermessage.rb
+- lib/bricolage/streamingload/incomingchunk.rb
 - lib/bricolage/streamingload/job.rb
 - lib/bricolage/streamingload/jobparams.rb
+- lib/bricolage/streamingload/loadermessage.rb
+- lib/bricolage/streamingload/loadtask.rb
 - lib/bricolage/streamingload/manifest.rb
-- lib/bricolage/streamingload/objectbuffer.rb
-- lib/bricolage/streamingload/task.rb
 - lib/bricolage/streamingload/taskhandler.rb
-- lib/bricolage/streamingload/urlpatterns.rb
 - lib/bricolage/streamingload/version.rb
 - test/all.rb
 - test/streamingload/test_dispatcher.rb
-- test/streamingload/test_event.rb
+- test/streamingload/test_dispatchermessage.rb
 - test/streamingload/test_job.rb
 - test/test_sqsdatasource.rb
 homepage: https://github.com/aamine/bricolage-streamingload
data/lib/bricolage/streamingload/urlpatterns.rb
DELETED
@@ -1,59 +0,0 @@
-module Bricolage
-
-  module StreamingLoad
-
-    class URLPatternNotMatched < StandardError; end
-
-
-    class URLPatterns
-
-      def URLPatterns.for_config(configs)
-        new(configs.map {|c|
-          Pattern.new(url: c.fetch('url'), schema: c.fetch('schema'), table: c.fetch('table'))
-        })
-      end
-
-      def initialize(patterns)
-        @patterns = patterns
-      end
-
-      def match(url)
-        @patterns.each do |pat|
-          components = pat.match(url)
-          return components if components
-        end
-        raise URLPatternNotMatched, "no URL pattern matches the object url: #{url.inspect}"
-      end
-
-      class Pattern
-        def initialize(url:, schema:, table:)
-          @url_pattern = /\A#{url}\z/
-          @schema = schema
-          @table = table
-        end
-
-        attr_reader :url_pattern
-        attr_reader :schema
-        attr_reader :table
-
-        def match(url)
-          m = @url_pattern.match(url) or return nil
-          Components.new(get_component(m, @schema), get_component(m, @table))
-        end
-
-        def get_component(m, label)
-          if /\A%/ =~ label
-            m[label[1..-1]]
-          else
-            label
-          end
-        end
-      end
-
-      Components = Struct.new(:schema_name, :table_name)
-
-    end
-
-  end # module StreamingLoad
-
-end # module Bricolage