bricolage-streamingload 0.12.0 → 0.13.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c8c9c822c1c2d207827cc5d0b6573140254b1638
-  data.tar.gz: 166188b17014074c82a963aa9aecaff796ced70d
+  metadata.gz: 1aacc6209260b4c74e823fc9ea903bdd811c4d2b
+  data.tar.gz: c0b869e51e67a22708dd36b22afefb275e9a3169
 SHA512:
-  metadata.gz: 17e35b2d5d49312d92a4f9f7e232cd6ff9daeb7975d7a1383936a935d12dd22c89550e1fece87920c36cac46fea6aa5517431c7af56e741cb5fe774a21cd7825
-  data.tar.gz: 7556f6eb45b74d041891f3d9254aee4bf32c30d12e9aa61093351f5a08af4c7f6a04926b75a37d8cfbde4c359a50e5efc610cd6ceda605f63ca147e3d96d3285
+  metadata.gz: b6721bcb5c9dd506c801e71970e450e6a9318eac5171df099906ef91236e516a14f155cdd952240a534b80ab82771a88e22db49b309c8ab9cb210e1ae0747f1a
+  data.tar.gz: 95469bdc023a41ab08f4b0d6df776b95ef733949a87fb5e053f9bf0b1d74c75f35cde9b89335fe12ea486a214364cddcb9c263c7340bbdbdbd5f16f541f66653
lib/bricolage/streamingload/chunk.rb ADDED
@@ -0,0 +1,19 @@
+module Bricolage
+
+  module StreamingLoad
+
+    class Chunk
+      def initialize(id:, url:, size: nil)
+        @id = id
+        @url = url
+        @size = size
+      end
+
+      attr_reader :id
+      attr_reader :url
+      attr_reader :size
+    end
+
+  end
+
+end
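
For orientation, a minimal sketch of how the new Chunk value object is used (bucket name, key, and size here are illustrative; id stays nil until ChunkBuffer persists the chunk):

    require 'bricolage/streamingload/chunk'

    # A chunk freshly received from S3 has no database id yet;
    # ChunkBuffer#save assigns one when it inserts into strload_objects.
    chunk = Bricolage::StreamingLoad::Chunk.new(
      id: nil,
      url: 's3://example-bucket/testschema.desttable/datafile-0001.json.gz',
      size: 1024
    )
    chunk.url    # => "s3://example-bucket/testschema.desttable/datafile-0001.json.gz"
    chunk.size   # => 1024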
lib/bricolage/streamingload/chunkbuffer.rb CHANGED (renamed from objectbuffer.rb)
@@ -1,43 +1,12 @@
-require 'bricolage/streamingload/task'
+require 'bricolage/streamingload/loadtask'
+require 'bricolage/streamingload/chunk'
 require 'bricolage/sqlutils'
-require 'forwardable'
 
 module Bricolage
 
   module StreamingLoad
 
-    class LoadableObject
-
-      extend Forwardable
-
-      def initialize(event, components)
-        @event = event
-        @components = components
-      end
-
-      attr_reader :event
-
-      def_delegator '@event', :url
-      def_delegator '@event', :size
-      def_delegator '@event', :message_id
-      def_delegator '@event', :receipt_handle
-      def_delegator '@components', :schema_name
-      def_delegator '@components', :table_name
-
-      def data_source_id
-        "#{schema_name}.#{table_name}"
-      end
-
-      alias qualified_name data_source_id
-
-      def event_time
-        @event.time
-      end
-
-    end
-
-
-    class ObjectBuffer
+    class ChunkBuffer
 
       TASK_GENERATION_TIME_LIMIT = 30   #sec
 
@@ -49,74 +18,83 @@ module Bricolage
         @task_generation_time_limit = TASK_GENERATION_TIME_LIMIT
       end
 
-      def put(obj)
+      # chunk :: IncomingChunk
+      def save(chunk)
         @ctl_ds.open {|conn|
           suppress_sql_logging {
             conn.transaction {
-              object_id = insert_object(conn, obj)
+              object_id = insert_object(conn, chunk)
               if object_id
                 insert_task_objects(conn, object_id)
               else
-                insert_dup_object(conn, obj)
+                @logger.info "Duplicated object received: url=#{chunk.url}"
+                insert_dup_object(conn, chunk)
               end
             }
           }
         }
       end
 
-      # Flushes multiple tables periodically
-      def flush_tasks
-        task_ids = []
-        warn_slow_task_generation {
-          @ctl_ds.open {|conn|
+      # Flushes chunks of multiple streams that meet the flush conditions.
+      def flush_partial
+        task_ids = nil
+        tasks = nil
+
+        @ctl_ds.open {|conn|
+          warn_slow_task_generation {
             conn.transaction {|txn|
               task_ids = insert_tasks(conn)
-              unless task_ids.empty?
-                update_task_object_mappings(conn, task_ids)
-                log_mapped_object_num(conn, task_ids)
-              end
+              update_task_objects(conn, task_ids) unless task_ids.empty?
             }
           }
+          log_task_ids(task_ids)
+          tasks = load_tasks(conn, task_ids)
        }
-        return task_ids.map {|id| LoadTask.create(task_id: id) }
+        tasks
      end
 
-      # Flushes all objects of all tables immediately with no
-      # additional conditions, to create "stream checkpoint".
-      def flush_tasks_force
-        task_ids = []
+      # Flushes all chunks of all streams with no additional conditions,
+      # to create a "system checkpoint".
+      def flush_all
+        all_task_ids = []
+        tasks = nil
+
        @ctl_ds.open {|conn|
          conn.transaction {|txn|
-            # update_task_object_mappings may not consume all saved objects
+            # update_task_objects may not consume all saved objects
             # (e.g. there are too many objects for one table), so we must create
-            # tasks repeatedly until there are no unassigned objects.
-            until (ids = insert_tasks_force(conn)).empty?
-              update_task_object_mappings(conn, ids)
-              log_mapped_object_num(conn, ids)
-              task_ids.concat ids
+            # tasks repeatedly until all objects are flushed.
+            until (task_ids = insert_tasks(conn, force: true)).empty?
+              update_task_objects(conn, task_ids)
+              all_task_ids.concat task_ids
            end
          }
+          log_task_ids(all_task_ids)
+          tasks = load_tasks(conn, all_task_ids)
        }
-        return task_ids.map {|id| LoadTask.create(task_id: id) }
+        tasks
      end
 
-      # Flushes the all objects of the specified table immediately
-      # with no additional conditions, to create "table checkpoint".
-      def flush_table_force(table_name)
-        task_ids = []
+      # Flushes all chunks of the specified stream with no additional conditions,
+      # to create a "stream checkpoint".
+      def flush_stream(stream_name)
+        all_task_ids = []
+        tasks = nil
+
        @ctl_ds.open {|conn|
          conn.transaction {|txn|
-            # update_task_object_mappings may not consume all saved objects
+            # update_task_objects may not consume all saved objects
             # (e.g. there are too many objects for one table), so we must create
-            # tasks repeatedly until there are no unassigned objects.
-            until (ids = insert_table_task_force(conn, table_name)).empty?
-              update_task_object_mappings(conn, ids)
-              log_mapped_object_num(conn, ids)
-              task_ids.concat ids
+            # tasks repeatedly until all objects are flushed.
+            until (task_ids = insert_tasks_for_stream(conn, stream_name)).empty?
+              update_task_objects(conn, task_ids)
+              all_task_ids.concat task_ids
            end
          }
+          log_task_ids(all_task_ids)
+          tasks = load_tasks(conn, all_task_ids)
        }
-        return task_ids.map {|id| LoadTask.create(task_id: id) }
+        tasks
      end
 
      private
@@ -134,7 +112,7 @@ module Bricolage
           values
           ( #{s obj.url}
           , #{obj.size}
-          , #{s obj.data_source_id}
+          , #{s obj.stream_name}
           , #{s obj.message_id}
           , '#{obj.event_time}' AT TIME ZONE 'JST'
           , current_timestamp
@@ -148,7 +126,6 @@ module Bricolage
       end
 
      def insert_dup_object(conn, obj)
-        @logger.info "Duplicated object recieved: object_url=#{obj.url}"
        conn.update(<<-EndSQL)
          insert into strload_dup_objects
          ( object_url
@@ -161,7 +138,7 @@ module Bricolage
           values
           ( #{s obj.url}
           , #{obj.size}
-          , #{s obj.data_source_id}
+          , #{s obj.stream_name}
           , #{s obj.message_id}
           , '#{obj.event_time}' AT TIME ZONE 'JST'
           , current_timestamp
@@ -184,10 +161,6 @@ module Bricolage
         EndSQL
      end
 
-      def insert_tasks_force(conn)
-        insert_tasks(conn, force: true)
-      end
-
      def insert_tasks(conn, force: false)
        task_ids = conn.query_values(<<-EndSQL)
          insert into strload_tasks
@@ -239,11 +212,10 @@
           ;
        EndSQL
 
-        log_created_tasks task_ids
        task_ids
      end
 
-      def insert_table_task_force(conn, table_name)
+      def insert_tasks_for_stream(conn, stream_name)
        task_ids = conn.query_values(<<-EndSQL)
          insert into strload_tasks
          ( task_class
@@ -273,17 +245,15 @@
           using (data_source_id)
          where
            -- does not check disabled
-            data_source_id = #{s table_name}
+            data_source_id = #{s stream_name}
          returning task_id
          ;
        EndSQL
 
-        # It must be 1
-        log_created_tasks(task_ids)
        task_ids
      end
 
-      def update_task_object_mappings(conn, task_ids)
+      def update_task_objects(conn, task_ids)
        conn.update(<<-EndSQL)
          update strload_task_objects dst
          set
@@ -309,25 +279,36 @@
             and tsk_obj.object_seq <= tables.load_batch_size
          ;
        EndSQL
+        # UPDATE statement cannot return values
+        nil
      end
 
-      def log_mapped_object_num(conn, task_ids)
-        # This method is required since UPDATE does not "returning" multiple values
-        rows = conn.query_values(<<-EndSQL)
-          select
-            task_id
-          , count(*)
-          from
-            strload_task_objects
-          where
-            task_id in (#{task_ids.join(',')})
-          group by
-            task_id
-          ;
-        EndSQL
-        rows.each_slice(2) do |task_id, object_count|
-          @logger.info "Number of objects assigned to task: task_id=#{task_id} object_count=#{object_count}"
-        end
+      def load_tasks(conn, task_ids)
+        return [] if task_ids.empty?
+
+        records = suppress_sql_logging {
+          conn.query_rows(<<-EndSQL)
+            select
+              t.task_id
+            , t.object_id
+            , o.object_url
+            , o.object_size
+            from
+              strload_task_objects t
+              inner join strload_objects o using (object_id)
+            where
+              task_id in (#{task_ids.join(',')})
+            ;
+          EndSQL
+        }
+
+        records.group_by {|row| row['task_id'] }.map {|task_id, rows|
+          chunks = rows.map {|row|
+            id, url, size = row.values_at('object_id', 'object_url', 'object_size')
+            Chunk.new(id: id, url: url, size: size)
+          }
+          LoadTask.new(id: task_id, chunks: chunks)
+        }
      end
 
      def suppress_sql_logging
@@ -341,7 +322,7 @@ module Bricolage
         end
      end
 
-      def log_created_tasks(task_ids)
+      def log_task_ids(task_ids)
        created_task_num = task_ids.size
        @logger.info "Number of tasks created: #{created_task_num}"
        @logger.info "Created task ids: #{task_ids}" if created_task_num > 0
lib/bricolage/streamingload/chunkrouter.rb ADDED
@@ -0,0 +1,57 @@
+require 'bricolage/streamingload/incomingchunk'
+
+module Bricolage
+
+  module StreamingLoad
+
+    class ChunkRoutingFailed < StandardError; end
+
+
+    class ChunkRouter
+
+      def ChunkRouter.for_config(configs)
+        new(configs.map {|c|
+          Route.new(url: c.fetch('url'), schema: c.fetch('schema'), table: c.fetch('table'))
+        })
+      end
+
+      def initialize(routes)
+        @routes = routes
+      end
+
+      def route(msg)
+        @routes.each do |route|
+          stream_name = route.match(msg.url)
+          return IncomingChunk.new(msg, stream_name) if stream_name
+        end
+        raise ChunkRoutingFailed, "could not detect stream name: #{msg.url.inspect}"
+      end
+
+      class Route
+        def initialize(url:, schema:, table:)
+          @url_pattern = /\A#{url}\z/
+          @schema = schema
+          @table = table
+        end
+
+        def match(url)
+          m = @url_pattern.match(url) or return nil
+          c1 = get_component(m, @schema)
+          c2 = get_component(m, @table)
+          "#{c1}.#{c2}"
+        end
+
+        def get_component(m, label)
+          if /\A%/ =~ label
+            m[label[1..-1]]
+          else
+            label
+          end
+        end
+      end
+
+    end
+
+  end # module StreamingLoad
+
+end # module Bricolage
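
The router maps an object URL to a "schema.table" stream name; a %-prefixed schema or table label refers to a named capture in the URL pattern. A small sketch using the same configuration shape as the tests further below (the bucket and the message stub are hypothetical stand-ins):

    router = Bricolage::StreamingLoad::ChunkRouter.for_config([
      {
        "url"    => %r<\As3://test-bucket/testschema\.(?<table>\w+)/datafile-\d{4}\.json\.gz>.source,
        "schema" => 'testschema',
        "table"  => '%table'   # resolved from the (?<table>...) capture
      }
    ])

    # Msg is a hypothetical stand-in for an S3ObjectDispatcherMessage:
    # route() only needs #url and #chunk at this point.
    Msg = Struct.new(:url, :chunk)
    url = 's3://test-bucket/testschema.desttable/datafile-0001.json.gz'
    msg = Msg.new(url, Bricolage::StreamingLoad::Chunk.new(id: nil, url: url, size: 1))

    incoming = router.route(msg)
    incoming.stream_name   # => "testschema.desttable"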
lib/bricolage/streamingload/dispatcher.rb CHANGED
@@ -3,9 +3,10 @@ require 'bricolage/exception'
 require 'bricolage/version'
 require 'bricolage/sqsdatasource'
 require 'bricolage/logger'
-require 'bricolage/streamingload/event'
-require 'bricolage/streamingload/objectbuffer'
-require 'bricolage/streamingload/urlpatterns'
+require 'bricolage/streamingload/dispatchermessage'
+require 'bricolage/streamingload/loadermessage'
+require 'bricolage/streamingload/chunkrouter'
+require 'bricolage/streamingload/chunkbuffer'
 require 'bricolage/streamingload/alertinglogger'
 require 'aws-sdk'
 require 'yaml'
@@ -40,18 +41,18 @@ module Bricolage
         )
       end
 
-      object_buffer = ObjectBuffer.new(
+      chunk_buffer = ChunkBuffer.new(
         control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_ctl')),
         logger: logger
       )
 
-      url_patterns = URLPatterns.for_config(config.fetch('url_patterns'))
+      chunk_router = ChunkRouter.for_config(config.fetch('url_patterns'))
 
       dispatcher = Dispatcher.new(
         event_queue: event_queue,
         task_queue: task_queue,
-        object_buffer: object_buffer,
-        url_patterns: url_patterns,
+        chunk_router: chunk_router,
+        chunk_buffer: chunk_buffer,
         dispatch_interval: config.fetch('dispatch-interval', 60),
         logger: logger
       )
@@ -60,6 +61,8 @@ module Bricolage
       create_pid_file opts.pid_file_path if opts.pid_file_path
       Dir.chdir '/'
       dispatcher.event_loop
+    rescue SystemExit
+      ;
     rescue Exception => e
       logger.exception e
       logger.error "dispatcher abort: pid=#{$$}"
@@ -82,11 +85,11 @@ module Bricolage
         # ignore
       end
 
-      def initialize(event_queue:, task_queue:, object_buffer:, url_patterns:, dispatch_interval:, logger:)
+      def initialize(event_queue:, task_queue:, chunk_router:, chunk_buffer:, dispatch_interval:, logger:)
         @event_queue = event_queue
         @task_queue = task_queue
-        @object_buffer = object_buffer
-        @url_patterns = url_patterns
+        @chunk_router = chunk_router
+        @chunk_buffer = chunk_buffer
         @dispatch_interval = dispatch_interval
         @dispatch_message_id = nil
         @logger = logger
@@ -99,7 +102,7 @@ module Bricolage
       def event_loop
         logger.info "*** dispatcher started: pid=#{$$}"
         set_dispatch_timer
-        @event_queue.handle_messages(handler: self, message_class: Event)
+        @event_queue.handle_messages(handler: self, message_class: DispatcherMessage)
         @event_queue.process_async_delete_force
         logger.info "*** shutdown gracefully: pid=#{$$}"
       end
@@ -111,86 +114,90 @@ module Bricolage
 
         if @dispatch_requested
           logger.info "*** dispatch requested"
-          dispatch_tasks
+          do_handle_dispatch
           @dispatch_requested = false
         end
 
         if @checkpoint_requested
-          create_checkpoint
+          do_handle_checkpoint
           @checkpoint_requested = false   # not strictly needed, but reset it just in case
         end
       end
 
-      def handle_unknown(e)
-        logger.warn "unknown event: #{e.message_body}"
-        @event_queue.delete_message_async(e)
+      def handle_unknown(msg)
+        logger.warn "unknown event: #{msg.message_body}"
+        @event_queue.delete_message_async(msg)
       end
 
-      def handle_shutdown(e)
+      def handle_shutdown(msg)
         logger.info "*** shutdown requested"
         @event_queue.initiate_terminate
         # Delete this event immediately
-        @event_queue.delete_message(e)
+        @event_queue.delete_message(msg)
       end
 
-      def handle_checkpoint(e)
+      def handle_checkpoint(msg)
         # Delay creating CHECKPOINT until after the current message batch,
         # because other events of the batch may already have been received.
         @checkpoint_requested = true
         # Delete this event immediately
-        @event_queue.delete_message(e)
+        @event_queue.delete_message(msg)
       end
 
-      def create_checkpoint
+      def do_handle_checkpoint
         logger.info "*** checkpoint requested"
         logger.info "Force-flushing all objects..."
-        tasks = @object_buffer.flush_tasks_force
-        send_tasks tasks
+        tasks = @chunk_buffer.flush_all
+        dispatch_tasks tasks
         logger.info "All objects flushed; shutting down..."
         @event_queue.initiate_terminate
       end
 
-      def handle_data(e)
-        unless e.created?
-          @event_queue.delete_message_async(e)
+      def handle_data(msg)
+        unless msg.created_event?
+          @event_queue.delete_message_async(msg)
           return
         end
-        obj = e.loadable_object(@url_patterns)
-        @object_buffer.put(obj)
-        @event_queue.delete_message_async(e)
+        chunk = @chunk_router.route(msg)
+        @chunk_buffer.save(chunk)
+        @event_queue.delete_message_async(msg)
       end
 
-      def handle_dispatch(e)
+      def handle_dispatch(msg)
         # Dispatching tasks may take 10 minutes or more, which can exceed the visibility timeout.
         # To avoid this, delay dispatching until all events of the current message batch are processed.
-        if @dispatch_message_id == e.message_id
+        if @dispatch_message_id == msg.message_id
           @dispatch_requested = true
         end
-        @event_queue.delete_message_async(e)
+        @event_queue.delete_message_async(msg)
       end
 
-      def dispatch_tasks
-        tasks = @object_buffer.flush_tasks
-        send_tasks tasks
+      def do_handle_dispatch
+        tasks = @chunk_buffer.flush_partial
+        dispatch_tasks tasks
         set_dispatch_timer
       end
 
       def set_dispatch_timer
-        res = @event_queue.send_message(DispatchEvent.create(delay_seconds: @dispatch_interval))
+        res = @event_queue.send_message(DispatchDispatcherMessage.create(delay_seconds: @dispatch_interval))
         @dispatch_message_id = res.message_id
       end
 
-      def handle_flushtable(e)
-        logger.info "*** flushtable requested: table=#{e.table_name}"
-        tasks = @object_buffer.flush_table_force(e.table_name)
-        send_tasks tasks
+      def handle_flushtable(msg)
+        # FIXME: badly named attribute. table_name is really a stream name (also called data_source_id).
+        stream_name = msg.table_name
+
+        logger.info "*** flushtable requested: stream_name=#{stream_name}"
+        tasks = @chunk_buffer.flush_stream(stream_name)
+        dispatch_tasks tasks
         # Delete this event immediately
-        @event_queue.delete_message(e)
+        @event_queue.delete_message(msg)
       end
 
-      def send_tasks(tasks)
+      def dispatch_tasks(tasks)
         tasks.each do |task|
-          @task_queue.put task
+          msg = StreamingLoadV3LoaderMessage.for_load_task(task)
+          @task_queue.put msg
         end
       end
 
@@ -207,9 +214,6 @@ module Bricolage
       @rest_arguments = nil
 
       @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
-      opts.on('--task-id=id', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
-        @task_id = task_id
-      }
       opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{Context::DEFAULT_ENV}]") {|env|
         @environment = env
       }
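
To make the renamed flow concrete: a flushtable event now drains a single stream, and each resulting task is wrapped as a loader message before queueing. A hedged sketch of the equivalent calls, given a received flushtable message (msg, chunk_buffer, and task_queue are illustrative locals, not part of the diff):

    # What handle_flushtable now does, spelled out step by step.
    stream_name = msg.table_name                      # really a stream name; see the FIXME above
    tasks = chunk_buffer.flush_stream(stream_name)    # => [LoadTask]
    tasks.each do |task|
      task_queue.put Bricolage::StreamingLoad::StreamingLoadV3LoaderMessage.for_load_task(task)
    end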
lib/bricolage/streamingload/dispatchermessage.rb CHANGED (renamed from event.rb)
@@ -1,18 +1,19 @@
+require 'bricolage/streamingload/chunk'
 require 'bricolage/sqsdatasource'
 
 module Bricolage
 
   module StreamingLoad
 
-    class Event < SQSMessage
+    class DispatcherMessage < SQSMessage
 
-      def Event.get_concrete_class(msg, rec)
+      def DispatcherMessage.get_concrete_class(msg, rec)
         case
-        when rec['eventName'] == 'shutdown' then ShutdownEvent
-        when rec['eventName'] == 'dispatch' then DispatchEvent
-        when rec['eventName'] == 'flushtable' then FlushTableEvent
-        when rec['eventName'] == 'checkpoint' then CheckPointEvent
-        when !!rec['s3'] then S3ObjectEvent
+        when rec['eventName'] == 'shutdown' then ShutdownDispatcherMessage
+        when rec['eventName'] == 'dispatch' then DispatchDispatcherMessage
+        when rec['eventName'] == 'flushtable' then FlushTableDispatcherMessage
+        when rec['eventName'] == 'checkpoint' then CheckPointDispatcherMessage
+        when !!rec['s3'] then S3ObjectDispatcherMessage
         else UnknownSQSMessage
         end
       end
@@ -28,13 +29,13 @@ module Bricolage
       end
 
 
-    class ShutdownEvent < Event
+    class ShutdownDispatcherMessage < DispatcherMessage
 
-      def ShutdownEvent.create
+      def ShutdownDispatcherMessage.create
        super name: 'shutdown'
      end
 
-      def ShutdownEvent.parse_sqs_record(msg, rec)
+      def ShutdownDispatcherMessage.parse_sqs_record(msg, rec)
        {}
      end
 
@@ -47,13 +48,13 @@ module Bricolage
 
 
     # Flushes all tables and shuts down
-    class CheckPointEvent < Event
+    class CheckPointDispatcherMessage < DispatcherMessage
 
-      def CheckPointEvent.create
+      def CheckPointDispatcherMessage.create
        super name: 'checkpoint'
      end
 
-      def CheckPointEvent.parse_sqs_record(msg, rec)
+      def CheckPointDispatcherMessage.parse_sqs_record(msg, rec)
        {}
      end
 
@@ -65,13 +66,13 @@ module Bricolage
       end
 
 
-    class FlushTableEvent < Event
+    class FlushTableDispatcherMessage < DispatcherMessage
 
-      def FlushTableEvent.create(table_name:)
+      def FlushTableDispatcherMessage.create(table_name:)
        super name: 'flushtable', table_name: table_name
      end
 
-      def FlushTableEvent.parse_sqs_record(msg, rec)
+      def FlushTableDispatcherMessage.parse_sqs_record(msg, rec)
        {
          table_name: rec['tableName']
        }
@@ -94,9 +95,9 @@ module Bricolage
       end
 
 
-    class DispatchEvent < Event
+    class DispatchDispatcherMessage < DispatcherMessage
 
-      def DispatchEvent.create(delay_seconds:)
+      def DispatchDispatcherMessage.create(delay_seconds:)
        super name: 'dispatch', delay_seconds: delay_seconds
      end
 
@@ -108,9 +109,9 @@ module Bricolage
       end
 
 
-    class S3ObjectEvent < Event
+    class S3ObjectDispatcherMessage < DispatcherMessage
 
-      def S3ObjectEvent.parse_sqs_record(msg, rec)
+      def S3ObjectDispatcherMessage.parse_sqs_record(msg, rec)
        {
          region: rec['awsRegion'],
          bucket: rec['s3']['bucket']['name'],
@@ -144,12 +145,12 @@ module Bricolage
         true
       end
 
-      def created?
+      def created_event?
         !!(/\AObjectCreated:(?!Copy)/ =~ @name)
       end
 
-      def loadable_object(url_patterns)
-        LoadableObject.new(self, url_patterns.match(url))
+      def chunk
+        Chunk.new(id: nil, url: url, size: size)
       end
 
     end
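
Concretely, an S3 ObjectCreated notification becomes an S3ObjectDispatcherMessage, and its new chunk method yields an unsaved Chunk. A sketch mirroring the test constructor further below (all values illustrative):

    e = Bricolage::StreamingLoad::S3ObjectDispatcherMessage.new(
      message_id: 'mid-1', receipt_handle: 'rh-1',
      name: 'ObjectCreated:Put', time: Time.now, source: 'aws:s3',
      region: 'ap-northeast-1', bucket: 'test-bucket',
      key: 'testschema.desttable/datafile-0001.json.gz', size: 1024
    )
    e.created_event?   # => true ("ObjectCreated:Copy" events are skipped)
    e.chunk            # => Chunk with id: nil, the object URL, and its size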
lib/bricolage/streamingload/incomingchunk.rb ADDED
@@ -0,0 +1,35 @@
+require 'forwardable'
+
+module Bricolage
+
+  module StreamingLoad
+
+    # A chunk which is not yet saved (just received from SQS)
+    class IncomingChunk
+
+      extend Forwardable
+
+      def initialize(message, stream_name)
+        @chunk = message.chunk
+        @message = message
+        @stream_name = stream_name
+      end
+
+      def_delegator '@chunk', :id
+      def_delegator '@chunk', :url
+      def_delegator '@chunk', :size
+
+      def_delegator '@message', :message_id
+      def_delegator '@message', :receipt_handle
+
+      def event_time
+        @message.time
+      end
+
+      attr_reader :stream_name
+
+    end
+
+  end
+
+end
lib/bricolage/streamingload/loadermessage.rb CHANGED (renamed from task.rb)
@@ -4,11 +4,11 @@ module Bricolage
 
   module StreamingLoad
 
-    class Task < SQSMessage
+    class LoaderMessage < SQSMessage
 
-      def Task.get_concrete_class(msg, rec)
+      def LoaderMessage.get_concrete_class(msg, rec)
         case
-        when rec['eventName'] == 'streaming_load_v3' then LoadTask
+        when rec['eventName'] == 'streaming_load_v3' then StreamingLoadV3LoaderMessage
         else UnknownSQSMessage
         end
       end
@@ -24,13 +24,17 @@ module Bricolage
       end
 
 
-    class LoadTask < Task
+    class StreamingLoadV3LoaderMessage < LoaderMessage
 
-      def LoadTask.create(task_id:, force: false)
+      def StreamingLoadV3LoaderMessage.for_load_task(load_task)
+        create(task_id: load_task.id)
+      end
+
+      def StreamingLoadV3LoaderMessage.create(task_id:, force: false)
        super name: 'streaming_load_v3', task_id: task_id, force: force
      end
 
-      def LoadTask.parse_sqs_record(msg, rec)
+      def StreamingLoadV3LoaderMessage.parse_sqs_record(msg, rec)
        {
          task_id: rec['taskId'],
          force: (rec['force'].to_s == 'true')
lib/bricolage/streamingload/loadtask.rb ADDED
@@ -0,0 +1,17 @@
+module Bricolage
+
+  module StreamingLoad
+
+    class LoadTask
+      def initialize(id:, chunks: [])
+        @id = id
+        @chunks = chunks
+      end
+
+      attr_reader :id
+      attr_reader :chunks
+    end
+
+  end
+
+end
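
LoadTask now carries its chunks directly, and the dispatcher converts each task into a loader queue message. A minimal sketch (ids and URLs illustrative):

    chunks = [Bricolage::StreamingLoad::Chunk.new(id: 7, url: 's3://bucket/a.gz', size: 10)]
    task = Bricolage::StreamingLoad::LoadTask.new(id: 42, chunks: chunks)

    # The loader-side SQS message only needs the task id; the loader
    # re-reads the chunk list from the control database.
    msg = Bricolage::StreamingLoad::StreamingLoadV3LoaderMessage.for_load_task(task)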
lib/bricolage/streamingload/taskhandler.rb CHANGED
@@ -1,6 +1,6 @@
 require 'bricolage/context'
 require 'bricolage/sqsdatasource'
-require 'bricolage/streamingload/task'
+require 'bricolage/streamingload/loadermessage'
 require 'bricolage/streamingload/job'
 require 'bricolage/streamingload/alertinglogger'
 require 'bricolage/logger'
@@ -113,7 +113,7 @@ module Bricolage
       end
 
       def event_loop
-        @task_queue.handle_messages(handler: self, message_class: Task)
+        @task_queue.handle_messages(handler: self, message_class: LoaderMessage)
       end
 
       # message handler
lib/bricolage/streamingload/version.rb CHANGED
@@ -1,5 +1,5 @@
 module Bricolage
   module StreamingLoad
-    VERSION = '0.12.0'
+    VERSION = '0.13.0'
   end
 end
test/streamingload/test_dispatcher.rb CHANGED
@@ -3,6 +3,8 @@ require 'bricolage/context'
 require 'bricolage/sqsdatasource'
 require 'bricolage/sqsmock'
 require 'bricolage/streamingload/dispatcher'
+require 'bricolage/streamingload/chunkrouter'
+require 'bricolage/streamingload/chunkbuffer'
 
 module Bricolage
   module StreamingLoad
@@ -35,12 +37,12 @@ module Bricolage
 
       task_queue = SQSDataSource.new_mock
 
-      object_buffer = ObjectBuffer.new(
+      chunk_buffer = ChunkBuffer.new(
         control_data_source: ctl_ds,
         logger: ctx.logger
       )
 
-      url_patterns = URLPatterns.for_config([
+      chunk_router = ChunkRouter.for_config([
         {
           "url" => %r<\As3://test-bucket/testschema\.desttable/datafile-\d{4}\.json\.gz>.source,
           "schema" => 'testschema',
@@ -51,8 +53,8 @@ module Bricolage
       dispatcher = Dispatcher.new(
         event_queue: event_queue,
         task_queue: task_queue,
-        object_buffer: object_buffer,
-        url_patterns: url_patterns,
+        chunk_buffer: chunk_buffer,
+        chunk_router: chunk_router,
         dispatch_interval: 600,
         logger: ctx.logger
       )
@@ -127,12 +129,12 @@ module Bricolage
 
       task_queue = SQSDataSource.new_mock
 
-      object_buffer = ObjectBuffer.new(
+      chunk_buffer = ChunkBuffer.new(
         control_data_source: ctl_ds,
         logger: ctx.logger
       )
 
-      url_patterns = URLPatterns.for_config([
+      chunk_router = ChunkRouter.for_config([
         {
           "url" => %r<\As3://test-bucket/testschema\.(?<table>\w+)/datafile-\d{4}\.json\.gz>.source,
           "schema" => 'testschema',
@@ -143,8 +145,8 @@ module Bricolage
       dispatcher = Dispatcher.new(
         event_queue: event_queue,
         task_queue: task_queue,
-        object_buffer: object_buffer,
-        url_patterns: url_patterns,
+        chunk_buffer: chunk_buffer,
+        chunk_router: chunk_router,
         dispatch_interval: 600,
         logger: ctx.logger
       )
test/streamingload/test_dispatchermessage.rb CHANGED (renamed from test_event.rb)
@@ -1,12 +1,12 @@
 require 'test/unit'
-require 'bricolage/streamingload/event'
+require 'bricolage/streamingload/dispatchermessage'
 
 module Bricolage::StreamingLoad
 
-  class TestEvent < Test::Unit::TestCase
+  class TestDispatcherMessage < Test::Unit::TestCase
 
     def new_s3event(message_id: nil, receipt_handle: nil, name: nil, time: nil, source: nil, region: nil, bucket: nil, key: nil, size: nil)
-      S3ObjectEvent.new(
+      S3ObjectDispatcherMessage.new(
         message_id: message_id,
         receipt_handle: receipt_handle,
         name: name,
@@ -21,9 +21,9 @@ module Bricolage::StreamingLoad
 
     test "#created?" do
       e = new_s3event(name: "ObjectCreated:Put")
-      assert_true e.created?
+      assert_true e.created_event?
       e = new_s3event(name: "ObjectCreated:Copy")
-      assert_false e.created?
+      assert_false e.created_event?
     end
 
   end
test/streamingload/test_job.rb CHANGED
@@ -1,5 +1,4 @@
 require 'test/unit'
-require 'bricolage/streamingload/event'
 require 'bricolage/sqsmock'
 require 'bricolage/logger'
 
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 0.13.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-12-25 00:00:00.000000000 Z
+date: 2018-01-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage
@@ -120,19 +120,22 @@ files:
 - lib/bricolage/sqsdatasource.rb
 - lib/bricolage/sqsmock.rb
 - lib/bricolage/streamingload/alertinglogger.rb
+- lib/bricolage/streamingload/chunk.rb
+- lib/bricolage/streamingload/chunkbuffer.rb
+- lib/bricolage/streamingload/chunkrouter.rb
 - lib/bricolage/streamingload/dispatcher.rb
-- lib/bricolage/streamingload/event.rb
+- lib/bricolage/streamingload/dispatchermessage.rb
+- lib/bricolage/streamingload/incomingchunk.rb
 - lib/bricolage/streamingload/job.rb
 - lib/bricolage/streamingload/jobparams.rb
+- lib/bricolage/streamingload/loadermessage.rb
+- lib/bricolage/streamingload/loadtask.rb
 - lib/bricolage/streamingload/manifest.rb
-- lib/bricolage/streamingload/objectbuffer.rb
-- lib/bricolage/streamingload/task.rb
 - lib/bricolage/streamingload/taskhandler.rb
-- lib/bricolage/streamingload/urlpatterns.rb
 - lib/bricolage/streamingload/version.rb
 - test/all.rb
 - test/streamingload/test_dispatcher.rb
-- test/streamingload/test_event.rb
+- test/streamingload/test_dispatchermessage.rb
 - test/streamingload/test_job.rb
 - test/test_sqsdatasource.rb
 homepage: https://github.com/aamine/bricolage-streamingload
lib/bricolage/streamingload/urlpatterns.rb DELETED
@@ -1,59 +0,0 @@
-module Bricolage
-
-  module StreamingLoad
-
-    class URLPatternNotMatched < StandardError; end
-
-
-    class URLPatterns
-
-      def URLPatterns.for_config(configs)
-        new(configs.map {|c|
-          Pattern.new(url: c.fetch('url'), schema: c.fetch('schema'), table: c.fetch('table'))
-        })
-      end
-
-      def initialize(patterns)
-        @patterns = patterns
-      end
-
-      def match(url)
-        @patterns.each do |pat|
-          components = pat.match(url)
-          return components if components
-        end
-        raise URLPatternNotMatched, "no URL pattern matches the object url: #{url.inspect}"
-      end
-
-      class Pattern
-        def initialize(url:, schema:, table:)
-          @url_pattern = /\A#{url}\z/
-          @schema = schema
-          @table = table
-        end
-
-        attr_reader :url_pattern
-        attr_reader :schema
-        attr_reader :table
-
-        def match(url)
-          m = @url_pattern.match(url) or return nil
-          Components.new(get_component(m, @schema), get_component(m, @table))
-        end
-
-        def get_component(m, label)
-          if /\A%/ =~ label
-            m[label[1..-1]]
-          else
-            label
-          end
-        end
-      end
-
-      Components = Struct.new(:schema_name, :table_name)
-
-    end
-
-  end # module StreamingLoad
-
-end # module Bricolage