bricolage-streamingload 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c8c9c822c1c2d207827cc5d0b6573140254b1638
-  data.tar.gz: 166188b17014074c82a963aa9aecaff796ced70d
+  metadata.gz: 1aacc6209260b4c74e823fc9ea903bdd811c4d2b
+  data.tar.gz: c0b869e51e67a22708dd36b22afefb275e9a3169
 SHA512:
-  metadata.gz: 17e35b2d5d49312d92a4f9f7e232cd6ff9daeb7975d7a1383936a935d12dd22c89550e1fece87920c36cac46fea6aa5517431c7af56e741cb5fe774a21cd7825
-  data.tar.gz: 7556f6eb45b74d041891f3d9254aee4bf32c30d12e9aa61093351f5a08af4c7f6a04926b75a37d8cfbde4c359a50e5efc610cd6ceda605f63ca147e3d96d3285
+  metadata.gz: b6721bcb5c9dd506c801e71970e450e6a9318eac5171df099906ef91236e516a14f155cdd952240a534b80ab82771a88e22db49b309c8ab9cb210e1ae0747f1a
+  data.tar.gz: 95469bdc023a41ab08f4b0d6df776b95ef733949a87fb5e053f9bf0b1d74c75f35cde9b89335fe12ea486a214364cddcb9c263c7340bbdbdbd5f16f541f66653
lib/bricolage/streamingload/chunk.rb ADDED
@@ -0,0 +1,19 @@
+module Bricolage
+
+  module StreamingLoad
+
+    class Chunk
+      def initialize(id:, url:, size: nil)
+        @id = id
+        @url = url
+        @size = size
+      end
+
+      attr_reader :id
+      attr_reader :url
+      attr_reader :size
+    end
+
+  end
+
+end
lib/bricolage/streamingload/objectbuffer.rb → lib/bricolage/streamingload/chunkbuffer.rb RENAMED
@@ -1,43 +1,12 @@
-require 'bricolage/streamingload/task'
+require 'bricolage/streamingload/loadtask'
+require 'bricolage/streamingload/chunk'
 require 'bricolage/sqlutils'
-require 'forwardable'
 
 module Bricolage
 
   module StreamingLoad
 
-    class LoadableObject
-
-      extend Forwardable
-
-      def initialize(event, components)
-        @event = event
-        @components = components
-      end
-
-      attr_reader :event
-
-      def_delegator '@event', :url
-      def_delegator '@event', :size
-      def_delegator '@event', :message_id
-      def_delegator '@event', :receipt_handle
-      def_delegator '@components', :schema_name
-      def_delegator '@components', :table_name
-
-      def data_source_id
-        "#{schema_name}.#{table_name}"
-      end
-
-      alias qualified_name data_source_id
-
-      def event_time
-        @event.time
-      end
-
-    end
-
-
-    class ObjectBuffer
+    class ChunkBuffer
 
       TASK_GENERATION_TIME_LIMIT = 30 #sec
 
@@ -49,74 +18,83 @@ module Bricolage
        @task_generation_time_limit = TASK_GENERATION_TIME_LIMIT
      end
 
-      def put(obj)
+      # chunk :: IncomingChunk
+      def save(chunk)
        @ctl_ds.open {|conn|
          suppress_sql_logging {
            conn.transaction {
-              object_id = insert_object(conn, obj)
+              object_id = insert_object(conn, chunk)
              if object_id
                insert_task_objects(conn, object_id)
              else
+                @logger.info "Duplicated object received: url=#{chunk.url}"
-                insert_dup_object(conn, obj)
+                insert_dup_object(conn, chunk)
              end
            }
          }
        }
      end
 
-      # Flushes multiple tables periodically
-      def flush_tasks
-        task_ids = []
-        warn_slow_task_generation {
-          @ctl_ds.open {|conn|
+      # Flushes chunks of multiple streams which meet the flush conditions.
+      def flush_partial
+        task_ids = nil
+        tasks = nil
+
+        @ctl_ds.open {|conn|
+          warn_slow_task_generation {
            conn.transaction {|txn|
              task_ids = insert_tasks(conn)
-              unless task_ids.empty?
-                update_task_object_mappings(conn, task_ids)
-                log_mapped_object_num(conn, task_ids)
-              end
+              update_task_objects(conn, task_ids) unless task_ids.empty?
            }
          }
+          log_task_ids(task_ids)
+          tasks = load_tasks(conn, task_ids)
        }
-        return task_ids.map {|id| LoadTask.create(task_id: id) }
+        tasks
      end
 
-      # Flushes all objects of all tables immediately with no
-      # additional conditions, to create "stream checkpoint".
-      def flush_tasks_force
-        task_ids = []
+      # Flushes all chunks of all streams with no additional conditions,
+      # to create a "system checkpoint".
+      def flush_all
+        all_task_ids = []
+        tasks = nil
+
        @ctl_ds.open {|conn|
          conn.transaction {|txn|
-            # update_task_object_mappings may not consume all saved objects
+            # update_task_objects may not consume all saved objects
            # (e.g. there are too many objects for one table), we must create
-            # tasks repeatedly until there are no unassigned objects.
-            until (ids = insert_tasks_force(conn)).empty?
-              update_task_object_mappings(conn, ids)
-              log_mapped_object_num(conn, ids)
-              task_ids.concat ids
+            # tasks repeatedly until all objects are flushed.
+            until (task_ids = insert_tasks(conn, force: true)).empty?
+              update_task_objects(conn, task_ids)
+              all_task_ids.concat task_ids
            end
          }
+          log_task_ids(all_task_ids)
+          tasks = load_tasks(conn, all_task_ids)
        }
-        return task_ids.map {|id| LoadTask.create(task_id: id) }
+        tasks
      end
 
-      # Flushes the all objects of the specified table immediately
-      # with no additional conditions, to create "table checkpoint".
-      def flush_table_force(table_name)
-        task_ids = []
+      # Flushes all chunks of the specified stream with no additional conditions,
+      # to create a "stream checkpoint".
+      def flush_stream(stream_name)
+        all_task_ids = []
+        tasks = nil
+
        @ctl_ds.open {|conn|
          conn.transaction {|txn|
-            # update_task_object_mappings may not consume all saved objects
+            # update_task_objects may not consume all saved objects
            # (e.g. there are too many objects for one table), we must create
-            # tasks repeatedly until there are no unassigned objects.
-            until (ids = insert_table_task_force(conn, table_name)).empty?
-              update_task_object_mappings(conn, ids)
-              log_mapped_object_num(conn, ids)
-              task_ids.concat ids
+            # tasks repeatedly until all objects are flushed.
+            until (task_ids = insert_tasks_for_stream(conn, stream_name)).empty?
+              update_task_objects(conn, task_ids)
+              all_task_ids.concat task_ids
            end
          }
+          log_task_ids(all_task_ids)
+          tasks = load_tasks(conn, all_task_ids)
        }
-        return task_ids.map {|id| LoadTask.create(task_id: id) }
+        tasks
      end
 
      private
@@ -134,7 +112,7 @@ module Bricolage
          values
              ( #{s obj.url}
              , #{obj.size}
-              , #{s obj.data_source_id}
+              , #{s obj.stream_name}
              , #{s obj.message_id}
              , '#{obj.event_time}' AT TIME ZONE 'JST'
              , current_timestamp
@@ -148,7 +126,6 @@ module Bricolage
      end
 
      def insert_dup_object(conn, obj)
-        @logger.info "Duplicated object recieved: object_url=#{obj.url}"
        conn.update(<<-EndSQL)
          insert into strload_dup_objects
              ( object_url
@@ -161,7 +138,7 @@ module Bricolage
          values
              ( #{s obj.url}
              , #{obj.size}
-              , #{s obj.data_source_id}
+              , #{s obj.stream_name}
              , #{s obj.message_id}
              , '#{obj.event_time}' AT TIME ZONE 'JST'
              , current_timestamp
@@ -184,10 +161,6 @@ module Bricolage
        EndSQL
      end
 
-      def insert_tasks_force(conn)
-        insert_tasks(conn, force: true)
-      end
-
      def insert_tasks(conn, force: false)
        task_ids = conn.query_values(<<-EndSQL)
          insert into strload_tasks
@@ -239,11 +212,10 @@ module Bricolage
          ;
        EndSQL
 
-        log_created_tasks task_ids
        task_ids
      end
 
-      def insert_table_task_force(conn, table_name)
+      def insert_tasks_for_stream(conn, stream_name)
        task_ids = conn.query_values(<<-EndSQL)
          insert into strload_tasks
              ( task_class
@@ -273,17 +245,15 @@ module Bricolage
          using (data_source_id)
          where
              -- does not check disabled
-              data_source_id = #{s table_name}
+              data_source_id = #{s stream_name}
          returning task_id
          ;
        EndSQL
 
-        # It must be 1
-        log_created_tasks(task_ids)
        task_ids
      end
 
-      def update_task_object_mappings(conn, task_ids)
+      def update_task_objects(conn, task_ids)
        conn.update(<<-EndSQL)
          update strload_task_objects dst
          set
@@ -309,25 +279,36 @@ module Bricolage
            and tsk_obj.object_seq <= tables.load_batch_size
          ;
        EndSQL
+        # UPDATE statement cannot return values
+        nil
      end
 
-      def log_mapped_object_num(conn, task_ids)
-        # This method is required since UPDATE does not "returning" multiple values
-        rows = conn.query_values(<<-EndSQL)
-          select
-              task_id
-              , count(*)
-          from
-              strload_task_objects
-          where
-              task_id in (#{task_ids.join(',')})
-          group by
-              task_id
-          ;
-        EndSQL
-        rows.each_slice(2) do |task_id, object_count|
-          @logger.info "Number of objects assigned to task: task_id=#{task_id} object_count=#{object_count}"
-        end
+      def load_tasks(conn, task_ids)
+        return [] if task_ids.empty?
+
+        records = suppress_sql_logging {
+          conn.query_rows(<<-EndSQL)
+            select
+                t.task_id
+                , t.object_id
+                , o.object_url
+                , o.object_size
+            from
+                strload_task_objects t
+                inner join strload_objects o using (object_id)
+            where
+                task_id in (#{task_ids.join(',')})
+            ;
+          EndSQL
+        }
+
+        records.group_by {|row| row['task_id'] }.map {|task_id, rows|
+          chunks = rows.map {|row|
+            id, url, size = row.values_at('object_id', 'object_url', 'object_size')
+            Chunk.new(id: id, url: url, size: size)
+          }
+          LoadTask.new(id: task_id, chunks: chunks)
+        }
      end
 
      def suppress_sql_logging
@@ -341,7 +322,7 @@ module Bricolage
        end
      end
 
-      def log_created_tasks(task_ids)
+      def log_task_ids(task_ids)
        created_task_num = task_ids.size
        @logger.info "Number of tasks created: #{created_task_num}"
        @logger.info "Created task ids: #{task_ids}" if created_task_num > 0
lib/bricolage/streamingload/chunkrouter.rb ADDED
@@ -0,0 +1,57 @@
+require 'bricolage/streamingload/incomingchunk'
+
+module Bricolage
+
+  module StreamingLoad
+
+    class ChunkRoutingFailed < StandardError; end
+
+
+    class ChunkRouter
+
+      def ChunkRouter.for_config(configs)
+        new(configs.map {|c|
+          Route.new(url: c.fetch('url'), schema: c.fetch('schema'), table: c.fetch('table'))
+        })
+      end
+
+      def initialize(routes)
+        @routes = routes
+      end
+
+      def route(msg)
+        @routes.each do |route|
+          stream_name = route.match(msg.url)
+          return IncomingChunk.new(msg, stream_name) if stream_name
+        end
+        raise ChunkRoutingFailed, "could not detect stream name: #{msg.url.inspect}"
+      end
+
+      class Route
+        def initialize(url:, schema:, table:)
+          @url_pattern = /\A#{url}\z/
+          @schema = schema
+          @table = table
+        end
+
+        def match(url)
+          m = @url_pattern.match(url) or return nil
+          c1 = get_component(m, @schema)
+          c2 = get_component(m, @table)
+          "#{c1}.#{c2}"
+        end
+
+        def get_component(m, label)
+          if /\A%/ =~ label
+            m[label[1..-1]]
+          else
+            label
+          end
+        end
+      end
+
+    end
+
+  end # module StreamingLoad
+
+end # module Bricolage
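
A short usage sketch of the route matching above (the URL and config values are invented for illustration): a %-prefixed schema or table label is resolved from the named capture groups of the route's URL regexp, while any other label is taken literally.

route = Bricolage::StreamingLoad::ChunkRouter::Route.new(
  url:    's3://example-bucket/(?<schema>\w+)\.(?<table>\w+)/.*\.gz',  # invented pattern
  schema: '%schema',   # resolved from the (?<schema>...) capture
  table:  '%table'     # resolved from the (?<table>...) capture
)
route.match('s3://example-bucket/testschema.desttable/datafile-0001.json.gz')
# => "testschema.desttable"
route.match('s3://example-bucket/not-a-table')
# => nil (no match, so ChunkRouter#route tries the next route)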
lib/bricolage/streamingload/dispatcher.rb CHANGED
@@ -3,9 +3,10 @@ require 'bricolage/exception'
 require 'bricolage/version'
 require 'bricolage/sqsdatasource'
 require 'bricolage/logger'
-require 'bricolage/streamingload/event'
-require 'bricolage/streamingload/objectbuffer'
-require 'bricolage/streamingload/urlpatterns'
+require 'bricolage/streamingload/dispatchermessage'
+require 'bricolage/streamingload/loadermessage'
+require 'bricolage/streamingload/chunkrouter'
+require 'bricolage/streamingload/chunkbuffer'
 require 'bricolage/streamingload/alertinglogger'
 require 'aws-sdk'
 require 'yaml'
@@ -40,18 +41,18 @@ module Bricolage
          )
        end
 
-        object_buffer = ObjectBuffer.new(
+        chunk_buffer = ChunkBuffer.new(
          control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_ctl')),
          logger: logger
        )
 
-        url_patterns = URLPatterns.for_config(config.fetch('url_patterns'))
+        chunk_router = ChunkRouter.for_config(config.fetch('url_patterns'))
 
        dispatcher = Dispatcher.new(
          event_queue: event_queue,
          task_queue: task_queue,
-          object_buffer: object_buffer,
-          url_patterns: url_patterns,
+          chunk_router: chunk_router,
+          chunk_buffer: chunk_buffer,
          dispatch_interval: config.fetch('dispatch-interval', 60),
          logger: logger
        )
@@ -60,6 +61,8 @@ module Bricolage
        create_pid_file opts.pid_file_path if opts.pid_file_path
        Dir.chdir '/'
        dispatcher.event_loop
+      rescue SystemExit
+        ;
      rescue Exception => e
        logger.exception e
        logger.error "dispatcher abort: pid=#{$$}"
@@ -82,11 +85,11 @@ module Bricolage
        # ignore
      end
 
-      def initialize(event_queue:, task_queue:, object_buffer:, url_patterns:, dispatch_interval:, logger:)
+      def initialize(event_queue:, task_queue:, chunk_router:, chunk_buffer:, dispatch_interval:, logger:)
        @event_queue = event_queue
        @task_queue = task_queue
-        @object_buffer = object_buffer
-        @url_patterns = url_patterns
+        @chunk_router = chunk_router
+        @chunk_buffer = chunk_buffer
        @dispatch_interval = dispatch_interval
        @dispatch_message_id = nil
        @logger = logger
@@ -99,7 +102,7 @@ module Bricolage
      def event_loop
        logger.info "*** dispatcher started: pid=#{$$}"
        set_dispatch_timer
-        @event_queue.handle_messages(handler: self, message_class: Event)
+        @event_queue.handle_messages(handler: self, message_class: DispatcherMessage)
        @event_queue.process_async_delete_force
        logger.info "*** shutdown gracefully: pid=#{$$}"
      end
@@ -111,86 +114,90 @@ module Bricolage
 
        if @dispatch_requested
          logger.info "*** dispatch requested"
-          dispatch_tasks
+          do_handle_dispatch
          @dispatch_requested = false
        end
 
        if @checkpoint_requested
-          create_checkpoint
+          do_handle_checkpoint
          @checkpoint_requested = false # is needless, but reset it just in case
        end
      end
 
-      def handle_unknown(e)
-        logger.warn "unknown event: #{e.message_body}"
-        @event_queue.delete_message_async(e)
+      def handle_unknown(msg)
+        logger.warn "unknown event: #{msg.message_body}"
+        @event_queue.delete_message_async(msg)
      end
 
-      def handle_shutdown(e)
+      def handle_shutdown(msg)
        logger.info "*** shutdown requested"
        @event_queue.initiate_terminate
        # Delete this event immediately
-        @event_queue.delete_message(e)
+        @event_queue.delete_message(msg)
      end
 
-      def handle_checkpoint(e)
+      def handle_checkpoint(msg)
        # Delay creating CHECKPOINT until after the current message batch,
        # because other extra events may already have been received.
        @checkpoint_requested = true
        # Delete this event immediately
-        @event_queue.delete_message(e)
+        @event_queue.delete_message(msg)
      end
 
-      def create_checkpoint
+      def do_handle_checkpoint
        logger.info "*** checkpoint requested"
        logger.info "Force-flushing all objects..."
-        tasks = @object_buffer.flush_tasks_force
-        send_tasks tasks
+        tasks = @chunk_buffer.flush_all
+        dispatch_tasks tasks
        logger.info "All objects flushed; shutting down..."
        @event_queue.initiate_terminate
      end
 
-      def handle_data(e)
-        unless e.created?
-          @event_queue.delete_message_async(e)
+      def handle_data(msg)
+        unless msg.created_event?
+          @event_queue.delete_message_async(msg)
          return
        end
-        obj = e.loadable_object(@url_patterns)
-        @object_buffer.put(obj)
-        @event_queue.delete_message_async(e)
+        chunk = @chunk_router.route(msg)
+        @chunk_buffer.save(chunk)
+        @event_queue.delete_message_async(msg)
      end
 
-      def handle_dispatch(e)
+      def handle_dispatch(msg)
        # Dispatching tasks may take 10 minutes or more, which can exceed the visibility timeout.
        # To avoid this, delay dispatching until all events of the current message batch are processed.
-        if @dispatch_message_id == e.message_id
+        if @dispatch_message_id == msg.message_id
          @dispatch_requested = true
        end
-        @event_queue.delete_message_async(e)
+        @event_queue.delete_message_async(msg)
      end
 
-      def dispatch_tasks
-        tasks = @object_buffer.flush_tasks
-        send_tasks tasks
+      def do_handle_dispatch
+        tasks = @chunk_buffer.flush_partial
+        dispatch_tasks tasks
        set_dispatch_timer
      end
 
      def set_dispatch_timer
-        res = @event_queue.send_message(DispatchEvent.create(delay_seconds: @dispatch_interval))
+        res = @event_queue.send_message(DispatchDispatcherMessage.create(delay_seconds: @dispatch_interval))
        @dispatch_message_id = res.message_id
      end
 
-      def handle_flushtable(e)
-        logger.info "*** flushtable requested: table=#{e.table_name}"
-        tasks = @object_buffer.flush_table_force(e.table_name)
-        send_tasks tasks
+      def handle_flushtable(msg)
+        # FIXME: badly named attribute. table_name is really stream_name, which is also called data_source_id.
+        stream_name = msg.table_name
+
+        logger.info "*** flushtable requested: stream_name=#{stream_name}"
+        tasks = @chunk_buffer.flush_stream(stream_name)
+        dispatch_tasks tasks
        # Delete this event immediately
-        @event_queue.delete_message(e)
+        @event_queue.delete_message(msg)
      end
 
-      def send_tasks(tasks)
+      def dispatch_tasks(tasks)
        tasks.each do |task|
-          @task_queue.put task
+          msg = StreamingLoadV3LoaderMessage.for_load_task(task)
+          @task_queue.put msg
        end
      end
 
@@ -207,9 +214,6 @@ module Bricolage
        @rest_arguments = nil
 
        @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
-        opts.on('--task-id=id', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
-          @task_id = task_id
-        }
        opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{Context::DEFAULT_ENV}]") {|env|
          @environment = env
        }
lib/bricolage/streamingload/event.rb → lib/bricolage/streamingload/dispatchermessage.rb RENAMED
@@ -1,18 +1,19 @@
+require 'bricolage/streamingload/chunk'
 require 'bricolage/sqsdatasource'
 
 module Bricolage
 
   module StreamingLoad
 
-    class Event < SQSMessage
+    class DispatcherMessage < SQSMessage
 
-      def Event.get_concrete_class(msg, rec)
+      def DispatcherMessage.get_concrete_class(msg, rec)
        case
-        when rec['eventName'] == 'shutdown' then ShutdownEvent
-        when rec['eventName'] == 'dispatch' then DispatchEvent
-        when rec['eventName'] == 'flushtable' then FlushTableEvent
-        when rec['eventName'] == 'checkpoint' then CheckPointEvent
-        when !!rec['s3'] then S3ObjectEvent
+        when rec['eventName'] == 'shutdown' then ShutdownDispatcherMessage
+        when rec['eventName'] == 'dispatch' then DispatchDispatcherMessage
+        when rec['eventName'] == 'flushtable' then FlushTableDispatcherMessage
+        when rec['eventName'] == 'checkpoint' then CheckPointDispatcherMessage
+        when !!rec['s3'] then S3ObjectDispatcherMessage
        else UnknownSQSMessage
        end
      end
@@ -28,13 +29,13 @@ module Bricolage
      end
 
 
-    class ShutdownEvent < Event
+    class ShutdownDispatcherMessage < DispatcherMessage
 
-      def ShutdownEvent.create
+      def ShutdownDispatcherMessage.create
        super name: 'shutdown'
      end
 
-      def ShutdownEvent.parse_sqs_record(msg, rec)
+      def ShutdownDispatcherMessage.parse_sqs_record(msg, rec)
        {}
      end
 
@@ -47,13 +48,13 @@ module Bricolage
 
 
    # Flushes all tables and shutdown
-    class CheckPointEvent < Event
+    class CheckPointDispatcherMessage < DispatcherMessage
 
-      def CheckPointEvent.create
+      def CheckPointDispatcherMessage.create
        super name: 'checkpoint'
      end
 
-      def CheckPointEvent.parse_sqs_record(msg, rec)
+      def CheckPointDispatcherMessage.parse_sqs_record(msg, rec)
        {}
      end
 
@@ -65,13 +66,13 @@ module Bricolage
      end
 
 
-    class FlushTableEvent < Event
+    class FlushTableDispatcherMessage < DispatcherMessage
 
-      def FlushTableEvent.create(table_name:)
+      def FlushTableDispatcherMessage.create(table_name:)
        super name: 'flushtable', table_name: table_name
      end
 
-      def FlushTableEvent.parse_sqs_record(msg, rec)
+      def FlushTableDispatcherMessage.parse_sqs_record(msg, rec)
        {
          table_name: rec['tableName']
        }
@@ -94,9 +95,9 @@ module Bricolage
      end
 
 
-    class DispatchEvent < Event
+    class DispatchDispatcherMessage < DispatcherMessage
 
-      def DispatchEvent.create(delay_seconds:)
+      def DispatchDispatcherMessage.create(delay_seconds:)
        super name: 'dispatch', delay_seconds: delay_seconds
      end
 
@@ -108,9 +109,9 @@ module Bricolage
      end
 
 
-    class S3ObjectEvent < Event
+    class S3ObjectDispatcherMessage < DispatcherMessage
 
-      def S3ObjectEvent.parse_sqs_record(msg, rec)
+      def S3ObjectDispatcherMessage.parse_sqs_record(msg, rec)
        {
          region: rec['awsRegion'],
          bucket: rec['s3']['bucket']['name'],
@@ -144,12 +145,12 @@ module Bricolage
        true
      end
 
-      def created?
+      def created_event?
        !!(/\AObjectCreated:(?!Copy)/ =~ @name)
      end
 
-      def loadable_object(url_patterns)
-        LoadableObject.new(self, url_patterns.match(url))
+      def chunk
+        Chunk.new(id: nil, url: url, size: size)
      end
 
    end
lib/bricolage/streamingload/incomingchunk.rb ADDED
@@ -0,0 +1,35 @@
+require 'forwardable'
+
+module Bricolage
+
+  module StreamingLoad
+
+    # a Chunk which is not saved yet (received from SQS)
+    class IncomingChunk
+
+      extend Forwardable
+
+      def initialize(message, stream_name)
+        @chunk = message.chunk
+        @message = message
+        @stream_name = stream_name
+      end
+
+      def_delegator '@chunk', :id
+      def_delegator '@chunk', :url
+      def_delegator '@chunk', :size
+
+      def_delegator '@message', :message_id
+      def_delegator '@message', :receipt_handle
+
+      def event_time
+        @message.time
+      end
+
+      attr_reader :stream_name
+
+    end
+
+  end
+
+end
lib/bricolage/streamingload/task.rb → lib/bricolage/streamingload/loadermessage.rb RENAMED
@@ -4,11 +4,11 @@ module Bricolage
 
   module StreamingLoad
 
-    class Task < SQSMessage
+    class LoaderMessage < SQSMessage
 
-      def Task.get_concrete_class(msg, rec)
+      def LoaderMessage.get_concrete_class(msg, rec)
        case
-        when rec['eventName'] == 'streaming_load_v3' then LoadTask
+        when rec['eventName'] == 'streaming_load_v3' then StreamingLoadV3LoaderMessage
        else UnknownSQSMessage
        end
      end
@@ -24,13 +24,17 @@ module Bricolage
      end
 
 
-    class LoadTask < Task
+    class StreamingLoadV3LoaderMessage < LoaderMessage
 
-      def LoadTask.create(task_id:, force: false)
+      def StreamingLoadV3LoaderMessage.for_load_task(load_task)
+        create(task_id: load_task.id)
+      end
+
+      def StreamingLoadV3LoaderMessage.create(task_id:, force: false)
        super name: 'streaming_load_v3', task_id: task_id, force: force
      end
 
-      def LoadTask.parse_sqs_record(msg, rec)
+      def StreamingLoadV3LoaderMessage.parse_sqs_record(msg, rec)
        {
          task_id: rec['taskId'],
          force: (rec['force'].to_s == 'true')
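
A sketch of the hand-off this enables (the values are invented; only the method names and the 'taskId'/'force' record keys come from this diff): the dispatcher wraps a LoadTask's id into a loader message, and on the receiving side parse_sqs_record recovers the parameters from the SQS record.

task = Bricolage::StreamingLoad::LoadTask.new(id: 42, chunks: [])
msg = Bricolage::StreamingLoad::StreamingLoadV3LoaderMessage.for_load_task(task)
# sent on the task queue as eventName 'streaming_load_v3' with taskId 42

rec = { 'eventName' => 'streaming_load_v3', 'taskId' => '42', 'force' => 'false' }
Bricolage::StreamingLoad::StreamingLoadV3LoaderMessage.parse_sqs_record(nil, rec)
# => { task_id: "42", force: false }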
lib/bricolage/streamingload/loadtask.rb ADDED
@@ -0,0 +1,17 @@
+module Bricolage
+
+  module StreamingLoad
+
+    class LoadTask
+      def initialize(id:, chunks: [])
+        @id = id
+        @chunks = chunks
+      end
+
+      attr_reader :id
+      attr_reader :chunks
+    end
+
+  end
+
+end
lib/bricolage/streamingload/taskhandler.rb CHANGED
@@ -1,6 +1,6 @@
 require 'bricolage/context'
 require 'bricolage/sqsdatasource'
-require 'bricolage/streamingload/task'
+require 'bricolage/streamingload/loadermessage'
 require 'bricolage/streamingload/job'
 require 'bricolage/streamingload/alertinglogger'
 require 'bricolage/logger'
@@ -113,7 +113,7 @@ module Bricolage
      end
 
      def event_loop
-        @task_queue.handle_messages(handler: self, message_class: Task)
+        @task_queue.handle_messages(handler: self, message_class: LoaderMessage)
      end
 
      # message handler
lib/bricolage/streamingload/version.rb CHANGED
@@ -1,5 +1,5 @@
 module Bricolage
   module StreamingLoad
-    VERSION = '0.12.0'
+    VERSION = '0.13.0'
   end
 end
test/streamingload/test_dispatcher.rb CHANGED
@@ -3,6 +3,8 @@ require 'bricolage/context'
 require 'bricolage/sqsdatasource'
 require 'bricolage/sqsmock'
 require 'bricolage/streamingload/dispatcher'
+require 'bricolage/streamingload/chunkrouter'
+require 'bricolage/streamingload/chunkbuffer'
 
 module Bricolage
   module StreamingLoad
@@ -35,12 +37,12 @@ module Bricolage
 
      task_queue = SQSDataSource.new_mock
 
-      object_buffer = ObjectBuffer.new(
+      chunk_buffer = ChunkBuffer.new(
        control_data_source: ctl_ds,
        logger: ctx.logger
      )
 
-      url_patterns = URLPatterns.for_config([
+      chunk_router = ChunkRouter.for_config([
        {
          "url" => %r<\As3://test-bucket/testschema\.desttable/datafile-\d{4}\.json\.gz>.source,
          "schema" => 'testschema',
@@ -51,8 +53,8 @@ module Bricolage
      dispatcher = Dispatcher.new(
        event_queue: event_queue,
        task_queue: task_queue,
-        object_buffer: object_buffer,
-        url_patterns: url_patterns,
+        chunk_buffer: chunk_buffer,
+        chunk_router: chunk_router,
        dispatch_interval: 600,
        logger: ctx.logger
      )
@@ -127,12 +129,12 @@ module Bricolage
 
      task_queue = SQSDataSource.new_mock
 
-      object_buffer = ObjectBuffer.new(
+      chunk_buffer = ChunkBuffer.new(
        control_data_source: ctl_ds,
        logger: ctx.logger
      )
 
-      url_patterns = URLPatterns.for_config([
+      chunk_router = ChunkRouter.for_config([
        {
          "url" => %r<\As3://test-bucket/testschema\.(?<table>\w+)/datafile-\d{4}\.json\.gz>.source,
          "schema" => 'testschema',
@@ -143,8 +145,8 @@ module Bricolage
      dispatcher = Dispatcher.new(
        event_queue: event_queue,
        task_queue: task_queue,
-        object_buffer: object_buffer,
-        url_patterns: url_patterns,
+        chunk_buffer: chunk_buffer,
+        chunk_router: chunk_router,
        dispatch_interval: 600,
        logger: ctx.logger
      )
test/streamingload/test_event.rb → test/streamingload/test_dispatchermessage.rb RENAMED
@@ -1,12 +1,12 @@
 require 'test/unit'
-require 'bricolage/streamingload/event'
+require 'bricolage/streamingload/dispatchermessage'
 
 module Bricolage::StreamingLoad
 
-  class TestEvent < Test::Unit::TestCase
+  class TestDispatcherMessage < Test::Unit::TestCase
 
    def new_s3event(message_id: nil, receipt_handle: nil, name: nil, time: nil, source: nil, region: nil, bucket: nil, key: nil, size: nil)
-      S3ObjectEvent.new(
+      S3ObjectDispatcherMessage.new(
        message_id: message_id,
        receipt_handle: receipt_handle,
        name: name,
@@ -21,9 +21,9 @@ module Bricolage::StreamingLoad
 
    test "#created?" do
      e = new_s3event(name: "ObjectCreated:Put")
-      assert_true e.created?
+      assert_true e.created_event?
      e = new_s3event(name: "ObjectCreated:Copy")
-      assert_false e.created?
+      assert_false e.created_event?
    end
 
  end
test/streamingload/test_job.rb CHANGED
@@ -1,5 +1,4 @@
 require 'test/unit'
-require 'bricolage/streamingload/event'
 require 'bricolage/sqsmock'
 require 'bricolage/logger'
 
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.12.0
+  version: 0.13.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-12-25 00:00:00.000000000 Z
+date: 2018-01-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage
@@ -120,19 +120,22 @@ files:
 - lib/bricolage/sqsdatasource.rb
 - lib/bricolage/sqsmock.rb
 - lib/bricolage/streamingload/alertinglogger.rb
+- lib/bricolage/streamingload/chunk.rb
+- lib/bricolage/streamingload/chunkbuffer.rb
+- lib/bricolage/streamingload/chunkrouter.rb
 - lib/bricolage/streamingload/dispatcher.rb
-- lib/bricolage/streamingload/event.rb
+- lib/bricolage/streamingload/dispatchermessage.rb
+- lib/bricolage/streamingload/incomingchunk.rb
 - lib/bricolage/streamingload/job.rb
 - lib/bricolage/streamingload/jobparams.rb
+- lib/bricolage/streamingload/loadermessage.rb
+- lib/bricolage/streamingload/loadtask.rb
 - lib/bricolage/streamingload/manifest.rb
-- lib/bricolage/streamingload/objectbuffer.rb
-- lib/bricolage/streamingload/task.rb
 - lib/bricolage/streamingload/taskhandler.rb
-- lib/bricolage/streamingload/urlpatterns.rb
 - lib/bricolage/streamingload/version.rb
 - test/all.rb
 - test/streamingload/test_dispatcher.rb
-- test/streamingload/test_event.rb
+- test/streamingload/test_dispatchermessage.rb
 - test/streamingload/test_job.rb
 - test/test_sqsdatasource.rb
 homepage: https://github.com/aamine/bricolage-streamingload
lib/bricolage/streamingload/urlpatterns.rb DELETED
@@ -1,59 +0,0 @@
-module Bricolage
-
-  module StreamingLoad
-
-    class URLPatternNotMatched < StandardError; end
-
-
-    class URLPatterns
-
-      def URLPatterns.for_config(configs)
-        new(configs.map {|c|
-          Pattern.new(url: c.fetch('url'), schema: c.fetch('schema'), table: c.fetch('table'))
-        })
-      end
-
-      def initialize(patterns)
-        @patterns = patterns
-      end
-
-      def match(url)
-        @patterns.each do |pat|
-          components = pat.match(url)
-          return components if components
-        end
-        raise URLPatternNotMatched, "no URL pattern matches the object url: #{url.inspect}"
-      end
-
-      class Pattern
-        def initialize(url:, schema:, table:)
-          @url_pattern = /\A#{url}\z/
-          @schema = schema
-          @table = table
-        end
-
-        attr_reader :url_pattern
-        attr_reader :schema
-        attr_reader :table
-
-        def match(url)
-          m = @url_pattern.match(url) or return nil
-          Components.new(get_component(m, @schema), get_component(m, @table))
-        end
-
-        def get_component(m, label)
-          if /\A%/ =~ label
-            m[label[1..-1]]
-          else
-            label
-          end
-        end
-      end
-
-      Components = Struct.new(:schema_name, :table_name)
-
-    end
-
-  end # module StreamingLoad
-
-end # module Bricolage