bricolage-streamingload 0.5.1 → 0.6.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6fcd9200426dc3c2389566e0b6c5618d7ee2ae2f
-  data.tar.gz: 6880ed950da15b2550c706bb0525364286889e23
+  metadata.gz: 17aa54eda4d063cb571f3a7671a4e6413ea079e1
+  data.tar.gz: 1e879a10e505c01a9f66393a079e18de997a3478
 SHA512:
-  metadata.gz: e278f2dbf3e9cc26b3c1420bea5a4e580af7eaf006b01e2bfecd641efa553214583b3291160d80911c9b7ed06ee00164eb76a64b13a4ef7ab586dcfaae790de7
-  data.tar.gz: ea493c3e3f2ee8644c3615dc1667fe2902120c8af5c86bfe71b882020aa967801c70b12eff69c0032f6ee486781fcde475a7c0e18e9d73bdb84ec2a32c62c535
+  metadata.gz: 5f5778b2ecff8e2bf8d06e1ab00e8e1ee394772a24e4812b74b921c8303566638a212632c896b9b46c7ed5918357db89c65116595664628dba1c2e3cfbef3375
+  data.tar.gz: ebb53db3b87334f5c725e28665f9716f2246d1aef4389d635627e31b07c356e0f9a9edea451162ca39505da82dba22f5fd710d8bbdcb24f8e0fd583c2c734955
lib/bricolage/streamingload/alertinglogger.rb CHANGED
@@ -1,19 +1,37 @@
+require 'bricolage/logger'
+require 'logger'
+require 'forwardable'
+
 module Bricolage
   module StreamingLoad
     class AlertingLogger
       extend Forwardable
 
-      def initialize(logger: , sns_datasource: , alert_level: 'warn')
+      def initialize(logger:, sns_datasource:, alert_level: 'warn')
        @logger = logger
-        @sns_logger = Bricolage::Logger.new(device: sns_datasource)
-        @sns_logger.level = Kernel.const_get("Logger").const_get(alert_level.upcase)
+        @alerter = Bricolage::Logger.new(device: sns_datasource)
+        @alerter.level = ::Logger.const_get(alert_level.upcase)
       end
 
      def_delegators '@logger', :level, :level=, :debug?, :info?, :warn?, :error?, :fatal?, :unknown?
 
-      %w(log debug info warn error fatal unknown).each do |m|
+      %w[log debug info warn error fatal unknown].each do |m|
         define_method(m) do |*args|
-          [@logger, @sns_logger].map {|t| t.send(m, *args) }
+          @logger.__send__(m, *args)
+          begin
+            @alerter.__send__(m, *args)
+          rescue Exception => err
+            @logger.error "could not send alert: #{err.message}"
+          end
+        end
+      end
+
+      def exception(ex)
+        @logger.exception(ex)
+        begin
+          @alerter.error(ex.message)
+        rescue Exception => err
+          @logger.error "could not send alert: #{err.message}"
         end
       end
 
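The rewritten AlertingLogger always writes to the primary logger first and treats the SNS-backed alerter as best effort: a failed alert is itself logged but never raised to the caller. A minimal standalone sketch of the same fan-out pattern, using two stdlib `Logger` instances in place of Bricolage's logger and SNS device (`FanOutLogger` and both channels are illustrative names, not the gem's API):

```ruby
require 'logger'

# Fan-out logger: the primary channel always receives the record;
# failures on the alert channel are swallowed and reported on the primary.
class FanOutLogger
  def initialize(primary, alerter)
    @primary = primary
    @alerter = alerter
  end

  %w[debug info warn error fatal unknown].each do |m|
    define_method(m) do |*args|
      @primary.__send__(m, *args)
      begin
        @alerter.__send__(m, *args)
      rescue => err
        @primary.error "could not send alert: #{err.message}"
      end
    end
  end
end

alerter = Logger.new($stderr)
alerter.level = Logger::WARN          # alert channel only sees warn and above
log = FanOutLogger.new(Logger.new($stdout), alerter)
log.warn "disk usage high"            # goes to both channels
```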
lib/bricolage/streamingload/dispatcher.rb CHANGED
@@ -29,16 +29,16 @@ module Bricolage
       config = YAML.load(File.read(config_path))
       logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
       ctx = Context.for_application('.', environment: opts.environment, logger: logger)
-      event_queue = ctx.get_data_source('sqs', config.fetch('event-queue-ds'))
-      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))
+      event_queue = ctx.get_data_source('sqs', config.fetch('event-queue-ds', 'sqs_event'))
+      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds', 'sqs_task'))
       alert_logger = AlertingLogger.new(
         logger: ctx.logger,
-        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds')),
+        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds', 'sns')),
         alert_level: config.fetch('alert-level', 'warn')
       )
 
       object_buffer = ObjectBuffer.new(
-        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
+        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_data')),
         logger: alert_logger
       )
 
@@ -85,12 +85,14 @@ module Bricolage
       @dispatch_interval = dispatch_interval
       @dispatch_message_id = nil
       @logger = logger
+      @dispatch_requested = false
       @checkpoint_requested = false
     end
 
     attr_reader :logger
 
     def event_loop
+      logger.info "dispatcher started"
       set_dispatch_timer
       @event_queue.handle_messages(handler: self, message_class: Event)
       @event_queue.process_async_delete_force
@@ -99,9 +101,17 @@ module Bricolage
 
     # override
     def after_message_batch
+      # must be processed first
       @event_queue.process_async_delete
+
+      if @dispatch_requested
+        dispatch_tasks
+        @dispatch_requested = false
+      end
+
       if @checkpoint_requested
         create_checkpoint
+        @checkpoint_requested = false   # is needless, but reset it just in case
       end
     end
 
@@ -139,13 +149,19 @@ module Bricolage
     end
 
     def handle_dispatch(e)
+      logger.info "dispatching tasks requested"
+      # Dispatching tasks may take 10 minutes or more, which can exceed the
+      # visibility timeout.  To avoid this, delay dispatching until all events
+      # of the current message batch are processed.
       if @dispatch_message_id == e.message_id
-        tasks = @object_buffer.flush_tasks
-        send_tasks tasks
-        set_dispatch_timer
+        @dispatch_requested = true
       end
-      # Delete this event immediately
-      @event_queue.delete_message(e)
+      @event_queue.delete_message_async(e)
+    end
+
+    def dispatch_tasks
+      tasks = @object_buffer.flush_tasks
+      send_tasks tasks
+      set_dispatch_timer
     end
 
     def set_dispatch_timer
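Flushing tasks can outlive the SQS visibility timeout, so `handle_dispatch` now only records a flag and the real work runs in `after_message_batch`, once the batch's pending deletes have been issued. The shape of that deferral pattern, reduced to a toy handler (all names here are illustrative, not Bricolage's API):

```ruby
# Defer heavy work out of the per-message handler: the handler only sets a
# flag; the batch hook runs the expensive step once, after cheap bookkeeping.
class BatchHandler
  def initialize
    @dispatch_requested = false
  end

  def handle_dispatch(message_id, current_timer_id)
    # Only the event matching the current dispatch timer triggers a flush.
    @dispatch_requested = true if message_id == current_timer_id
  end

  def after_message_batch
    flush_deletes            # must happen first, while messages are cheap to ack
    if @dispatch_requested
      dispatch_tasks         # may take minutes; no single SQS message is held open
      @dispatch_requested = false
    end
  end

  private

  def flush_deletes = puts("deletes flushed")
  def dispatch_tasks = puts("tasks dispatched")
end

h = BatchHandler.new
h.handle_dispatch("m1", "m1")
h.after_message_batch
```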
lib/bricolage/streamingload/loader.rb CHANGED
@@ -51,7 +51,7 @@ module Bricolage
           strload_tasks
         where
           task_id = #{@params.task_id}
-          and (task_id not in (select task_id from strload_jobs) or #{@params.force})
+          and (#{@params.force?} or task_id not in (select task_id from strload_jobs))
         returning job_id
         ;
       EndSQL
@@ -60,26 +60,20 @@ module Bricolage
     end
 
     def do_load
-      ManifestFile.create(
-        @params.ctl_bucket,
-        job_id: @job_id,
-        object_urls: @params.object_urls,
-        logger: @logger
-      ) {|manifest|
-        if @params.enable_work_table?
-          prepare_work_table @params.work_table
+      manifest = ManifestFile.create(@params.ctl_bucket, job_id: @job_id, object_urls: @params.object_urls, logger: @logger)
+      if @params.enable_work_table?
+        @connection.transaction {|txn|
+          # NOTE: This transaction ends with truncation, so this DELETE does
+          # nothing from the second time on.  Don't worry about DELETE cost here.
+          @connection.execute("delete from #{@params.work_table}")
           load_objects @params.work_table, manifest, @params.load_options_string
-          @connection.transaction {
-            commit_work_table @params
-            commit_job_result
-          }
-        else
-          @connection.transaction {
-            load_objects @params.dest_table, manifest, @params.load_options_string
-            commit_job_result
-          }
-        end
-      }
+          commit_work_table txn, @params
+        }
+        commit_job_result
+      else
+        load_objects @params.dest_table, manifest, @params.load_options_string
+        commit_job_result
+      end
     rescue JobFailure => ex
       write_job_error 'failure', ex.message
       raise
@@ -88,10 +82,6 @@ module Bricolage
       raise
     end
 
-    def prepare_work_table(work_table)
-      @connection.execute("truncate #{work_table}")
-    end
-
     def load_objects(dest_table, manifest, options)
       @connection.execute(<<-EndSQL.strip.gsub(/\s+/, ' '))
         copy #{dest_table}
@@ -106,14 +96,37 @@ module Bricolage
       @logger.info "load succeeded: #{manifest.url}"
     end
 
-    def commit_work_table(params)
+    def commit_work_table(txn, params)
       @connection.execute(params.sql_source)
-      # keep work table records for later tracking
+      txn.truncate_and_commit(params.work_table)
     end
 
     def commit_job_result
       @end_time = Time.now
-      write_job_result 'success', ''
+      @ctl_ds.open {|conn|
+        conn.transaction {
+          write_job_result conn, 'success', ''
+          update_loaded_flag conn
+        }
+      }
+    end
+
+    def update_loaded_flag(connection)
+      connection.execute(<<-EndSQL)
+        update
+            strload_objects
+        set
+            loaded = true
+        where
+            object_id in (
+              select
+                  object_id
+              from
+                  strload_task_objects
+              where task_id = (select task_id from strload_jobs where job_id = #{@job_id})
+            )
+        ;
+      EndSQL
     end
 
     MAX_MESSAGE_LENGTH = 1000
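`commit_job_result` now writes the job row and flips the objects' loaded flags inside a single transaction on the control database, so a crash between the two statements cannot leave them inconsistent. A sketch of the same two-statement transaction written directly against the pg gem (connection settings, table names, and the literal job id are assumptions for illustration):

```ruby
require 'pg'   # gem install pg

conn = PG.connect(dbname: 'strload_ctl')   # assumed control DB
job_id = 42                                # assumed job id
conn.transaction do |c|
  # Both updates commit together or not at all.
  c.exec_params(<<~SQL, ['success', '', job_id])
    update strload_jobs
       set (status, finish_time, message) = ($1, current_timestamp, $2)
     where job_id = $3
  SQL
  c.exec_params(<<~SQL, [job_id])
    update strload_objects
       set loaded = true
     where object_id in (
             select object_id from strload_task_objects
             where task_id = (select task_id from strload_jobs where job_id = $1)
           )
  SQL
end
```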
@@ -121,23 +134,23 @@ module Bricolage
     def write_job_error(status, message)
       @end_time = Time.now
       @logger.warn message.lines.first
-      write_job_result status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
-    end
-
-    def write_job_result(status, message)
       @ctl_ds.open {|conn|
-        conn.execute(<<-EndSQL)
-          update
-              strload_jobs
-          set
-              (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
-          where
-              job_id = #{@job_id}
-          ;
-        EndSQL
+        write_job_result conn, status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
       }
     end
 
+    def write_job_result(connection, status, message)
+      connection.execute(<<-EndSQL)
+        update
+            strload_jobs
+        set
+            (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
+        where
+            job_id = #{@job_id}
+        ;
+      EndSQL
+    end
+
   end
 
 end
lib/bricolage/streamingload/loaderparams.rb CHANGED
@@ -1,3 +1,4 @@
+require 'bricolage/job'
 require 'bricolage/rubyjobclass'
 require 'bricolage/psqldatasource'
 
@@ -60,8 +61,8 @@ module Bricolage
       @task.table
     end
 
-    def force
-      @task.force
+    def force?
+      @task.force?
     end
 
     def object_urls
lib/bricolage/streamingload/loaderservice.rb CHANGED
@@ -1,3 +1,4 @@
+require 'bricolage/context'
 require 'bricolage/sqsdatasource'
 require 'bricolage/streamingload/task'
 require 'bricolage/streamingload/loader'
@@ -5,6 +6,7 @@ require 'bricolage/streamingload/alertinglogger'
 require 'bricolage/logger'
 require 'bricolage/exception'
 require 'bricolage/version'
+require 'yaml'
 require 'optparse'
 
 module Bricolage
@@ -23,21 +25,25 @@ module Bricolage
       config_path, * = opts.rest_arguments
       config = YAML.load(File.read(config_path))
       logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
-      ctx = Context.for_application('.', environment: opts.environment, logger: logger)
-      redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds'))
-      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))
-      alert_logger = AlertingLogger.new(
-        logger: ctx.logger,
-        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds')),
-        alert_level: config.fetch('alert-level', 'warn')
-      )
+      ctx = Context.for_application(opts.working_dir, environment: opts.environment, logger: logger)
+      redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds', 'db_data'))
+      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds', 'sqs_task'))
+      raw_logger = logger = ctx.logger
+      if config.key?('alert-level')
+        logger = AlertingLogger.new(
+          logger: raw_logger,
+          sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds', 'sns')),
+          alert_level: config.fetch('alert-level', 'warn')
+        )
+      end
 
       service = new(
         context: ctx,
-        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
+        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_ctl')),
         data_source: redshift_ds,
         task_queue: task_queue,
-        logger: alert_logger
+        working_dir: opts.working_dir,
+        logger: logger
       )
 
       if opts.task_id
@@ -46,12 +52,18 @@ module Bricolage
       else
         # Server mode
         Process.daemon(true) if opts.daemon?
+        Dir.chdir '/'
         create_pid_file opts.pid_file_path if opts.pid_file_path
-        service.event_loop
+        begin
+          logger.info "*** bricolage-streaming-loader started: pid=#{$$}"
+          service.event_loop
+          logger.info "*** bricolage-streaming-loader shutdown gracefully: pid=#{$$}"
+        rescue Exception => ex
+          logger.exception(ex)
+          logger.error "*** bricolage-streaming-loader abort: pid=#{$$}"
+          raise
+        end
       end
-    rescue Exception => e
-      alert_logger.error e.message
-      raise
     end
 
     def LoaderService.new_logger(path, config)
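Server mode now chdirs to `/` after daemonizing and brackets the event loop with start/stop logging, so an abort is recorded before the exception is re-raised. The same bracket pattern in isolation (the logger target and loop body are placeholders, not the gem's code):

```ruby
require 'logger'

logger = Logger.new($stdout)

def event_loop
  # placeholder for the real SQS polling loop
  sleep 0.1
end

begin
  logger.info "*** service started: pid=#{$$}"
  event_loop
  logger.info "*** service shutdown gracefully: pid=#{$$}"
rescue Exception => ex
  logger.error ex.message
  logger.error "*** service abort: pid=#{$$}"
  raise   # re-raise so the process still exits non-zero
end
```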
@@ -70,11 +82,12 @@ module Bricolage
       # ignore
     end
 
-    def initialize(context:, control_data_source:, data_source:, task_queue:, logger:)
+    def initialize(context:, control_data_source:, data_source:, task_queue:, working_dir:, logger:)
       @ctx = context
       @ctl_ds = control_data_source
       @ds = data_source
       @task_queue = task_queue
+      @working_dir = working_dir
       @logger = logger
     end
 
@@ -82,7 +95,6 @@ module Bricolage
 
     def event_loop
       @task_queue.handle_messages(handler: self, message_class: Task)
-      @logger.info "shutdown gracefully"
     end
 
     def execute_task_by_id(task_id)
@@ -95,19 +107,23 @@ module Bricolage
 
     # message handler
     def handle_streaming_load_v3(task)
-      # 1. Load task detail from table
-      # 2. Skip disabled (sqs message should not have disabled state since it will never be executed)
-      # 3. Try execute
-      #    - Skip if the task has already been executed AND force = false
-      loadtask = load_task(task.id, force: task.force)
-      return if loadtask.disabled   # skip if disabled, but don't delete sqs msg
-      execute_task(loadtask)
-      # Delete load task immediately (do not use async delete)
-      @task_queue.delete_message(task)
+      Dir.chdir(@working_dir) {
+        loadtask = load_task(task.id, force: task.force?)
+        if loadtask.disabled
+          # Skip if disabled, and don't delete the SQS message.
+          @logger.info "skip disabled task: task_id=#{task.id}"
+          return
+        end
+        execute_task(loadtask)
+        # Do not use async delete
+        @task_queue.delete_message(task)
+      }
+    rescue => ex
+      @logger.exception ex
     end
 
     def execute_task(task)
-      @logger.info "handling load task: table=#{task.qualified_name} task_id=#{task.id}"
+      @logger.info "execute task: task_id=#{task.id} table=#{task.qualified_name}"
       loader = Loader.load_from_file(@ctx, @ctl_ds, task, logger: @logger)
       loader.execute
     end
@@ -119,16 +135,18 @@ module Bricolage
     def initialize(argv)
       @argv = argv
       @task_id = nil
+      @environment = Context::DEFAULT_ENV
       @daemon = false
       @log_file_path = nil
       @pid_file_path = nil
+      @working_dir = Dir.getwd
       @rest_arguments = nil
 
       @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
       opts.on('--task-id=ID', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
         @task_id = task_id
       }
-      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{Context::DEFAULT_ENV}]") {|env|
+      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{@environment}]") {|env|
         @environment = env
       }
       opts.on('--daemon', 'Becomes daemon in server mode.') {
@@ -140,6 +158,9 @@ module Bricolage
       opts.on('--pid-file=PATH', 'Creates PID file.') {|path|
         @pid_file_path = path
       }
+      opts.on('--working-dir=PATH', "Loader working directory. [default: #{@working_dir}]") {|path|
+        @working_dir = path
+      }
       opts.on('--help', 'Prints this message and quit.') {
         puts opts.help
         exit 0
@@ -161,14 +182,18 @@ module Bricolage
       raise OptionError, err.message
     end
 
-    attr_reader :rest_arguments, :environment, :log_file_path
+    attr_reader :rest_arguments
+
     attr_reader :task_id
+    attr_reader :environment
 
     def daemon?
       @daemon
     end
 
+    attr_reader :log_file_path
     attr_reader :pid_file_path
+    attr_reader :working_dir
 
   end
 
lib/bricolage/streamingload/manifest.rb CHANGED
@@ -6,7 +6,12 @@ module Bricolage
 
     def ManifestFile.create(ds, job_id:, object_urls:, logger:, noop: false, &block)
       manifest = new(ds, job_id, object_urls, logger: logger, noop: noop)
-      manifest.create_temporary(&block)
+      if block
+        manifest.create_temporary(&block)
+      else
+        manifest.put
+        return manifest
+      end
     end
 
     def initialize(ds, job_id, object_urls, logger:, noop: false)
@@ -22,7 +27,9 @@ module Bricolage
     end
 
     def name
-      @name ||= "manifest-#{@job_id}.json"
+      return @name if @name
+      now = Time.now
+      "#{now.strftime('%Y/%m/%d')}/manifest-#{now.strftime('%H%M%S')}-#{@job_id}.json"
     end
 
     def url
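Manifest keys are now date-partitioned and carry a time component, which keeps S3 listings manageable and avoids collisions between runs. The naming scheme itself is easy to check in isolation (the helper name is illustrative):

```ruby
# Reproduce the new manifest key layout: YYYY/MM/DD/manifest-HHMMSS-<job_id>.json
def manifest_name(job_id, now = Time.now)
  "#{now.strftime('%Y/%m/%d')}/manifest-#{now.strftime('%H%M%S')}-#{job_id}.json"
end

puts manifest_name(1234, Time.utc(2016, 9, 5, 12, 34, 56))
# => 2016/09/05/manifest-123456-1234.json
```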
lib/bricolage/streamingload/objectbuffer.rb CHANGED
@@ -42,26 +42,43 @@ module Bricolage
 
   class ObjectBuffer
 
+    TASK_GENERATION_TIME_LIMIT = 30   # sec
+
     include SQLUtils
 
     def initialize(control_data_source:, logger:)
       @ctl_ds = control_data_source
       @logger = logger
+      @task_generation_time_limit = TASK_GENERATION_TIME_LIMIT
     end
 
     def put(obj)
       @ctl_ds.open {|conn|
-        insert_object(conn, obj)
+        suppress_sql_logging {
+          conn.transaction {
+            object_id = insert_object(conn, obj)
+            if object_id
+              insert_task_objects(conn, object_id)
+            else
+              insert_dup_object(conn, obj)
+            end
+          }
+        }
       }
     end
 
     # Flushes multiple tables periodically
     def flush_tasks
-      task_ids = nil
-      @ctl_ds.open {|conn|
-        conn.transaction {|txn|
-          task_ids = insert_tasks(conn)
-          insert_task_object_mappings(conn) unless task_ids.empty?
+      task_ids = []
+      warn_slow_task_generation {
+        @ctl_ds.open {|conn|
+          conn.transaction {|txn|
+            task_ids = insert_tasks(conn)
+            unless task_ids.empty?
+              update_task_object_mappings(conn, task_ids)
+              log_mapped_object_num(conn, task_ids)
+            end
+          }
         }
       }
       return task_ids.map {|id| LoadTask.create(task_id: id) }
@@ -73,11 +90,12 @@ module Bricolage
       task_ids = []
       @ctl_ds.open {|conn|
         conn.transaction {|txn|
-          # insert_task_object_mappings may not consume all saved objects
+          # update_task_object_mappings may not consume all saved objects
           # (e.g. there are too many objects for one table), we must create
           # tasks repeatedly until there are no unassigned objects.
           until (ids = insert_tasks_force(conn)).empty?
-            insert_task_object_mappings(conn)
+            update_task_object_mappings(conn, ids)
+            log_mapped_object_num(conn, ids)
             task_ids.concat ids
           end
         }
@@ -91,11 +109,12 @@ module Bricolage
       task_ids = []
       @ctl_ds.open {|conn|
         conn.transaction {|txn|
-          # insert_task_object_mappings may not consume all saved objects
+          # update_task_object_mappings may not consume all saved objects
           # (e.g. there are too many objects for one table), we must create
          # tasks repeatedly until there are no unassigned objects.
           until (ids = insert_table_task_force(conn, table_name)).empty?
-            insert_task_object_mappings(conn)
+            update_task_object_mappings(conn, ids)
+            log_mapped_object_num(conn, ids)
             task_ids.concat ids
           end
         }
@@ -106,30 +125,66 @@ module Bricolage
     private
 
     def insert_object(conn, obj)
-      suppress_sql_logging {
-        conn.update(<<-EndSQL)
-          insert into strload_objects
-              ( object_url
-              , object_size
-              , data_source_id
-              , message_id
-              , event_time
-              , submit_time
-              )
-          select
-              #{s obj.url}
-              , #{obj.size}
-              , #{s obj.data_source_id}
-              , #{s obj.message_id}
-              , '#{obj.event_time}' AT TIME ZONE 'JST'
-              , current_timestamp
-          from
-              strload_tables
-          where
-              data_source_id = #{s obj.data_source_id}
-          ;
-        EndSQL
-      }
+      object_ids = conn.query_values(<<-EndSQL)
+        insert into strload_objects
+            ( object_url
+            , object_size
+            , data_source_id
+            , message_id
+            , event_time
+            , submit_time
+            )
+        values
+            ( #{s obj.url}
+            , #{obj.size}
+            , #{s obj.data_source_id}
+            , #{s obj.message_id}
+            , '#{obj.event_time}' AT TIME ZONE 'JST'
+            , current_timestamp
+            )
+        on conflict on constraint strload_objects_object_url
+        do nothing
+        returning object_id
+        ;
+      EndSQL
+      return object_ids.first
+    end
+
+    def insert_dup_object(conn, obj)
+      @logger.info "Duplicated object received: object_url=#{obj.url}"
+      conn.update(<<-EndSQL)
+        insert into strload_dup_objects
+            ( object_url
+            , object_size
+            , data_source_id
+            , message_id
+            , event_time
+            , submit_time
+            )
+        values
+            ( #{s obj.url}
+            , #{obj.size}
+            , #{s obj.data_source_id}
+            , #{s obj.message_id}
+            , '#{obj.event_time}' AT TIME ZONE 'JST'
+            , current_timestamp
+            )
+        ;
+      EndSQL
+    end
+
+    def insert_task_objects(conn, object_id)
+      conn.update(<<-EndSQL)
+        insert into strload_task_objects
+            ( task_id
+            , object_id
+            )
+        values
+            ( -1
+            , #{object_id}
+            )
+        ;
+      EndSQL
     end
 
     def insert_tasks_force(conn)
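Duplicate suppression has moved into the database: the INSERT uses `ON CONFLICT ... DO NOTHING` with `RETURNING`, so a duplicate URL yields no `object_id` and the caller records the event in `strload_dup_objects` instead. A minimal reproduction of that pattern with the pg gem (the database, toy table, and constraint name are assumptions standing in for the real schema):

```ruby
require 'pg'   # gem install pg

conn = PG.connect(dbname: 'postgres')   # assumed local database
conn.exec(<<~SQL)                       # toy stand-in for strload_objects
  create temporary table strload_objects (
    object_id  serial primary key,
    object_url text constraint strload_objects_object_url unique
  )
SQL

2.times do
  res = conn.exec_params(<<~SQL, ['s3://bucket/key.json'])
    insert into strload_objects (object_url)
    values ($1)
    on conflict on constraint strload_objects_object_url do nothing
    returning object_id
  SQL
  if res.ntuples.zero?
    puts "duplicate object, not inserted"    # second iteration lands here
  else
    puts "new object_id=#{res[0]['object_id']}"
  end
end
```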
@@ -140,14 +195,12 @@ module Bricolage
       task_ids = conn.query_values(<<-EndSQL)
         insert into strload_tasks
             ( task_class
-            , schema_name
-            , table_name
+            , table_id
             , submit_time
             )
         select
             'streaming_load_v3'
-            , tbl.schema_name
-            , tbl.table_name
+            , tbl.table_id
             , current_timestamp
         from
             strload_tables tbl
@@ -158,19 +211,9 @@ module Bricolage
                 data_source_id
                 , count(*) as object_count
             from
-                (
-                  select
-                      min(object_id) as object_id
-                      , object_url
-                  from
-                      strload_objects
-                  group by
-                      object_url
-                ) uniq_objects
-                inner join strload_objects using (object_id)
-                left outer join strload_task_objects using (object_id)
+                strload_objects
             where
-                task_id is null   -- not assigned to a task
+                object_id in (select object_id from strload_task_objects where task_id = -1)
             group by
                 data_source_id
         ) obj
@@ -179,28 +222,27 @@ module Bricolage
         -- preceding task's submit time
         left outer join (
             select
-                schema_name
-                , table_name
+                table_id
                 , max(submit_time) as latest_submit_time
             from
                 strload_tasks
             group by
-                schema_name, table_name
+                table_id
         ) task
-        using (schema_name, table_name)
+        using (table_id)
         where
             not tbl.disabled   -- not disabled
             and (
               #{force ? "true or" : ""}   -- Creates tasks with no conditions if forced
               obj.object_count > tbl.load_batch_size   -- batch_size exceeded?
-              or extract(epoch from current_timestamp - latest_submit_time) > load_interval   -- load_interval exceeded?
-              or latest_submit_time is null   -- no previous tasks?
+              or extract(epoch from current_timestamp - task.latest_submit_time) > tbl.load_interval   -- load_interval exceeded?
+              or task.latest_submit_time is null   -- no previous tasks?
             )
         returning task_id
         ;
      EndSQL
 
-      @logger.info "Number of task created: #{task_ids.size}"
+      log_created_tasks task_ids
       task_ids
     end
 
@@ -208,14 +250,12 @@ module Bricolage
       task_ids = conn.query_values(<<-EndSQL)
         insert into strload_tasks
             ( task_class
-            , schema_name
-            , table_name
+            , table_id
             , submit_time
             )
         select
             'streaming_load_v3'
-            , tbl.schema_name
-            , tbl.table_name
+            , tbl.table_id
             , current_timestamp
         from
             strload_tables tbl
@@ -227,21 +267,9 @@ module Bricolage
                 data_source_id
                 , count(*) as object_count
             from
-                (
-                  select
-                      min(object_id) as object_id
-                      , object_url
-                  from
-                      strload_objects
-                  where
-                      data_source_id = #{s table_name}
-                  group by
-                      object_url
-                ) uniq_objects
-                inner join strload_objects using (object_id)
-                left outer join strload_task_objects using (object_id)
+                strload_objects
             where
-                task_id is null   -- not assigned to a task
+                object_id in (select object_id from strload_task_objects where task_id = -1)
             group by
                 data_source_id
         ) obj
@@ -254,70 +282,55 @@ module Bricolage
       EndSQL
 
       # It must be 1
-      @logger.info "Number of task created: #{task_ids.size}"
+      log_created_tasks(task_ids)
       task_ids
     end
 
-    def insert_task_object_mappings(conn)
+    def update_task_object_mappings(conn, task_ids)
       conn.update(<<-EndSQL)
-        insert into strload_task_objects
-            ( task_id
-            , object_id
-            )
+        update strload_task_objects dst
+        set
+            task_id = tasks.task_id
+        from
+            strload_tasks tasks
+            inner join strload_tables tables using (table_id)
+            inner join (
+              select
+                  object_id
+                  , data_source_id
+                  , row_number() over (partition by data_source_id order by object_id) as object_seq
+              from
+                  strload_objects
+              where
+                  object_id in (select object_id from strload_task_objects where task_id = -1)
+            ) tsk_obj
+            using (data_source_id)
+        where
+            dst.task_id = -1
+            and tasks.task_id in (#{task_ids.join(",")})
+            and dst.object_id = tsk_obj.object_id
+            and tsk_obj.object_seq <= tables.load_batch_size
+        ;
+      EndSQL
+    end
+
+    def log_mapped_object_num(conn, task_ids)
+      # This method is required since UPDATE does not support returning multiple values
+      rows = conn.query_values(<<-EndSQL)
         select
             task_id
-            , object_id
-        from (
-            select
-                row_number() over (partition by task.task_id order by obj.object_id) as object_count
-                , task.task_id
-                , obj.object_id
-                , load_batch_size
-            from
-                -- unassigned objects
-                (
-                  select
-                      data_source_id
-                      , uniq_objects.object_url
-                      , object_id
-                  from
-                      (
-                        select
-                            min(object_id) as object_id
-                            , object_url
-                        from
-                            strload_objects
-                        group by
-                            object_url
-                      ) uniq_objects
-                      inner join strload_objects using(object_id)
-                      left outer join strload_task_objects using(object_id)
-                  where
-                      task_id is null
-                ) obj
-
-                -- tasks without objects
-                inner join (
-                  select
-                      tbl.data_source_id
-                      , min(task_id) as task_id   -- pick up oldest task
-                      , max(load_batch_size) as load_batch_size
-                  from
-                      strload_tasks
-                      inner join strload_tables tbl
-                          using (schema_name, table_name)
-                  where
-                      -- unassigned objects
-                      task_id not in (select distinct task_id from strload_task_objects)
-                  group by
-                      1
-                ) task
-                using (data_source_id)
-            ) as t
+            , count(*)
+        from
+            strload_task_objects
         where
-            object_count <= load_batch_size   -- limit number of objects assigned to single task
+            task_id in (#{task_ids.join(',')})
+        group by
+            task_id
        ;
      EndSQL
+      rows.each_slice(2) do |task_id, object_count|
+        @logger.info "Number of objects assigned to task: task_id=#{task_id} object_count=#{object_count}"
+      end
     end
 
     def suppress_sql_logging
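`log_mapped_object_num` reads its two-column result through `query_values`, which flattens the rows into one array; `each_slice(2)` then re-pairs each task_id with its count. That flattening assumption is the whole trick:

```ruby
# query_values-style flat result: [task_id, count, task_id, count, ...]
rows = [11, 250, 12, 3]
rows.each_slice(2) do |task_id, object_count|
  puts "task_id=#{task_id} object_count=#{object_count}"
end
# task_id=11 object_count=250
# task_id=12 object_count=3
```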
@@ -331,6 +344,22 @@ module Bricolage
       end
     end
 
+    def log_created_tasks(task_ids)
+      created_task_num = task_ids.size
+      @logger.info "Number of task created: #{created_task_num}"
+      @logger.info "Created task ids: #{task_ids}" if created_task_num > 0
+    end
+
+    def warn_slow_task_generation(&block)
+      start_time = Time.now
+      yield
+      exec_time = (Time.now - start_time)
+      if exec_time > @task_generation_time_limit
+        @logger.warn "Long task generation time: #{exec_time}"
+        @task_generation_time_limit = @task_generation_time_limit * 1.1
+      end
+    end
+
   end
 
 end
lib/bricolage/streamingload/task.rb CHANGED
@@ -35,7 +35,7 @@ module Bricolage
     def LoadTask.parse_sqs_record(msg, rec)
       {
         task_id: rec['taskId'],
-        force: rec['force'],
+        force: (rec['force'].to_s == 'true')
       }
     end
 
@@ -49,7 +49,7 @@ module Bricolage
         from
             strload_tasks tsk
             inner join strload_tables tbl
-                using(schema_name, table_name)
+                using(table_id)
         where
             task_id = #{task_id}
         ;
@@ -94,7 +94,11 @@ module Bricolage
       @disabled = disabled
     end
 
-    attr_reader :id, :force
+    attr_reader :id
+
+    def force?
+      !!@force
+    end
 
     #
     # For writer only
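Because the SQS payload is JSON, `rec['force']` may arrive as `true`, `"true"`, `nil`, or be absent entirely; coercing through `to_s == 'true'` and exposing a `force?` predicate makes the flag unambiguously boolean. The coercion behaves like this:

```ruby
# Coerce a JSON-sourced flag to a strict boolean, as the new parse does.
def parse_force(value)
  value.to_s == 'true'
end

p parse_force(true)     # => true
p parse_force('true')   # => true
p parse_force('false')  # => false
p parse_force(nil)      # => false
```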
lib/bricolage/streamingload/version.rb CHANGED
@@ -1,5 +1,5 @@
 module Bricolage
   module StreamingLoad
-    VERSION = '0.5.1'
+    VERSION = '0.6.0'
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.6.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-08-24 00:00:00.000000000 Z
+date: 2016-09-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage
@@ -17,42 +17,42 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 5.16.8
+        version: 5.16.9
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 5.16.8
+        version: 5.16.9
 - !ruby/object:Gem::Dependency
   name: pg
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - '='
      - !ruby/object:Gem::Version
-        version: '0'
+        version: 0.18.4
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - '='
      - !ruby/object:Gem::Version
-        version: '0'
+        version: 0.18.4
 - !ruby/object:Gem::Dependency
   name: aws-sdk
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - '='
      - !ruby/object:Gem::Version
-        version: '2'
+        version: 2.5.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - '='
      - !ruby/object:Gem::Version
-        version: '2'
+        version: 2.5.6
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -138,7 +138,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
    - !ruby/object:Gem::Version
-      version: 2.0.0
+      version: 2.1.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="