bricolage-streamingload 0.5.1 → 0.6.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6fcd9200426dc3c2389566e0b6c5618d7ee2ae2f
-  data.tar.gz: 6880ed950da15b2550c706bb0525364286889e23
+  metadata.gz: 17aa54eda4d063cb571f3a7671a4e6413ea079e1
+  data.tar.gz: 1e879a10e505c01a9f66393a079e18de997a3478
 SHA512:
-  metadata.gz: e278f2dbf3e9cc26b3c1420bea5a4e580af7eaf006b01e2bfecd641efa553214583b3291160d80911c9b7ed06ee00164eb76a64b13a4ef7ab586dcfaae790de7
-  data.tar.gz: ea493c3e3f2ee8644c3615dc1667fe2902120c8af5c86bfe71b882020aa967801c70b12eff69c0032f6ee486781fcde475a7c0e18e9d73bdb84ec2a32c62c535
+  metadata.gz: 5f5778b2ecff8e2bf8d06e1ab00e8e1ee394772a24e4812b74b921c8303566638a212632c896b9b46c7ed5918357db89c65116595664628dba1c2e3cfbef3375
+  data.tar.gz: ebb53db3b87334f5c725e28665f9716f2246d1aef4389d635627e31b07c356e0f9a9edea451162ca39505da82dba22f5fd710d8bbdcb24f8e0fd583c2c734955
lib/bricolage/streamingload/alertinglogger.rb CHANGED
@@ -1,19 +1,37 @@
+require 'bricolage/logger'
+require 'logger'
+require 'forwardable'
+
 module Bricolage
   module StreamingLoad
     class AlertingLogger
       extend Forwardable
 
-      def initialize(logger: , sns_datasource: , alert_level: 'warn')
+      def initialize(logger:, sns_datasource:, alert_level: 'warn')
         @logger = logger
-        @sns_logger = Bricolage::Logger.new(device: sns_datasource)
-        @sns_logger.level = Kernel.const_get("Logger").const_get(alert_level.upcase)
+        @alerter = Bricolage::Logger.new(device: sns_datasource)
+        @alerter.level = ::Logger.const_get(alert_level.upcase)
       end
 
       def_delegators '@logger', :level, :level=, :debug?, :info?, :warn?, :error?, :fatal?, :unknown?
 
-      %w(log debug info warn error fatal unknown).each do |m|
+      %w[log debug info warn error fatal unknown].each do |m|
         define_method(m) do |*args|
-          [@logger, @sns_logger].map {|t| t.send(m, *args) }
+          @logger.__send__(m, *args)
+          begin
+            @alerter.__send__(m, *args)
+          rescue Exception => err
+            @logger.error "could not send alert: #{err.message}"
+          end
+        end
+      end
+
+      def exception(ex)
+        @logger.exception(ex)
+        begin
+          @alerter.error(ex.message)
+        rescue Exception => err
+          @logger.error "could not send alert: #{err.message}"
         end
       end
 
lib/bricolage/streamingload/dispatcher.rb CHANGED
@@ -29,16 +29,16 @@ module Bricolage
       config = YAML.load(File.read(config_path))
       logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
       ctx = Context.for_application('.', environment: opts.environment, logger: logger)
-      event_queue = ctx.get_data_source('sqs', config.fetch('event-queue-ds'))
-      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))
+      event_queue = ctx.get_data_source('sqs', config.fetch('event-queue-ds', 'sqs_event'))
+      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds', 'sqs_task'))
       alert_logger = AlertingLogger.new(
         logger: ctx.logger,
-        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds')),
+        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds', 'sns')),
         alert_level: config.fetch('alert-level', 'warn')
       )
 
       object_buffer = ObjectBuffer.new(
-        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
+        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_data')),
         logger: alert_logger
       )
 
@@ -85,12 +85,14 @@ module Bricolage
       @dispatch_interval = dispatch_interval
       @dispatch_message_id = nil
       @logger = logger
+      @dispatch_requested = false
       @checkpoint_requested = false
     end
 
     attr_reader :logger
 
     def event_loop
+      logger.info "dispatcher started"
       set_dispatch_timer
       @event_queue.handle_messages(handler: self, message_class: Event)
       @event_queue.process_async_delete_force
@@ -99,9 +101,17 @@ module Bricolage
 
     # override
     def after_message_batch
+      # must be processed first
       @event_queue.process_async_delete
+
+      if @dispatch_requested
+        dispatch_tasks
+        @dispatch_requested = false
+      end
+
       if @checkpoint_requested
         create_checkpoint
+        @checkpoint_requested = false   # not strictly needed, but reset it just in case
       end
     end
 
@@ -139,13 +149,19 @@ module Bricolage
     end
 
     def handle_dispatch(e)
+      logger.info "dispatching tasks requested"
+      # Dispatching tasks may take 10 minutes or more, which can exceed the visibility timeout.
+      # To avoid this, delay dispatching until all events in the current message batch are processed.
       if @dispatch_message_id == e.message_id
-        tasks = @object_buffer.flush_tasks
-        send_tasks tasks
-        set_dispatch_timer
+        @dispatch_requested = true
       end
-      # Delete this event immediately
-      @event_queue.delete_message(e)
+      @event_queue.delete_message_async(e)
+    end
+
+    def dispatch_tasks
+      tasks = @object_buffer.flush_tasks
+      send_tasks tasks
+      set_dispatch_timer
     end
 
     def set_dispatch_timer
lib/bricolage/streamingload/loader.rb CHANGED
@@ -51,7 +51,7 @@ module Bricolage
           strload_tasks
         where
           task_id = #{@params.task_id}
-          and (task_id not in (select task_id from strload_jobs) or #{@params.force})
+          and (#{@params.force?} or task_id not in (select task_id from strload_jobs))
         returning job_id
         ;
       EndSQL
@@ -60,26 +60,20 @@ module Bricolage
     end
 
     def do_load
-      ManifestFile.create(
-        @params.ctl_bucket,
-        job_id: @job_id,
-        object_urls: @params.object_urls,
-        logger: @logger
-      ) {|manifest|
-        if @params.enable_work_table?
-          prepare_work_table @params.work_table
+      manifest = ManifestFile.create(@params.ctl_bucket, job_id: @job_id, object_urls: @params.object_urls, logger: @logger)
+      if @params.enable_work_table?
+        @connection.transaction {|txn|
+          # NOTE: This transaction ends with truncation, so this DELETE does nothing
+          # from the second time on.  Don't worry about the DELETE cost here.
+          @connection.execute("delete from #{@params.work_table}")
           load_objects @params.work_table, manifest, @params.load_options_string
-          @connection.transaction {
-            commit_work_table @params
-            commit_job_result
-          }
-        else
-          @connection.transaction {
-            load_objects @params.dest_table, manifest, @params.load_options_string
-            commit_job_result
-          }
-        end
-      }
+          commit_work_table txn, @params
+        }
+        commit_job_result
+      else
+        load_objects @params.dest_table, manifest, @params.load_options_string
+        commit_job_result
+      end
     rescue JobFailure => ex
       write_job_error 'failure', ex.message
       raise
@@ -88,10 +82,6 @@ module Bricolage
       raise
     end
 
-    def prepare_work_table(work_table)
-      @connection.execute("truncate #{work_table}")
-    end
-
     def load_objects(dest_table, manifest, options)
       @connection.execute(<<-EndSQL.strip.gsub(/\s+/, ' '))
         copy #{dest_table}
@@ -106,14 +96,37 @@ module Bricolage
       @logger.info "load succeeded: #{manifest.url}"
     end
 
-    def commit_work_table(params)
+    def commit_work_table(txn, params)
       @connection.execute(params.sql_source)
-      # keep work table records for later tracking
+      txn.truncate_and_commit(params.work_table)
     end
 
     def commit_job_result
       @end_time = Time.now
-      write_job_result 'success', ''
+      @ctl_ds.open {|conn|
+        conn.transaction {
+          write_job_result conn, 'success', ''
+          update_loaded_flag conn
+        }
+      }
+    end
+
+    def update_loaded_flag(connection)
+      connection.execute(<<-EndSQL)
+        update
+          strload_objects
+        set
+          loaded = true
+        where
+          object_id in (
+            select
+              object_id
+            from
+              strload_task_objects
+            where task_id = (select task_id from strload_jobs where job_id = #{@job_id})
+          )
+        ;
+      EndSQL
     end
 
     MAX_MESSAGE_LENGTH = 1000
@@ -121,23 +134,23 @@ module Bricolage
     def write_job_error(status, message)
       @end_time = Time.now
       @logger.warn message.lines.first
-      write_job_result status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
-    end
-
-    def write_job_result(status, message)
       @ctl_ds.open {|conn|
-        conn.execute(<<-EndSQL)
-          update
-            strload_jobs
-          set
-            (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
-          where
-            job_id = #{@job_id}
-          ;
-        EndSQL
+        write_job_result conn, status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
       }
     end
 
+    def write_job_result(connection, status, message)
+      connection.execute(<<-EndSQL)
+        update
+          strload_jobs
+        set
+          (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
+        where
+          job_id = #{@job_id}
+        ;
+      EndSQL
+    end
+
   end
 
 end
lib/bricolage/streamingload/loaderparams.rb CHANGED
@@ -1,3 +1,4 @@
+require 'bricolage/job'
 require 'bricolage/rubyjobclass'
 require 'bricolage/psqldatasource'
 
@@ -60,8 +61,8 @@ module Bricolage
       @task.table
     end
 
-    def force
-      @task.force
+    def force?
+      @task.force?
     end
 
     def object_urls
lib/bricolage/streamingload/loaderservice.rb CHANGED
@@ -1,3 +1,4 @@
+require 'bricolage/context'
 require 'bricolage/sqsdatasource'
 require 'bricolage/streamingload/task'
 require 'bricolage/streamingload/loader'
@@ -5,6 +6,7 @@ require 'bricolage/streamingload/alertinglogger'
 require 'bricolage/logger'
 require 'bricolage/exception'
 require 'bricolage/version'
+require 'yaml'
 require 'optparse'
 
 module Bricolage
@@ -23,21 +25,25 @@ module Bricolage
       config_path, * = opts.rest_arguments
       config = YAML.load(File.read(config_path))
       logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
-      ctx = Context.for_application('.', environment: opts.environment, logger: logger)
-      redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds'))
-      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))
-      alert_logger = AlertingLogger.new(
-        logger: ctx.logger,
-        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds')),
-        alert_level: config.fetch('alert-level', 'warn')
-      )
+      ctx = Context.for_application(opts.working_dir, environment: opts.environment, logger: logger)
+      redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds', 'db_data'))
+      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds', 'sqs_task'))
+      raw_logger = logger = ctx.logger
+      if config.key?('alert-level')
+        logger = AlertingLogger.new(
+          logger: raw_logger,
+          sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds', 'sns')),
+          alert_level: config.fetch('alert-level', 'warn')
+        )
+      end
 
       service = new(
         context: ctx,
-        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
+        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_ctl')),
         data_source: redshift_ds,
         task_queue: task_queue,
-        logger: alert_logger
+        working_dir: opts.working_dir,
+        logger: logger
       )
 
       if opts.task_id
@@ -46,12 +52,18 @@ module Bricolage
       else
         # Server mode
         Process.daemon(true) if opts.daemon?
+        Dir.chdir '/'
        create_pid_file opts.pid_file_path if opts.pid_file_path
-        service.event_loop
+        begin
+          logger.info "*** bricolage-streaming-loader started: pid=#{$$}"
+          service.event_loop
+          logger.info "*** bricolage-streaming-loader shutdown gracefully: pid=#{$$}"
+        rescue Exception => ex
+          logger.exception(ex)
+          logger.error "*** bricolage-streaming-loader abort: pid=#{$$}"
+          raise
+        end
       end
-    rescue Exception => e
-      alert_logger.error e.message
-      raise
     end
 
     def LoaderService.new_logger(path, config)
@@ -70,11 +82,12 @@ module Bricolage
       # ignore
     end
 
-    def initialize(context:, control_data_source:, data_source:, task_queue:, logger:)
+    def initialize(context:, control_data_source:, data_source:, task_queue:, working_dir:, logger:)
       @ctx = context
       @ctl_ds = control_data_source
       @ds = data_source
       @task_queue = task_queue
+      @working_dir = working_dir
       @logger = logger
     end
 
@@ -82,7 +95,6 @@ module Bricolage
 
     def event_loop
       @task_queue.handle_messages(handler: self, message_class: Task)
-      @logger.info "shutdown gracefully"
     end
 
     def execute_task_by_id(task_id)
@@ -95,19 +107,23 @@ module Bricolage
 
     # message handler
     def handle_streaming_load_v3(task)
-      # 1. Load task detail from table
-      # 2. Skip disabled (sqs message should not have disabled state since it will never be executed)
-      # 3. Try execute
-      #   - Skip if the task has already been executed AND force = false
-      loadtask = load_task(task.id, force: task.force)
-      return if loadtask.disabled # skip if disabled, but don't delete sqs msg
-      execute_task(loadtask)
-      # Delete load task immediately (do not use async delete)
-      @task_queue.delete_message(task)
+      Dir.chdir(@working_dir) {
+        loadtask = load_task(task.id, force: task.force?)
+        if loadtask.disabled
+          # Skip if disabled, and don't delete the SQS message.
+          @logger.info "skip disabled task: task_id=#{task.id}"
+          return
+        end
+        execute_task(loadtask)
+        # Do not use async delete
+        @task_queue.delete_message(task)
+      }
+    rescue => ex
+      @logger.exception ex
     end
 
     def execute_task(task)
-      @logger.info "handling load task: table=#{task.qualified_name} task_id=#{task.id}"
+      @logger.info "execute task: task_id=#{task.id} table=#{task.qualified_name}"
       loader = Loader.load_from_file(@ctx, @ctl_ds, task, logger: @logger)
       loader.execute
     end
@@ -119,16 +135,18 @@ module Bricolage
     def initialize(argv)
       @argv = argv
       @task_id = nil
+      @environment = Context::DEFAULT_ENV
       @daemon = false
       @log_file_path = nil
       @pid_file_path = nil
+      @working_dir = Dir.getwd
       @rest_arguments = nil
 
       @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
       opts.on('--task-id=ID', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
         @task_id = task_id
       }
-      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{Context::DEFAULT_ENV}]") {|env|
+      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{@environment}]") {|env|
         @environment = env
       }
       opts.on('--daemon', 'Becomes daemon in server mode.') {
@@ -140,6 +158,9 @@ module Bricolage
       opts.on('--pid-file=PATH', 'Creates PID file.') {|path|
         @pid_file_path = path
       }
+      opts.on('--working-dir=PATH', "Loader working directory. [default: #{@working_dir}]") {|path|
+        @working_dir = path
+      }
       opts.on('--help', 'Prints this message and quit.') {
         puts opts.help
         exit 0
@@ -161,14 +182,18 @@ module Bricolage
       raise OptionError, err.message
     end
 
-    attr_reader :rest_arguments, :environment, :log_file_path
+    attr_reader :rest_arguments
+
     attr_reader :task_id
+    attr_reader :environment
 
     def daemon?
       @daemon
     end
 
+    attr_reader :log_file_path
     attr_reader :pid_file_path
+    attr_reader :working_dir
 
   end
 
lib/bricolage/streamingload/manifest.rb CHANGED
@@ -6,7 +6,12 @@ module Bricolage
 
     def ManifestFile.create(ds, job_id:, object_urls:, logger:, noop: false, &block)
       manifest = new(ds, job_id, object_urls, logger: logger, noop: noop)
-      manifest.create_temporary(&block)
+      if block
+        manifest.create_temporary(&block)
+      else
+        manifest.put
+        return manifest
+      end
     end
 
     def initialize(ds, job_id, object_urls, logger:, noop: false)
@@ -22,7 +27,9 @@ module Bricolage
     end
 
     def name
-      @name ||= "manifest-#{@job_id}.json"
+      return @name if @name
+      now = Time.now
+      "#{now.strftime('%Y/%m/%d')}/manifest-#{now.strftime('%H%M%S')}-#{@job_id}.json"
     end
 
     def url
lib/bricolage/streamingload/objectbuffer.rb CHANGED
@@ -42,26 +42,43 @@
 
   class ObjectBuffer
 
+    TASK_GENERATION_TIME_LIMIT = 30   # sec
+
     include SQLUtils
 
     def initialize(control_data_source:, logger:)
       @ctl_ds = control_data_source
       @logger = logger
+      @task_generation_time_limit = TASK_GENERATION_TIME_LIMIT
     end
 
     def put(obj)
       @ctl_ds.open {|conn|
-        insert_object(conn, obj)
+        suppress_sql_logging {
+          conn.transaction {
+            object_id = insert_object(conn, obj)
+            if object_id
+              insert_task_objects(conn, object_id)
+            else
+              insert_dup_object(conn, obj)
+            end
+          }
+        }
       }
     end
 
     # Flushes multiple tables periodically
     def flush_tasks
-      task_ids = nil
-      @ctl_ds.open {|conn|
-        conn.transaction {|txn|
-          task_ids = insert_tasks(conn)
-          insert_task_object_mappings(conn) unless task_ids.empty?
+      task_ids = []
+      warn_slow_task_generation {
+        @ctl_ds.open {|conn|
+          conn.transaction {|txn|
+            task_ids = insert_tasks(conn)
+            unless task_ids.empty?
+              update_task_object_mappings(conn, task_ids)
+              log_mapped_object_num(conn, task_ids)
+            end
+          }
        }
      }
      return task_ids.map {|id| LoadTask.create(task_id: id) }
@@ -73,11 +90,12 @@ module Bricolage
       task_ids = []
       @ctl_ds.open {|conn|
         conn.transaction {|txn|
-          # insert_task_object_mappings may not consume all saved objects
+          # update_task_object_mappings may not consume all saved objects
           # (e.g. there are too many objects for one table), so we must create
           # tasks repeatedly until there are no unassigned objects.
           until (ids = insert_tasks_force(conn)).empty?
-            insert_task_object_mappings(conn)
+            update_task_object_mappings(conn, ids)
+            log_mapped_object_num(conn, ids)
             task_ids.concat ids
           end
         }
@@ -91,11 +109,12 @@ module Bricolage
       task_ids = []
       @ctl_ds.open {|conn|
         conn.transaction {|txn|
-          # insert_task_object_mappings may not consume all saved objects
+          # update_task_object_mappings may not consume all saved objects
           # (e.g. there are too many objects for one table), so we must create
           # tasks repeatedly until there are no unassigned objects.
           until (ids = insert_table_task_force(conn, table_name)).empty?
-            insert_task_object_mappings(conn)
+            update_task_object_mappings(conn, ids)
+            log_mapped_object_num(conn, ids)
             task_ids.concat ids
           end
         }
@@ -106,30 +125,66 @@ module Bricolage
     private
 
     def insert_object(conn, obj)
-      suppress_sql_logging {
-        conn.update(<<-EndSQL)
-          insert into strload_objects
-              ( object_url
-              , object_size
-              , data_source_id
-              , message_id
-              , event_time
-              , submit_time
-              )
-          select
-              #{s obj.url}
-              , #{obj.size}
-              , #{s obj.data_source_id}
-              , #{s obj.message_id}
-              , '#{obj.event_time}' AT TIME ZONE 'JST'
-              , current_timestamp
-          from
-              strload_tables
-          where
-              data_source_id = #{s obj.data_source_id}
-          ;
-        EndSQL
-      }
+      object_ids = conn.query_values(<<-EndSQL)
+        insert into strload_objects
+            ( object_url
+            , object_size
+            , data_source_id
+            , message_id
+            , event_time
+            , submit_time
+            )
+        values
+            ( #{s obj.url}
+            , #{obj.size}
+            , #{s obj.data_source_id}
+            , #{s obj.message_id}
+            , '#{obj.event_time}' AT TIME ZONE 'JST'
+            , current_timestamp
+            )
+        on conflict on constraint strload_objects_object_url
+        do nothing
+        returning object_id
+        ;
+      EndSQL
+      return object_ids.first
+    end
+
+    def insert_dup_object(conn, obj)
+      @logger.info "Duplicated object received: object_url=#{obj.url}"
+      conn.update(<<-EndSQL)
+        insert into strload_dup_objects
+            ( object_url
+            , object_size
+            , data_source_id
+            , message_id
+            , event_time
+            , submit_time
+            )
+        values
+            ( #{s obj.url}
+            , #{obj.size}
+            , #{s obj.data_source_id}
+            , #{s obj.message_id}
+            , '#{obj.event_time}' AT TIME ZONE 'JST'
+            , current_timestamp
+            )
+        ;
+      EndSQL
+    end
+
+    def insert_task_objects(conn, object_id)
+      conn.update(<<-EndSQL)
+        insert into strload_task_objects
+            ( task_id
+            , object_id
+            )
+        values
+            ( -1
+            , #{object_id}
+            )
+        ;
+      EndSQL
     end
 
     def insert_tasks_force(conn)
@@ -140,14 +195,12 @@ module Bricolage
       task_ids = conn.query_values(<<-EndSQL)
         insert into strload_tasks
             ( task_class
-            , schema_name
-            , table_name
+            , table_id
             , submit_time
             )
         select
             'streaming_load_v3'
-            , tbl.schema_name
-            , tbl.table_name
+            , tbl.table_id
             , current_timestamp
         from
            strload_tables tbl
@@ -158,19 +211,9 @@ module Bricolage
                 data_source_id
                 , count(*) as object_count
               from
-                (
-                  select
-                      min(object_id) as object_id
-                      , object_url
-                  from
-                      strload_objects
-                  group by
-                      object_url
-                ) uniq_objects
-                inner join strload_objects using (object_id)
-                left outer join strload_task_objects using (object_id)
+                strload_objects
              where
-                task_id is null -- not assigned to a task
+                object_id in (select object_id from strload_task_objects where task_id = -1)
              group by
                 data_source_id
            ) obj
@@ -179,28 +222,27 @@ module Bricolage
            -- preceding task's submit time
            left outer join (
              select
-                schema_name
-                , table_name
+                table_id
                , max(submit_time) as latest_submit_time
              from
                strload_tasks
              group by
-                schema_name, table_name
+                table_id
            ) task
-            using (schema_name, table_name)
+            using (table_id)
          where
            not tbl.disabled -- not disabled
            and (
              #{force ? "true or" : ""} -- Creates tasks unconditionally if forced
              obj.object_count > tbl.load_batch_size -- batch_size exceeded?
-              or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
-              or latest_submit_time is null -- no previous tasks?
+              or extract(epoch from current_timestamp - task.latest_submit_time) > tbl.load_interval -- load_interval exceeded?
+              or task.latest_submit_time is null -- no previous tasks?
            )
          returning task_id
          ;
      EndSQL
 
-      @logger.info "Number of tasks created: #{task_ids.size}"
+      log_created_tasks task_ids
      task_ids
    end
 
@@ -208,14 +250,12 @@ module Bricolage
       task_ids = conn.query_values(<<-EndSQL)
         insert into strload_tasks
             ( task_class
-            , schema_name
-            , table_name
+            , table_id
             , submit_time
             )
         select
             'streaming_load_v3'
-            , tbl.schema_name
-            , tbl.table_name
+            , tbl.table_id
             , current_timestamp
         from
            strload_tables tbl
@@ -227,21 +267,9 @@ module Bricolage
                 data_source_id
                 , count(*) as object_count
               from
-                (
-                  select
-                      min(object_id) as object_id
-                      , object_url
-                  from
-                      strload_objects
-                  where
-                      data_source_id = #{s table_name}
-                  group by
-                      object_url
-                ) uniq_objects
-                inner join strload_objects using (object_id)
-                left outer join strload_task_objects using (object_id)
+                strload_objects
              where
-                task_id is null -- not assigned to a task
+                object_id in (select object_id from strload_task_objects where task_id = -1)
              group by
                 data_source_id
            ) obj
@@ -254,70 +282,55 @@ module Bricolage
      EndSQL
 
      # It must be 1
-      @logger.info "Number of tasks created: #{task_ids.size}"
+      log_created_tasks(task_ids)
      task_ids
    end
 
-    def insert_task_object_mappings(conn)
+    def update_task_object_mappings(conn, task_ids)
      conn.update(<<-EndSQL)
-        insert into strload_task_objects
-            ( task_id
-            , object_id
-            )
+        update strload_task_objects dst
+        set
+            task_id = tasks.task_id
+        from
+            strload_tasks tasks
+            inner join strload_tables tables using (table_id)
+            inner join (
+              select
+                  object_id
+                  , data_source_id
+                  , row_number() over (partition by data_source_id order by object_id) as object_seq
+              from
+                  strload_objects
+              where
+                  object_id in (select object_id from strload_task_objects where task_id = -1)
+            ) tsk_obj
+            using (data_source_id)
+        where
+            dst.task_id = -1
+            and tasks.task_id in (#{task_ids.join(",")})
+            and dst.object_id = tsk_obj.object_id
+            and tsk_obj.object_seq <= tables.load_batch_size
+        ;
+      EndSQL
+    end
+
+    def log_mapped_object_num(conn, task_ids)
+      # This method is needed because UPDATE does not support "returning" multiple values
+      rows = conn.query_values(<<-EndSQL)
        select
            task_id
-            , object_id
-        from (
-            select
-                row_number() over (partition by task.task_id order by obj.object_id) as object_count
-                , task.task_id
-                , obj.object_id
-                , load_batch_size
-            from
-                -- unassigned objects
-                (
-                  select
-                      data_source_id
-                      , uniq_objects.object_url
-                      , object_id
-                  from
-                      (
-                        select
-                            min(object_id) as object_id
-                            , object_url
-                        from
-                            strload_objects
-                        group by
-                            object_url
-                      ) uniq_objects
-                      inner join strload_objects using(object_id)
-                      left outer join strload_task_objects using(object_id)
-                  where
-                      task_id is null
-                ) obj
-
-                -- tasks without objects
-                inner join (
-                  select
-                      tbl.data_source_id
-                      , min(task_id) as task_id -- pick up oldest task
-                      , max(load_batch_size) as load_batch_size
-                  from
-                      strload_tasks
-                      inner join strload_tables tbl
-                      using (schema_name, table_name)
-                  where
-                      -- unassigned objects
-                      task_id not in (select distinct task_id from strload_task_objects)
-                  group by
-                      1
-                ) task
-                using (data_source_id)
-            ) as t
+            , count(*)
+        from
+            strload_task_objects
        where
-            object_count <= load_batch_size -- limit number of objects assigned to single task
+            task_id in (#{task_ids.join(',')})
+        group by
+            task_id
        ;
      EndSQL
+      rows.each_slice(2) do |task_id, object_count|
+        @logger.info "Number of objects assigned to task: task_id=#{task_id} object_count=#{object_count}"
+      end
    end
 
    def suppress_sql_logging
@@ -331,6 +344,22 @@ module Bricolage
      end
    end
 
+    def log_created_tasks(task_ids)
+      created_task_num = task_ids.size
+      @logger.info "Number of tasks created: #{created_task_num}"
+      @logger.info "Created task ids: #{task_ids}" if created_task_num > 0
+    end
+
+    def warn_slow_task_generation(&block)
+      start_time = Time.now
+      yield
+      exec_time = (Time.now - start_time)
+      if exec_time > @task_generation_time_limit
+        @logger.warn "Long task generation time: #{exec_time}"
+        @task_generation_time_limit = @task_generation_time_limit * 1.1
+      end
+    end
+
  end
 
 end
lib/bricolage/streamingload/task.rb CHANGED
@@ -35,7 +35,7 @@ module Bricolage
     def LoadTask.parse_sqs_record(msg, rec)
       {
         task_id: rec['taskId'],
-        force: rec['force'],
+        force: (rec['force'].to_s == 'true')
       }
     end
 
@@ -49,7 +49,7 @@ module Bricolage
         from
           strload_tasks tsk
           inner join strload_tables tbl
-          using(schema_name, table_name)
+          using(table_id)
         where
           task_id = #{task_id}
         ;
@@ -94,7 +94,11 @@ module Bricolage
       @disabled = disabled
     end
 
-    attr_reader :id, :force
+    attr_reader :id
+
+    def force?
+      !!@force
+    end
 
     #
     # For writer only
lib/bricolage/streamingload/version.rb CHANGED
@@ -1,5 +1,5 @@
 module Bricolage
   module StreamingLoad
-    VERSION = '0.5.1'
+    VERSION = '0.6.0'
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.6.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-08-24 00:00:00.000000000 Z
+date: 2016-09-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage
@@ -17,42 +17,42 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 5.16.8
+        version: 5.16.9
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 5.16.8
+        version: 5.16.9
 - !ruby/object:Gem::Dependency
   name: pg
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - '='
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 0.18.4
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - '='
      - !ruby/object:Gem::Version
-        version: '0'
+        version: 0.18.4
 - !ruby/object:Gem::Dependency
   name: aws-sdk
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - '='
       - !ruby/object:Gem::Version
-        version: '2'
+        version: 2.5.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - '='
      - !ruby/object:Gem::Version
-        version: '2'
+        version: 2.5.6
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -138,7 +138,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 2.0.0
+      version: 2.1.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="