bricolage-streamingload 0.5.1 → 0.6.0
- checksums.yaml +4 -4
- data/lib/bricolage/streamingload/alertinglogger.rb +23 -5
- data/lib/bricolage/streamingload/dispatcher.rb +25 -9
- data/lib/bricolage/streamingload/loader.rb +53 -40
- data/lib/bricolage/streamingload/loaderparams.rb +3 -2
- data/lib/bricolage/streamingload/loaderservice.rb +53 -28
- data/lib/bricolage/streamingload/manifest.rb +9 -2
- data/lib/bricolage/streamingload/objectbuffer.rb +159 -130
- data/lib/bricolage/streamingload/task.rb +7 -3
- data/lib/bricolage/streamingload/version.rb +1 -1
- metadata +13 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 17aa54eda4d063cb571f3a7671a4e6413ea079e1
+  data.tar.gz: 1e879a10e505c01a9f66393a079e18de997a3478
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5f5778b2ecff8e2bf8d06e1ab00e8e1ee394772a24e4812b74b921c8303566638a212632c896b9b46c7ed5918357db89c65116595664628dba1c2e3cfbef3375
+  data.tar.gz: ebb53db3b87334f5c725e28665f9716f2246d1aef4389d635627e31b07c356e0f9a9edea451162ca39505da82dba22f5fd710d8bbdcb24f8e0fd583c2c734955
data/lib/bricolage/streamingload/alertinglogger.rb
CHANGED
@@ -1,19 +1,37 @@
+require 'bricolage/logger'
+require 'logger'
+require 'forwardable'
+
 module Bricolage
   module StreamingLoad
     class AlertingLogger
      extend Forwardable

-      def initialize(logger
+      def initialize(logger:, sns_datasource:, alert_level: 'warn')
        @logger = logger
-        @
-        @
+        @alerter = Bricolage::Logger.new(device: sns_datasource)
+        @alerter.level = ::Logger.const_get(alert_level.upcase)
      end

      def_delegators '@logger', :level, :level=, :debug?, :info?, :warn?, :error?, :fatal?, :unknown?

-      %w
+      %w[log debug info warn error fatal unknown].each do |m|
        define_method(m) do |*args|
-
+          @logger.__send__(m, *args)
+          begin
+            @alerter.__send__(m, *args)
+          rescue Exception => err
+            @logger.error "could not send alert: #{err.message}"
+          end
+        end
+      end
+
+      def exception(ex)
+        @logger.exception(ex)
+        begin
+          @alerter.error(ex.message)
+        rescue Exception => err
+          @logger.error "could not send alert: #{err.message}"
        end
      end

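The rewritten AlertingLogger above fans every log call out to a second, SNS-backed logger while guaranteeing that a failed alert can never break the primary log path. A minimal standalone sketch of the same fan-out pattern, with two plain ::Logger instances standing in for the Bricolage logger and the SNS data source (FanOutLogger and its arguments are illustrative names, not part of the gem):

    require 'logger'
    require 'forwardable'

    # Fan-out logger: every call goes to the primary logger first, then
    # best-effort to the alerter, whose failures are swallowed and logged.
    class FanOutLogger
      extend Forwardable

      def initialize(primary, alerter)
        @logger = primary
        @alerter = alerter
      end

      def_delegators '@logger', :level, :level=

      %w[debug info warn error fatal unknown].each do |m|
        define_method(m) do |*args|
          @logger.__send__(m, *args)
          begin
            @alerter.__send__(m, *args)
          rescue Exception => err
            @logger.error "could not send alert: #{err.message}"
          end
        end
      end
    end

    log = FanOutLogger.new(Logger.new($stdout), Logger.new($stderr))
    log.warn "disk nearly full"   # delivered to both devices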
data/lib/bricolage/streamingload/dispatcher.rb
CHANGED
@@ -29,16 +29,16 @@ module Bricolage
      config = YAML.load(File.read(config_path))
      logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
      ctx = Context.for_application('.', environment: opts.environment, logger: logger)
-      event_queue = ctx.get_data_source('sqs', config.fetch('event-queue-ds'))
-      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))
+      event_queue = ctx.get_data_source('sqs', config.fetch('event-queue-ds', 'sqs_event'))
+      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds', 'sqs_task'))
      alert_logger = AlertingLogger.new(
        logger: ctx.logger,
-        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds')),
+        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds', 'sns')),
        alert_level: config.fetch('alert-level', 'warn')
      )

      object_buffer = ObjectBuffer.new(
-        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
+        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_data')),
        logger: alert_logger
      )

@@ -85,12 +85,14 @@ module Bricolage
      @dispatch_interval = dispatch_interval
      @dispatch_message_id = nil
      @logger = logger
+      @dispatch_requested = false
      @checkpoint_requested = false
    end

    attr_reader :logger

    def event_loop
+      logger.info "dispatcher started"
      set_dispatch_timer
      @event_queue.handle_messages(handler: self, message_class: Event)
      @event_queue.process_async_delete_force
@@ -99,9 +101,17 @@ module Bricolage

    # override
    def after_message_batch
+      # must be processed first
      @event_queue.process_async_delete
+
+      if @dispatch_requested
+        dispatch_tasks
+        @dispatch_requested = false
+      end
+
      if @checkpoint_requested
        create_checkpoint
+        @checkpoint_requested = false   # is needless, but reset it just in case
      end
    end

@@ -139,13 +149,19 @@ module Bricolage
    end

    def handle_dispatch(e)
+      logger.info "dispatching tasks requested"
+      # Dispatching tasks may takes 10 minutes or more, it can exceeds visibility timeout.
+      # To avoid this, delay dispatching until all events of current message batch are processed.
      if @dispatch_message_id == e.message_id
-        tasks = @object_buffer.flush_tasks
-        send_tasks tasks
-        set_dispatch_timer
+        @dispatch_requested = true
      end
-
-
+      @event_queue.delete_message_async(e)
+    end
+
+    def dispatch_tasks
+      tasks = @object_buffer.flush_tasks
+      send_tasks tasks
+      set_dispatch_timer
    end

    def set_dispatch_timer
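The new comments state the constraint driving this refactoring: flushing tasks can run longer than the SQS visibility timeout, so handle_dispatch now only records the request and acknowledges the message, while the actual flush runs in after_message_batch once the whole batch is processed. A stripped-down sketch of that handler/batch split, with the queue operations stubbed out (all names here are illustrative, not the gem's API):

    # Defer slow work out of the per-message handler so the message can be
    # acknowledged well inside the SQS visibility timeout.
    class BatchHandler
      def initialize
        @dispatch_requested = false
      end

      def handle_dispatch(message)
        @dispatch_requested = true     # cheap: just record the request
        delete_message_async(message)  # ack within the visibility window
      end

      def after_message_batch
        process_async_deletes          # acks must go out first
        if @dispatch_requested
          dispatch_tasks               # the slow part, now off the clock
          @dispatch_requested = false
        end
      end

      private

      def delete_message_async(msg); end   # stub: SQS DeleteMessage
      def process_async_deletes; end       # stub: flush pending deletes
      def dispatch_tasks; end              # stub: the long-running flush
    end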
data/lib/bricolage/streamingload/loader.rb
CHANGED
@@ -51,7 +51,7 @@ module Bricolage
          strload_tasks
        where
          task_id = #{@params.task_id}
-          and (task_id not in (select task_id from strload_jobs)
+          and (#{@params.force?} or task_id not in (select task_id from strload_jobs))
        returning job_id
        ;
      EndSQL
@@ -60,26 +60,20 @@ module Bricolage
    end

    def do_load
-      ManifestFile.create(
-
-
-
-
-
-      if @params.enable_work_table?
-        prepare_work_table @params.work_table
+      manifest = ManifestFile.create(@params.ctl_bucket, job_id: @job_id, object_urls: @params.object_urls, logger: @logger)
+      if @params.enable_work_table?
+        @connection.transaction {|txn|
+          # NOTE: This transaction ends with truncation, this DELETE does nothing
+          # from the second time. So don't worry about DELETE cost here.
+          @connection.execute("delete from #{@params.work_table}")
          load_objects @params.work_table, manifest, @params.load_options_string
-          @
-
-
-
-
-
-        commit_job_result
-        }
-      end
-      }
+          commit_work_table txn, @params
+        }
+        commit_job_result
+      else
+        load_objects @params.dest_table, manifest, @params.load_options_string
+        commit_job_result
+      end
    rescue JobFailure => ex
      write_job_error 'failure', ex.message
      raise
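Restructured this way, the work-table path runs the leftover DELETE, the COPY, the user's SQL, and the final truncation inside a single transaction, so a crash at any point leaves the work table ready for a clean retry. A condensed sketch of that staging pattern, assuming a Bricolage-style transaction object with truncate_and_commit as used above (copy_from_manifest is a hypothetical stand-in for load_objects):

    # Staging-table load: everything between BEGIN and the final truncate
    # commits atomically, so a failed load leaves no partial rows behind.
    def load_via_work_table(conn, work_table, dest_sql, manifest)
      conn.transaction {|txn|
        conn.execute("delete from #{work_table}")      # no-op after first run
        copy_from_manifest(conn, work_table, manifest) # hypothetical COPY step
        conn.execute(dest_sql)                         # e.g. insert into dest select ...
        txn.truncate_and_commit(work_table)            # empty the stage in-txn
      }
    end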
@@ -88,10 +82,6 @@ module Bricolage
      raise
    end

-    def prepare_work_table(work_table)
-      @connection.execute("truncate #{work_table}")
-    end
-
    def load_objects(dest_table, manifest, options)
      @connection.execute(<<-EndSQL.strip.gsub(/\s+/, ' '))
        copy #{dest_table}
@@ -106,14 +96,37 @@ module Bricolage
      @logger.info "load succeeded: #{manifest.url}"
    end

-    def commit_work_table(params)
+    def commit_work_table(txn, params)
      @connection.execute(params.sql_source)
-
+      txn.truncate_and_commit(params.work_table)
    end

    def commit_job_result
      @end_time = Time.now
-
+      @ctl_ds.open {|conn|
+        conn.transaction {
+          write_job_result conn, 'success', ''
+          update_loaded_flag conn
+        }
+      }
+    end
+
+    def update_loaded_flag(connection)
+      connection.execute(<<-EndSQL)
+        update
+            strload_objects
+        set
+            loaded = true
+        where
+            object_id in (
+              select
+                  object_id
+              from
+                  strload_task_objects
+              where task_id = (select task_id from strload_jobs where job_id = #{@job_id})
+            )
+        ;
+      EndSQL
    end

    MAX_MESSAGE_LENGTH = 1000
@@ -121,23 +134,23 @@ module Bricolage
    def write_job_error(status, message)
      @end_time = Time.now
      @logger.warn message.lines.first
-      write_job_result status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
-    end
-
-    def write_job_result(status, message)
      @ctl_ds.open {|conn|
-        conn.
-          update
-            strload_jobs
-          set
-            (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
-          where
-            job_id = #{@job_id}
-          ;
-        EndSQL
+        write_job_result conn, status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
      }
    end

+    def write_job_result(connection, status, message)
+      connection.execute(<<-EndSQL)
+        update
+            strload_jobs
+        set
+            (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
+        where
+            job_id = #{@job_id}
+        ;
+      EndSQL
+    end
+
  end

end
data/lib/bricolage/streamingload/loaderservice.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'bricolage/context'
 require 'bricolage/sqsdatasource'
 require 'bricolage/streamingload/task'
 require 'bricolage/streamingload/loader'
@@ -5,6 +6,7 @@ require 'bricolage/streamingload/alertinglogger'
 require 'bricolage/logger'
 require 'bricolage/exception'
 require 'bricolage/version'
+require 'yaml'
 require 'optparse'

 module Bricolage
@@ -23,21 +25,25 @@ module Bricolage
      config_path, * = opts.rest_arguments
      config = YAML.load(File.read(config_path))
      logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
-      ctx = Context.for_application(
-      redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds'))
-      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))
-
-
-
-
-
+      ctx = Context.for_application(opts.working_dir, environment: opts.environment, logger: logger)
+      redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds', 'db_data'))
+      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds', 'sqs_task'))
+      raw_logger = logger = ctx.logger
+      if config.key?('alert-level')
+        logger = AlertingLogger.new(
+          logger: raw_logger,
+          sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds', 'sns')),
+          alert_level: config.fetch('alert-level', 'warn')
+        )
+      end

      service = new(
        context: ctx,
-        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
+        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_ctl')),
        data_source: redshift_ds,
        task_queue: task_queue,
-
+        working_dir: opts.working_dir,
+        logger: logger
      )

      if opts.task_id
@@ -46,12 +52,18 @@ module Bricolage
      else
        # Server mode
        Process.daemon(true) if opts.daemon?
+        Dir.chdir '/'
        create_pid_file opts.pid_file_path if opts.pid_file_path
-
+        begin
+          logger.info "*** bricolage-streaming-loader started: pid=#{$$}"
+          service.event_loop
+          logger.info "*** bricolage-streaming-loader shutdown gracefully: pid=#{$$}"
+        rescue Exception => ex
+          logger.exception(ex)
+          logger.error "*** bricolage-streaming-loader abort: pid=#{$$}"
+          raise
+        end
      end
-    rescue Exception => e
-      alert_logger.error e.message
-      raise
    end

    def LoaderService.new_logger(path, config)
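A note on the daemon bootstrap above: Process.daemon(nochdir = nil, noclose = nil) normally chdirs the forked process to '/', and passing true suppresses that, which lets the service control when it leaves the launch directory; the working directory itself was already captured at option-parse time (@working_dir = Dir.getwd), so per-task code can chdir back into it later. In outline (run_task is a hypothetical stand-in):

    working_dir = Dir.getwd   # resolved while still in the launch directory
    Process.daemon(true)      # detach, but skip the implicit chdir to '/'
    Dir.chdir '/'             # release the launch directory explicitly
    # ... per task, later:
    Dir.chdir(working_dir) { run_task }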
@@ -70,11 +82,12 @@ module Bricolage
      # ignore
    end

-    def initialize(context:, control_data_source:, data_source:, task_queue:, logger:)
+    def initialize(context:, control_data_source:, data_source:, task_queue:, working_dir:, logger:)
      @ctx = context
      @ctl_ds = control_data_source
      @ds = data_source
      @task_queue = task_queue
+      @working_dir = working_dir
      @logger = logger
    end

@@ -82,7 +95,6 @@ module Bricolage

    def event_loop
      @task_queue.handle_messages(handler: self, message_class: Task)
-      @logger.info "shutdown gracefully"
    end

    def execute_task_by_id(task_id)
@@ -95,19 +107,23 @@ module Bricolage

    # message handler
    def handle_streaming_load_v3(task)
-
-
-
-
-
-
-
-
-
+      Dir.chdir(@working_dir) {
+        loadtask = load_task(task.id, force: task.force?)
+        if loadtask.disabled
+          # Skip if disabled, and don't delete SQS message.
+          @logger.info "skip disabled task: task_id=#{task.id}"
+          return
+        end
+        execute_task(loadtask)
+        # Do not use async delete
+        @task_queue.delete_message(task)
+      }
+    rescue => ex
+      @logger.exception ex
    end

    def execute_task(task)
-      @logger.info "
+      @logger.info "execute task: task_id=#{task.id} table=#{task.qualified_name}"
      loader = Loader.load_from_file(@ctx, @ctl_ds, task, logger: @logger)
      loader.execute
    end
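Two delivery details in this handler are worth spelling out: the SQS message is deleted synchronously and only after execute_task returns, so a crashed load leaves the message to reappear once the visibility timeout expires; and a disabled task returns without deleting, parking the message until the table is re-enabled. This is the classic at-least-once pattern (process and log_error below are illustrative stubs):

    # At-least-once processing: ack (delete) only after the work succeeded.
    def handle(message, queue)
      process(message)               # raises on failure
      queue.delete_message(message)  # reached only on success
    rescue => ex
      log_error(ex)                  # no delete: the message becomes visible
    end                              # again after the visibility timeout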
@@ -119,16 +135,18 @@ module Bricolage
    def initialize(argv)
      @argv = argv
      @task_id = nil
+      @environment = Context::DEFAULT_ENV
      @daemon = false
      @log_file_path = nil
      @pid_file_path = nil
+      @working_dir = Dir.getwd
      @rest_arguments = nil

      @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
      opts.on('--task-id=ID', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
        @task_id = task_id
      }
-      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{
+      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{@environment}]") {|env|
        @environment = env
      }
      opts.on('--daemon', 'Becomes daemon in server mode.') {
@@ -140,6 +158,9 @@ module Bricolage
      opts.on('--pid-file=PATH', 'Creates PID file.') {|path|
        @pid_file_path = path
      }
+      opts.on('--working-dir=PATH', "Loader working directory. [default: #{@working_dir}]") {|path|
+        @working_dir = path
+      }
      opts.on('--help', 'Prints this message and quit.') {
        puts opts.help
        exit 0
@@ -161,14 +182,18 @@ module Bricolage
        raise OptionError, err.message
      end

-      attr_reader :rest_arguments
+    attr_reader :rest_arguments
+
    attr_reader :task_id
+    attr_reader :environment

    def daemon?
      @daemon
    end

+    attr_reader :log_file_path
    attr_reader :pid_file_path
+    attr_reader :working_dir

  end

data/lib/bricolage/streamingload/manifest.rb
CHANGED
@@ -6,7 +6,12 @@ module Bricolage

    def ManifestFile.create(ds, job_id:, object_urls:, logger:, noop: false, &block)
      manifest = new(ds, job_id, object_urls, logger: logger, noop: noop)
-
+      if block
+        manifest.create_temporary(&block)
+      else
+        manifest.put
+        return manifest
+      end
    end

    def initialize(ds, job_id, object_urls, logger:, noop: false)
@@ -22,7 +27,9 @@ module Bricolage
    end

    def name
-      @name
+      return @name if @name
+      now = Time.now
+      "#{now.strftime('%Y/%m/%d')}/manifest-#{now.strftime('%H%M%S')}-#{@job_id}.json"
    end

    def url
data/lib/bricolage/streamingload/objectbuffer.rb
CHANGED
@@ -42,26 +42,43 @@ module Bricolage

  class ObjectBuffer

+    TASK_GENERATION_TIME_LIMIT = 30 #sec
+
    include SQLUtils

    def initialize(control_data_source:, logger:)
      @ctl_ds = control_data_source
      @logger = logger
+      @task_generation_time_limit = TASK_GENERATION_TIME_LIMIT
    end

    def put(obj)
      @ctl_ds.open {|conn|
-
+        suppress_sql_logging {
+          conn.transaction {
+            object_id = insert_object(conn, obj)
+            if object_id
+              insert_task_objects(conn, object_id)
+            else
+              insert_dup_object(conn, obj)
+            end
+          }
+        }
      }
    end

    # Flushes multiple tables periodically
    def flush_tasks
-      task_ids =
-
-
-
-
+      task_ids = []
+      warn_slow_task_generation {
+        @ctl_ds.open {|conn|
+          conn.transaction {|txn|
+            task_ids = insert_tasks(conn)
+            unless task_ids.empty?
+              update_task_object_mappings(conn, task_ids)
+              log_mapped_object_num(conn, task_ids)
+            end
+          }
        }
      }
      return task_ids.map {|id| LoadTask.create(task_id: id) }
@@ -73,11 +90,12 @@ module Bricolage
      task_ids = []
      @ctl_ds.open {|conn|
        conn.transaction {|txn|
-          #
+          # update_task_object_mappings may not consume all saved objects
          # (e.g. there are too many objects for one table), we must create
          # tasks repeatedly until there are no unassigned objects.
          until (ids = insert_tasks_force(conn)).empty?
-
+            update_task_object_mappings(conn, ids)
+            log_mapped_object_num(conn, ids)
            task_ids.concat ids
          end
        }
@@ -91,11 +109,12 @@ module Bricolage
      task_ids = []
      @ctl_ds.open {|conn|
        conn.transaction {|txn|
-          #
+          # update_task_object_mappings may not consume all saved objects
          # (e.g. there are too many objects for one table), we must create
          # tasks repeatedly until there are no unassigned objects.
          until (ids = insert_table_task_force(conn, table_name)).empty?
-
+            update_task_object_mappings(conn, ids)
+            log_mapped_object_num(conn, ids)
            task_ids.concat ids
          end
        }
@@ -106,30 +125,66 @@ module Bricolage
    private

    def insert_object(conn, obj)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+      object_ids = conn.query_values(<<-EndSQL)
+        insert into strload_objects
+            ( object_url
+            , object_size
+            , data_source_id
+            , message_id
+            , event_time
+            , submit_time
+            )
+        values
+            ( #{s obj.url}
+            , #{obj.size}
+            , #{s obj.data_source_id}
+            , #{s obj.message_id}
+            , '#{obj.event_time}' AT TIME ZONE 'JST'
+            , current_timestamp
+            )
+        on conflict on constraint strload_objects_object_url
+        do nothing
+        returning object_id
+        ;
+      EndSQL
+      return object_ids.first
+    end
+
+    def insert_dup_object(conn, obj)
+      @logger.info "Duplicated object recieved: object_url=#{obj.url}"
+      conn.update(<<-EndSQL)
+        insert into strload_dup_objects
+            ( object_url
+            , object_size
+            , data_source_id
+            , message_id
+            , event_time
+            , submit_time
+            )
+        values
+            ( #{s obj.url}
+            , #{obj.size}
+            , #{s obj.data_source_id}
+            , #{s obj.message_id}
+            , '#{obj.event_time}' AT TIME ZONE 'JST'
+            , current_timestamp
+            )
+        ;
+      EndSQL
+    end
+
+    def insert_task_objects(conn, object_id)
+      conn.update(<<-EndSQL)
+        insert into strload_task_objects
+            ( task_id
+            , object_id
+            )
+        values
+            ( -1
+            , #{object_id}
+            )
+        ;
+      EndSQL
    end

    def insert_tasks_force(conn)
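The insert above leans on PostgreSQL's ON CONFLICT ... DO NOTHING combined with RETURNING: on a duplicate object_url nothing is inserted and RETURNING yields no rows, so an empty result doubles as the duplicate signal, and a fresh object is staged under the sentinel task_id -1 until a flush assigns it to a real task. The branch in put() reduces to this sketch, with conn being the same control-DB connection as in the diff and record_duplicate a hypothetical stand-in for insert_dup_object (the S3 URL is a toy value):

    object_id = conn.query_values(<<-EndSQL).first
      insert into strload_objects (object_url, submit_time)
      values ('s3://bucket/2016/09/05/part-0001.gz', current_timestamp)
      on conflict on constraint strload_objects_object_url do nothing
      returning object_id
      ;
    EndSQL
    if object_id
      # brand-new URL: stage it as unassigned (sentinel task_id -1)
      conn.update("insert into strload_task_objects (task_id, object_id)" \
                  " values (-1, #{object_id})")
    else
      # duplicate URL: RETURNING produced no row, so object_id is nil
      record_duplicate(conn, obj)
    end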
@@ -140,14 +195,12 @@ module Bricolage
      task_ids = conn.query_values(<<-EndSQL)
        insert into strload_tasks
            ( task_class
-            ,
-            , table_name
+            , table_id
            , submit_time
            )
        select
            'streaming_load_v3'
-            , tbl.
-            , tbl.table_name
+            , tbl.table_id
            , current_timestamp
        from
            strload_tables tbl
@@ -158,19 +211,9 @@ module Bricolage
            data_source_id
            , count(*) as object_count
        from
-
-            select
-                min(object_id) as object_id
-                , object_url
-            from
-                strload_objects
-            group by
-                object_url
-            ) uniq_objects
-            inner join strload_objects using (object_id)
-            left outer join strload_task_objects using (object_id)
+            strload_objects
        where
-
+            object_id in (select object_id from strload_task_objects where task_id = -1)
        group by
            data_source_id
        ) obj
@@ -179,28 +222,27 @@ module Bricolage
        -- preceeding task's submit time
        left outer join (
          select
-
-              , table_name
+              table_id
              , max(submit_time) as latest_submit_time
          from
              strload_tasks
          group by
-
+              table_id
        ) task
-        using (
+        using (table_id)
        where
          not tbl.disabled -- not disabled
          and (
            #{force ? "true or" : ""} -- Creates tasks with no conditions if forced
            obj.object_count > tbl.load_batch_size -- batch_size exceeded?
-            or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
-            or latest_submit_time is null
+            or extract(epoch from current_timestamp - task.latest_submit_time) > tbl.load_interval -- load_interval exceeded?
+            or task.latest_submit_time is null -- no previous tasks?
          )
        returning task_id
        ;
      EndSQL

-
+      log_created_tasks task_ids
      task_ids
    end

@@ -208,14 +250,12 @@ module Bricolage
      task_ids = conn.query_values(<<-EndSQL)
        insert into strload_tasks
            ( task_class
-            ,
-            , table_name
+            , table_id
            , submit_time
            )
        select
            'streaming_load_v3'
-            , tbl.
-            , tbl.table_name
+            , tbl.table_id
            , current_timestamp
        from
            strload_tables tbl
@@ -227,21 +267,9 @@ module Bricolage
            data_source_id
            , count(*) as object_count
        from
-
-            select
-                min(object_id) as object_id
-                , object_url
-            from
-                strload_objects
-            where
-                data_source_id = #{s table_name}
-            group by
-                object_url
-            ) uniq_objects
-            inner join strload_objects using (object_id)
-            left outer join strload_task_objects using (object_id)
+            strload_objects
        where
-
+            object_id in (select object_id from strload_task_objects where task_id = -1)
        group by
            data_source_id
        ) obj
@@ -254,70 +282,55 @@ module Bricolage
      EndSQL

      # It must be 1
-
+      log_created_tasks(task_ids)
      task_ids
    end

-    def
+    def update_task_object_mappings(conn, task_ids)
      conn.update(<<-EndSQL)
-
-
-
-
+        update strload_task_objects dst
+        set
+            task_id = tasks.task_id
+        from
+            strload_tasks tasks
+            inner join strload_tables tables using (table_id)
+            inner join (
+              select
+                  object_id
+                  , data_source_id
+                  , row_number() over (partition by data_source_id order by object_id) as object_seq
+              from
+                  strload_objects
+              where
+                  object_id in (select object_id from strload_task_objects where task_id = -1)
+            ) tsk_obj
+            using (data_source_id)
+        where
+            dst.task_id = -1
+            and tasks.task_id in (#{task_ids.join(",")})
+            and dst.object_id = tsk_obj.object_id
+            and tsk_obj.object_seq <= tables.load_batch_size
+        ;
+      EndSQL
+    end
+
+    def log_mapped_object_num(conn, task_ids)
+      # This method is required since UPDATE does not "returning" multiple values
+      rows = conn.query_values(<<-EndSQL)
        select
            task_id
-            ,
-        from
-
-            row_number() over (partition by task.task_id order by obj.object_id) as object_count
-            , task.task_id
-            , obj.object_id
-            , load_batch_size
-        from
-            -- unassigned objects
-            (
-              select
-                  data_source_id
-                  , uniq_objects.object_url
-                  , object_id
-              from
-                  (
-                    select
-                        min(object_id) as object_id
-                        , object_url
-                    from
-                        strload_objects
-                    group by
-                        object_url
-                  ) uniq_objects
-                  inner join strload_objects using(object_id)
-                  left outer join strload_task_objects using(object_id)
-              where
-                  task_id is null
-            ) obj
-
-            -- tasks without objects
-            inner join (
-              select
-                  tbl.data_source_id
-                  , min(task_id) as task_id -- pick up oldest task
-                  , max(load_batch_size) as load_batch_size
-              from
-                  strload_tasks
-                  inner join strload_tables tbl
-                      using (schema_name, table_name)
-              where
-                  -- unassigned objects
-                  task_id not in (select distinct task_id from strload_task_objects)
-              group by
-                  1
-            ) task
-            using (data_source_id)
-        ) as t
+            , count(*)
+        from
+            strload_task_objects
        where
-
+            task_id in (#{task_ids.join(',')})
+        group by
+            task_id
        ;
      EndSQL
+      rows.each_slice(2) do |task_id, object_count|
+        @logger.info "Number of objects assigned to task: task_id=#{task_id} object_count=#{object_count}"
+      end
    end

    def suppress_sql_logging
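The window function in update_task_object_mappings is what enforces load_batch_size: unassigned objects are numbered per data source in object_id order, and only rows with object_seq <= load_batch_size have their task_id flipped from the sentinel -1 to the new task. A toy run, assuming load_batch_size = 2 and objects 101..103 on ds1 plus 201 on ds2 (same query shape as the UPDATE's subselect):

    conn.query_values(<<-EndSQL)
      select
          object_id
          , row_number() over (partition by data_source_id order by object_id) as object_seq
      from
          strload_objects
      where
          object_id in (select object_id from strload_task_objects where task_id = -1)
      ;
    EndSQL
    # ds1: 101, 102, 103 -> object_seq 1, 2, 3
    # ds2: 201           -> object_seq 1
    # Only object_seq <= 2 are mapped; object 103 stays at task_id -1 and is
    # picked up by the next flush, which is why the force variants loop with
    # `until (ids = insert_tasks_force(conn)).empty?`.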
@@ -331,6 +344,22 @@ module Bricolage
      end
    end

+    def log_created_tasks(task_ids)
+      created_task_num = task_ids.size
+      @logger.info "Number of task created: #{created_task_num}"
+      @logger.info "Created task ids: #{task_ids}" if created_task_num > 0
+    end
+
+    def warn_slow_task_generation(&block)
+      start_time = Time.now
+      yield
+      exec_time = (Time.now - start_time)
+      if exec_time > @task_generation_time_limit
+        @logger.warn "Long task generation time: #{exec_time}"
+        @task_generation_time_limit = @task_generation_time_limit * 1.1
+      end
+    end
+
  end

end
data/lib/bricolage/streamingload/task.rb
CHANGED
@@ -35,7 +35,7 @@ module Bricolage
    def LoadTask.parse_sqs_record(msg, rec)
      {
        task_id: rec['taskId'],
-        force: rec['force']
+        force: (rec['force'].to_s == 'true')
      }
    end

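The parenthesized comparison is a defensive JSON coercion: depending on the producer, rec['force'] may arrive as the boolean true, the string "true", or be missing entirely, and .to_s == 'true' collapses all of those into a strict boolean:

    [true, 'true', 'false', false, nil].map {|v| v.to_s == 'true' }
    # => [true, true, false, false, false]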
@@ -49,7 +49,7 @@ module Bricolage
        from
            strload_tasks tsk
            inner join strload_tables tbl
-            using(
+            using(table_id)
        where
            task_id = #{task_id}
        ;
@@ -94,7 +94,11 @@ module Bricolage
      @disabled = disabled
    end

-    attr_reader :id
+    attr_reader :id
+
+    def force?
+      !!@force
+    end

    #
    # For writer only
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.6.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-09-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage
@@ -17,42 +17,42 @@ dependencies:
    requirements:
    - - '='
      - !ruby/object:Gem::Version
-        version: 5.16.
+        version: 5.16.9
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '='
      - !ruby/object:Gem::Version
-        version: 5.16.
+        version: 5.16.9
- !ruby/object:Gem::Dependency
  name: pg
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - '='
      - !ruby/object:Gem::Version
-        version:
+        version: 0.18.4
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - '='
      - !ruby/object:Gem::Version
-        version:
+        version: 0.18.4
- !ruby/object:Gem::Dependency
  name: aws-sdk
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - '='
      - !ruby/object:Gem::Version
-        version:
+        version: 2.5.6
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - '='
      - !ruby/object:Gem::Version
-        version:
+        version: 2.5.6
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
@@ -138,7 +138,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
-      version: 2.
+      version: 2.1.0
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="