bricolage-streamingload 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bricolage/streamingload/alertinglogger.rb +23 -5
- data/lib/bricolage/streamingload/dispatcher.rb +25 -9
- data/lib/bricolage/streamingload/loader.rb +53 -40
- data/lib/bricolage/streamingload/loaderparams.rb +3 -2
- data/lib/bricolage/streamingload/loaderservice.rb +53 -28
- data/lib/bricolage/streamingload/manifest.rb +9 -2
- data/lib/bricolage/streamingload/objectbuffer.rb +159 -130
- data/lib/bricolage/streamingload/task.rb +7 -3
- data/lib/bricolage/streamingload/version.rb +1 -1
- metadata +13 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: 17aa54eda4d063cb571f3a7671a4e6413ea079e1
+  data.tar.gz: 1e879a10e505c01a9f66393a079e18de997a3478
 SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: 5f5778b2ecff8e2bf8d06e1ab00e8e1ee394772a24e4812b74b921c8303566638a212632c896b9b46c7ed5918357db89c65116595664628dba1c2e3cfbef3375
+  data.tar.gz: ebb53db3b87334f5c725e28665f9716f2246d1aef4389d635627e31b07c356e0f9a9edea451162ca39505da82dba22f5fd710d8bbdcb24f8e0fd583c2c734955

data/lib/bricolage/streamingload/alertinglogger.rb
CHANGED
@@ -1,19 +1,37 @@
+require 'bricolage/logger'
+require 'logger'
+require 'forwardable'
+
 module Bricolage
   module StreamingLoad
     class AlertingLogger
       extend Forwardable

-      def initialize(logger…
+      def initialize(logger:, sns_datasource:, alert_level: 'warn')
         @logger = logger
-        @…
-        @…
+        @alerter = Bricolage::Logger.new(device: sns_datasource)
+        @alerter.level = ::Logger.const_get(alert_level.upcase)
       end

       def_delegators '@logger', :level, :level=, :debug?, :info?, :warn?, :error?, :fatal?, :unknown?

-      %w…
+      %w[log debug info warn error fatal unknown].each do |m|
         define_method(m) do |*args|
-          …
+          @logger.__send__(m, *args)
+          begin
+            @alerter.__send__(m, *args)
+          rescue Exception => err
+            @logger.error "could not send alert: #{err.message}"
+          end
+        end
+      end
+
+      def exception(ex)
+        @logger.exception(ex)
+        begin
+          @alerter.error(ex.message)
+        rescue Exception => err
+          @logger.error "could not send alert: #{err.message}"
         end
       end
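
The new AlertingLogger mirrors every log call to an SNS-backed alerter while keeping the local logger authoritative: if alert delivery fails, the failure is downgraded to a local error instead of propagating into the caller. The same fan-out-with-fallback shape can be exercised without Bricolage at all; in this minimal sketch, FlakyAlerter is a hypothetical stand-in for the SNS data source, not part of the gem:

    require 'logger'

    # Hypothetical stand-in for the SNS-backed alerter; it always fails to deliver.
    class FlakyAlerter
      def error(msg)
        raise IOError, 'SNS endpoint unreachable'
      end
    end

    logger  = Logger.new($stdout)
    alerter = FlakyAlerter.new

    begin
      alerter.error('load failed')   # alert delivery blows up...
    rescue Exception => err
      logger.error "could not send alert: #{err.message}"   # ...but local logging survives
    end
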

data/lib/bricolage/streamingload/dispatcher.rb
CHANGED
@@ -29,16 +29,16 @@ module Bricolage
       config = YAML.load(File.read(config_path))
       logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
       ctx = Context.for_application('.', environment: opts.environment, logger: logger)
-      event_queue = ctx.get_data_source('sqs', config.fetch('event-queue-ds'))
-      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))
+      event_queue = ctx.get_data_source('sqs', config.fetch('event-queue-ds', 'sqs_event'))
+      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds', 'sqs_task'))
       alert_logger = AlertingLogger.new(
         logger: ctx.logger,
-        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds')),
+        sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds', 'sns')),
         alert_level: config.fetch('alert-level', 'warn')
       )

       object_buffer = ObjectBuffer.new(
-        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
+        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_data')),
         logger: alert_logger
       )

@@ -85,12 +85,14 @@ module Bricolage
       @dispatch_interval = dispatch_interval
       @dispatch_message_id = nil
       @logger = logger
+      @dispatch_requested = false
       @checkpoint_requested = false
     end

     attr_reader :logger

     def event_loop
+      logger.info "dispatcher started"
       set_dispatch_timer
       @event_queue.handle_messages(handler: self, message_class: Event)
       @event_queue.process_async_delete_force
@@ -99,9 +101,17 @@ module Bricolage

     # override
     def after_message_batch
+      # must be processed first
       @event_queue.process_async_delete
+
+      if @dispatch_requested
+        dispatch_tasks
+        @dispatch_requested = false
+      end
+
       if @checkpoint_requested
         create_checkpoint
+        @checkpoint_requested = false # is needless, but reset it just in case
       end
     end

@@ -139,13 +149,19 @@ module Bricolage
     end

     def handle_dispatch(e)
+      logger.info "dispatching tasks requested"
+      # Dispatching tasks may takes 10 minutes or more, it can exceeds visibility timeout.
+      # To avoid this, delay dispatching until all events of current message batch are processed.
       if @dispatch_message_id == e.message_id
-        tasks = @object_buffer.flush_tasks
-        send_tasks tasks
-        set_dispatch_timer
+        @dispatch_requested = true
       end
-      …
+      @event_queue.delete_message_async(e)
+    end
+
+    def dispatch_tasks
+      tasks = @object_buffer.flush_tasks
+      send_tasks tasks
+      set_dispatch_timer
     end

     def set_dispatch_timer
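
The refactor splits "notice that a dispatch is due" from "actually dispatch": the message handler only raises a flag, and after_message_batch performs the flush once the batch's SQS deletes have been issued, so a long dispatch can no longer run while its triggering message sits invisible but undeleted. The control flow, reduced to a standalone sketch (the queue and buffer objects are hypothetical stand-ins, not the gem's classes):

    # Minimal shape of the deferred-dispatch pattern (illustrative only).
    class Dispatcher
      def initialize(queue, buffer)
        @queue = queue
        @buffer = buffer
        @dispatch_requested = false
      end

      def handle_dispatch(event)
        @dispatch_requested = true          # cheap: just remember the request
        @queue.delete_message_async(event)
      end

      def after_message_batch
        @queue.process_async_delete         # flush pending deletes first
        if @dispatch_requested              # then do the potentially slow work
          @buffer.flush_tasks
          @dispatch_requested = false
        end
      end
    end
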

data/lib/bricolage/streamingload/loader.rb
CHANGED
@@ -51,7 +51,7 @@ module Bricolage
         strload_tasks
       where
         task_id = #{@params.task_id}
-        and task_id not in (select task_id from strload_jobs)
+        and (#{@params.force?} or task_id not in (select task_id from strload_jobs))
       returning job_id
       ;
     EndSQL
@@ -60,26 +60,20 @@ module Bricolage
     end

     def do_load
-      ManifestFile.create(…
-      …
-      if @params.enable_work_table?
-        prepare_work_table @params.work_table
+      manifest = ManifestFile.create(@params.ctl_bucket, job_id: @job_id, object_urls: @params.object_urls, logger: @logger)
+      if @params.enable_work_table?
+        @connection.transaction {|txn|
+          # NOTE: This transaction ends with truncation, this DELETE does nothing
+          # from the second time. So don't worry about DELETE cost here.
+          @connection.execute("delete from #{@params.work_table}")
           load_objects @params.work_table, manifest, @params.load_options_string
-      @…
-      …
-      commit_job_result
-      }
-      end
-      }
+          commit_work_table txn, @params
+        }
+        commit_job_result
+      else
+        load_objects @params.dest_table, manifest, @params.load_options_string
+        commit_job_result
+      end
     rescue JobFailure => ex
       write_job_error 'failure', ex.message
       raise
@@ -88,10 +82,6 @@ module Bricolage
       raise
     end

-    def prepare_work_table(work_table)
-      @connection.execute("truncate #{work_table}")
-    end
-
     def load_objects(dest_table, manifest, options)
       @connection.execute(<<-EndSQL.strip.gsub(/\s+/, ' '))
         copy #{dest_table}
@@ -106,14 +96,37 @@ module Bricolage
       @logger.info "load succeeded: #{manifest.url}"
     end

-    def commit_work_table(params)
+    def commit_work_table(txn, params)
       @connection.execute(params.sql_source)
-      …
+      txn.truncate_and_commit(params.work_table)
     end

     def commit_job_result
       @end_time = Time.now
-      …
+      @ctl_ds.open {|conn|
+        conn.transaction {
+          write_job_result conn, 'success', ''
+          update_loaded_flag conn
+        }
+      }
+    end
+
+    def update_loaded_flag(connection)
+      connection.execute(<<-EndSQL)
+        update
+            strload_objects
+        set
+            loaded = true
+        where
+            object_id in (
+              select
+                  object_id
+              from
+                  strload_task_objects
+              where task_id = (select task_id from strload_jobs where job_id = #{@job_id})
+            )
+        ;
+      EndSQL
     end

     MAX_MESSAGE_LENGTH = 1000
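
commit_job_result now records job success and flips the loaded flag on the task's objects inside a single control-DB transaction, so a crash between the two writes can no longer leave objects unflagged for a job already marked successful. A reduced sketch of that commit shape; conn here is a hypothetical stand-in exposing transaction/execute like the Bricolage connection:

    # Illustrative only: both writes commit together or not at all.
    def commit_job_result(conn, job_id)
      conn.transaction {
        conn.execute("update strload_jobs set status = 'success' where job_id = #{job_id}")
        conn.execute(<<-EndSQL)
          update strload_objects set loaded = true
          where object_id in (
            select object_id from strload_task_objects
            where task_id = (select task_id from strload_jobs where job_id = #{job_id})
          )
        EndSQL
      }
    end
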
@@ -121,23 +134,23 @@ module Bricolage
     def write_job_error(status, message)
       @end_time = Time.now
       @logger.warn message.lines.first
-      write_job_result status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
-    end
-
-    def write_job_result(status, message)
       @ctl_ds.open {|conn|
-        conn.execute(<<-EndSQL)
-          update
-              strload_jobs
-          set
-              (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
-          where
-              job_id = #{@job_id}
-          ;
-        EndSQL
+        write_job_result conn, status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
       }
     end

+    def write_job_result(connection, status, message)
+      connection.execute(<<-EndSQL)
+        update
+            strload_jobs
+        set
+            (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
+        where
+            job_id = #{@job_id}
+        ;
+      EndSQL
+    end
+
   end

 end

data/lib/bricolage/streamingload/loaderservice.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'bricolage/context'
 require 'bricolage/sqsdatasource'
 require 'bricolage/streamingload/task'
 require 'bricolage/streamingload/loader'
@@ -5,6 +6,7 @@ require 'bricolage/streamingload/alertinglogger'
 require 'bricolage/logger'
 require 'bricolage/exception'
 require 'bricolage/version'
+require 'yaml'
 require 'optparse'

 module Bricolage
@@ -23,21 +25,25 @@ module Bricolage
       config_path, * = opts.rest_arguments
       config = YAML.load(File.read(config_path))
       logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
-      ctx = Context.for_application(…
-      redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds'))
-      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))
-      …
+      ctx = Context.for_application(opts.working_dir, environment: opts.environment, logger: logger)
+      redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds', 'db_data'))
+      task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds', 'sqs_task'))
+      raw_logger = logger = ctx.logger
+      if config.key?('alert-level')
+        logger = AlertingLogger.new(
+          logger: raw_logger,
+          sns_datasource: ctx.get_data_source('sns', config.fetch('sns-ds', 'sns')),
+          alert_level: config.fetch('alert-level', 'warn')
+        )
+      end

       service = new(
         context: ctx,
-        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
+        control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds', 'db_ctl')),
         data_source: redshift_ds,
         task_queue: task_queue,
-        …
+        working_dir: opts.working_dir,
+        logger: logger
       )

       if opts.task_id
@@ -46,12 +52,18 @@ module Bricolage
       else
         # Server mode
         Process.daemon(true) if opts.daemon?
+        Dir.chdir '/'
         create_pid_file opts.pid_file_path if opts.pid_file_path
-        …
+        begin
+          logger.info "*** bricolage-streaming-loader started: pid=#{$$}"
+          service.event_loop
+          logger.info "*** bricolage-streaming-loader shutdown gracefully: pid=#{$$}"
+        rescue Exception => ex
+          logger.exception(ex)
+          logger.error "*** bricolage-streaming-loader abort: pid=#{$$}"
+          raise
+        end
       end
-    rescue Exception => e
-      alert_logger.error e.message
-      raise
     end

     def LoaderService.new_logger(path, config)
@@ -70,11 +82,12 @@ module Bricolage
       # ignore
     end

-    def initialize(context:, control_data_source:, data_source:, task_queue:, logger:)
+    def initialize(context:, control_data_source:, data_source:, task_queue:, working_dir:, logger:)
       @ctx = context
       @ctl_ds = control_data_source
       @ds = data_source
       @task_queue = task_queue
+      @working_dir = working_dir
       @logger = logger
     end

@@ -82,7 +95,6 @@ module Bricolage

     def event_loop
       @task_queue.handle_messages(handler: self, message_class: Task)
-      @logger.info "shutdown gracefully"
     end

     def execute_task_by_id(task_id)
@@ -95,19 +107,23 @@ module Bricolage

     # message handler
     def handle_streaming_load_v3(task)
-      …
+      Dir.chdir(@working_dir) {
+        loadtask = load_task(task.id, force: task.force?)
+        if loadtask.disabled
+          # Skip if disabled, and don't delete SQS message.
+          @logger.info "skip disabled task: task_id=#{task.id}"
+          return
+        end
+        execute_task(loadtask)
+        # Do not use async delete
+        @task_queue.delete_message(task)
+      }
+    rescue => ex
+      @logger.exception ex
     end

     def execute_task(task)
-      @logger.info "…
+      @logger.info "execute task: task_id=#{task.id} table=#{task.qualified_name}"
       loader = Loader.load_from_file(@ctx, @ctl_ds, task, logger: @logger)
       loader.execute
     end
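
Two deletion decisions in this handler carry the retry semantics: a disabled task returns without deleting its SQS message, so SQS redelivers it after the visibility timeout (and it can run once re-enabled), while a successful task is deleted synchronously so a crash cannot lose the acknowledgement. A reduced sketch of that decision table; the queue and task objects and the execute/log_exception helpers are hypothetical stand-ins:

    # Illustrative handler skeleton: which outcomes delete the SQS message.
    def handle_task(queue, task)
      if task.disabled
        return                      # no delete: SQS redelivers after the visibility timeout
      end
      execute(task)                 # raises on failure: no delete, so the task is retried
      queue.delete_message(task)    # synchronous delete only after a successful load
    rescue => ex
      log_exception(ex)             # swallow; the next delivery retries the task
    end
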
@@ -119,16 +135,18 @@ module Bricolage
     def initialize(argv)
       @argv = argv
       @task_id = nil
+      @environment = Context::DEFAULT_ENV
       @daemon = false
       @log_file_path = nil
       @pid_file_path = nil
+      @working_dir = Dir.getwd
       @rest_arguments = nil

       @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
       opts.on('--task-id=ID', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
         @task_id = task_id
       }
-      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{…}]") {|env|
+      opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{@environment}]") {|env|
         @environment = env
       }
       opts.on('--daemon', 'Becomes daemon in server mode.') {
@@ -140,6 +158,9 @@ module Bricolage
       opts.on('--pid-file=PATH', 'Creates PID file.') {|path|
         @pid_file_path = path
       }
+      opts.on('--working-dir=PATH', "Loader working directory. [default: #{@working_dir}]") {|path|
+        @working_dir = path
+      }
       opts.on('--help', 'Prints this message and quit.') {
         puts opts.help
         exit 0
@@ -161,14 +182,18 @@ module Bricolage
       raise OptionError, err.message
     end

-    attr_reader :rest_arguments
+    attr_reader :rest_arguments
+
     attr_reader :task_id
+    attr_reader :environment

     def daemon?
       @daemon
     end

+    attr_reader :log_file_path
     attr_reader :pid_file_path
+    attr_reader :working_dir

   end


data/lib/bricolage/streamingload/manifest.rb
CHANGED
@@ -6,7 +6,12 @@ module Bricolage

     def ManifestFile.create(ds, job_id:, object_urls:, logger:, noop: false, &block)
       manifest = new(ds, job_id, object_urls, logger: logger, noop: noop)
-      …
+      if block
+        manifest.create_temporary(&block)
+      else
+        manifest.put
+        return manifest
+      end
     end

     def initialize(ds, job_id, object_urls, logger:, noop: false)
@@ -22,7 +27,9 @@ module Bricolage
     end

     def name
-      @name
+      return @name if @name
+      now = Time.now
+      "#{now.strftime('%Y/%m/%d')}/manifest-#{now.strftime('%H%M%S')}-#{@job_id}.json"
     end

     def url
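
The manifest key is now derived from the current time and the job id, so manifests shard under date prefixes instead of relying on a pre-assigned @name. For example (the job id is a made-up value):

    require 'time'

    job_id = 1234   # hypothetical job id
    now = Time.parse('2016-09-05 14:30:59')
    name = "#{now.strftime('%Y/%m/%d')}/manifest-#{now.strftime('%H%M%S')}-#{job_id}.json"
    # => "2016/09/05/manifest-143059-1234.json"
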

data/lib/bricolage/streamingload/objectbuffer.rb
CHANGED
@@ -42,26 +42,43 @@ module Bricolage

   class ObjectBuffer

+    TASK_GENERATION_TIME_LIMIT = 30 #sec
+
     include SQLUtils

     def initialize(control_data_source:, logger:)
       @ctl_ds = control_data_source
       @logger = logger
+      @task_generation_time_limit = TASK_GENERATION_TIME_LIMIT
     end

     def put(obj)
       @ctl_ds.open {|conn|
-        …
+        suppress_sql_logging {
+          conn.transaction {
+            object_id = insert_object(conn, obj)
+            if object_id
+              insert_task_objects(conn, object_id)
+            else
+              insert_dup_object(conn, obj)
+            end
+          }
+        }
       }
     end

     # Flushes multiple tables periodically
     def flush_tasks
-      task_ids = …
-      …
+      task_ids = []
+      warn_slow_task_generation {
+        @ctl_ds.open {|conn|
+          conn.transaction {|txn|
+            task_ids = insert_tasks(conn)
+            unless task_ids.empty?
+              update_task_object_mappings(conn, task_ids)
+              log_mapped_object_num(conn, task_ids)
+            end
+          }
+        }
+      }
       return task_ids.map {|id| LoadTask.create(task_id: id) }
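
put now runs entirely inside one control-DB transaction, and the dedup decision hangs on insert_object (defined further down): its INSERT uses "on conflict ... do nothing ... returning object_id", so PostgreSQL hands back a row only when the URL was new. First-seen objects are staged into strload_task_objects; duplicates go to strload_dup_objects. The branch in isolation, with a hypothetical stand-in connection:

    # Hypothetical stand-in for the control-DB connection: one id back for a new
    # URL ("returning object_id"), nil for a duplicate ("on conflict do nothing").
    class FakeConn
      def initialize
        @seen = {}
        @next_id = 0
      end

      def insert_returning_id(url)
        return nil if @seen[url]
        @seen[url] = true
        @next_id += 1
      end
    end

    conn = FakeConn.new
    ['s3://bucket/a.gz', 's3://bucket/a.gz'].each do |url|
      if (object_id = conn.insert_returning_id(url))
        puts "staged object #{object_id}"   # first sighting: strload_task_objects
      else
        puts "duplicate: #{url}"            # second sighting: strload_dup_objects
      end
    end
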
@@ -73,11 +90,12 @@ module Bricolage
       task_ids = []
       @ctl_ds.open {|conn|
         conn.transaction {|txn|
-          # …
+          # update_task_object_mappings may not consume all saved objects
           # (e.g. there are too many objects for one table), we must create
           # tasks repeatedly until there are no unassigned objects.
           until (ids = insert_tasks_force(conn)).empty?
-            …
+            update_task_object_mappings(conn, ids)
+            log_mapped_object_num(conn, ids)
             task_ids.concat ids
           end
         }
@@ -91,11 +109,12 @@ module Bricolage
       task_ids = []
       @ctl_ds.open {|conn|
         conn.transaction {|txn|
-          # …
+          # update_task_object_mappings may not consume all saved objects
           # (e.g. there are too many objects for one table), we must create
           # tasks repeatedly until there are no unassigned objects.
           until (ids = insert_table_task_force(conn, table_name)).empty?
-            …
+            update_task_object_mappings(conn, ids)
+            log_mapped_object_num(conn, ids)
             task_ids.concat ids
           end
         }
@@ -106,30 +125,66 @@ module Bricolage
     private

     def insert_object(conn, obj)
-      …
+      object_ids = conn.query_values(<<-EndSQL)
+        insert into strload_objects
+            ( object_url
+            , object_size
+            , data_source_id
+            , message_id
+            , event_time
+            , submit_time
+            )
+        values
+            ( #{s obj.url}
+            , #{obj.size}
+            , #{s obj.data_source_id}
+            , #{s obj.message_id}
+            , '#{obj.event_time}' AT TIME ZONE 'JST'
+            , current_timestamp
+            )
+        on conflict on constraint strload_objects_object_url
+        do nothing
+        returning object_id
+        ;
+      EndSQL
+      return object_ids.first
+    end
+
+    def insert_dup_object(conn, obj)
+      @logger.info "Duplicated object recieved: object_url=#{obj.url}"
+      conn.update(<<-EndSQL)
+        insert into strload_dup_objects
+            ( object_url
+            , object_size
+            , data_source_id
+            , message_id
+            , event_time
+            , submit_time
+            )
+        values
+            ( #{s obj.url}
+            , #{obj.size}
+            , #{s obj.data_source_id}
+            , #{s obj.message_id}
+            , '#{obj.event_time}' AT TIME ZONE 'JST'
+            , current_timestamp
+            )
+        ;
+      EndSQL
+    end
+
+    def insert_task_objects(conn, object_id)
+      conn.update(<<-EndSQL)
+        insert into strload_task_objects
+            ( task_id
+            , object_id
+            )
+        values
+            ( -1
+            , #{object_id}
+            )
+        ;
+      EndSQL
     end

     def insert_tasks_force(conn)
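
A newly accepted object is parked in strload_task_objects under the sentinel task_id = -1, and every later stage (task creation, mapping, the loaded-flag update in loader.rb) selects its work with task_id = -1, an indexed equality predicate, instead of the old left-outer-join-for-NULL scan. A sketch of the lifecycle as two statements; the ids 42 and 123 are made up, and conn stands in for the control-DB connection:

    # Illustrative only: how an object moves from "staged" to "assigned".
    conn.update("insert into strload_task_objects (task_id, object_id) values (-1, 42)")
    # ... later, flush_tasks creates task 123 and claims staged rows for it:
    conn.update(<<-EndSQL)
      update strload_task_objects
      set task_id = 123
      where task_id = -1 and object_id = 42
    EndSQL
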
@@ -140,14 +195,12 @@ module Bricolage
       task_ids = conn.query_values(<<-EndSQL)
         insert into strload_tasks
             ( task_class
-            , schema_name
-            , table_name
+            , table_id
             , submit_time
             )
         select
             'streaming_load_v3'
-            , tbl.schema_name
-            , tbl.table_name
+            , tbl.table_id
             , current_timestamp
         from
             strload_tables tbl
@@ -158,19 +211,9 @@ module Bricolage
             data_source_id
             , count(*) as object_count
           from
-            (
-              select
-                  min(object_id) as object_id
-                  , object_url
-              from
-                  strload_objects
-              group by
-                  object_url
-            ) uniq_objects
-            inner join strload_objects using (object_id)
-            left outer join strload_task_objects using (object_id)
+            strload_objects
           where
-            …
+            object_id in (select object_id from strload_task_objects where task_id = -1)
           group by
             data_source_id
         ) obj
@@ -179,28 +222,27 @@ module Bricolage
           -- preceeding task's submit time
           left outer join (
             select
-              schema_name
-              , table_name
+              table_id
               , max(submit_time) as latest_submit_time
             from
               strload_tasks
             group by
-              schema_name, table_name
+              table_id
           ) task
-          using (schema_name, table_name)
+          using (table_id)
           where
             not tbl.disabled -- not disabled
             and (
               #{force ? "true or" : ""} -- Creates tasks with no conditions if forced
               obj.object_count > tbl.load_batch_size -- batch_size exceeded?
-              or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
-              or latest_submit_time is null
+              or extract(epoch from current_timestamp - task.latest_submit_time) > tbl.load_interval -- load_interval exceeded?
+              or task.latest_submit_time is null -- no previous tasks?
             )
           returning task_id
           ;
         EndSQL

-        …
+        log_created_tasks task_ids
         task_ids
       end
@@ -208,14 +250,12 @@ module Bricolage
       task_ids = conn.query_values(<<-EndSQL)
         insert into strload_tasks
             ( task_class
-            , schema_name
-            , table_name
+            , table_id
             , submit_time
             )
         select
             'streaming_load_v3'
-            , tbl.schema_name
-            , tbl.table_name
+            , tbl.table_id
             , current_timestamp
         from
             strload_tables tbl
@@ -227,21 +267,9 @@ module Bricolage
             data_source_id
             , count(*) as object_count
           from
-            (
-              select
-                  min(object_id) as object_id
-                  , object_url
-              from
-                  strload_objects
-              where
-                  data_source_id = #{s table_name}
-              group by
-                  object_url
-            ) uniq_objects
-            inner join strload_objects using (object_id)
-            left outer join strload_task_objects using (object_id)
+            strload_objects
           where
-            …
+            object_id in (select object_id from strload_task_objects where task_id = -1)
           group by
             data_source_id
         ) obj
@@ -254,70 +282,55 @@ module Bricolage
       EndSQL

       # It must be 1
-      …
+      log_created_tasks(task_ids)
       task_ids
     end

-    def …
+    def update_task_object_mappings(conn, task_ids)
       conn.update(<<-EndSQL)
-        …
+        update strload_task_objects dst
+        set
+            task_id = tasks.task_id
+        from
+            strload_tasks tasks
+            inner join strload_tables tables using (table_id)
+            inner join (
+              select
+                  object_id
+                  , data_source_id
+                  , row_number() over (partition by data_source_id order by object_id) as object_seq
+              from
+                  strload_objects
+              where
+                  object_id in (select object_id from strload_task_objects where task_id = -1)
+            ) tsk_obj
+            using (data_source_id)
+        where
+            dst.task_id = -1
+            and tasks.task_id in (#{task_ids.join(",")})
+            and dst.object_id = tsk_obj.object_id
+            and tsk_obj.object_seq <= tables.load_batch_size
+        ;
+      EndSQL
+    end
+
+    def log_mapped_object_num(conn, task_ids)
+      # This method is required since UPDATE does not "returning" multiple values
+      rows = conn.query_values(<<-EndSQL)
         select
             task_id
-            , …
+            , count(*)
         from
-            (
-              row_number() over (partition by task.task_id order by obj.object_id) as object_count
-              , task.task_id
-              , obj.object_id
-              , load_batch_size
-            from
-              -- unassigned objects
-              (
-                select
-                    data_source_id
-                    , uniq_objects.object_url
-                    , object_id
-                from
-                    (
-                      select
-                          min(object_id) as object_id
-                          , object_url
-                      from
-                          strload_objects
-                      group by
-                          object_url
-                    ) uniq_objects
-                    inner join strload_objects using(object_id)
-                    left outer join strload_task_objects using(object_id)
-                where
-                    task_id is null
-              ) obj
-
-              -- tasks without objects
-              inner join (
-                select
-                    tbl.data_source_id
-                    , min(task_id) as task_id -- pick up oldest task
-                    , max(load_batch_size) as load_batch_size
-                from
-                    strload_tasks
-                    inner join strload_tables tbl
-                      using (schema_name, table_name)
-                where
-                    -- unassigned objects
-                    task_id not in (select distinct task_id from strload_task_objects)
-                group by
-                    1
-              ) task
-              using (data_source_id)
-            ) as t
+            strload_task_objects
         where
-            …
+            task_id in (#{task_ids.join(',')})
+        group by
+            task_id
         ;
       EndSQL
+      rows.each_slice(2) do |task_id, object_count|
+        @logger.info "Number of objects assigned to task: task_id=#{task_id} object_count=#{object_count}"
+      end
     end

     def suppress_sql_logging
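
The new mapping UPDATE numbers each staged object within its data source (row_number() over (partition by data_source_id order by object_id)) and assigns only the first load_batch_size of them to the new task, which is exactly why the force-flush paths must loop until nothing remains staged. The capping rule restated in plain Ruby, over illustrative data rather than the gem's tables:

    # Hypothetical staged objects as [object_id, data_source_id] pairs.
    staged = [[1, 'a'], [2, 'a'], [3, 'a'], [4, 'b']]
    load_batch_size = 2

    assigned = staged.group_by {|_, ds| ds }.flat_map {|ds, objs|
      objs.sort_by {|id, _| id }.first(load_batch_size)   # object_seq <= load_batch_size
    }
    # => [[1, "a"], [2, "a"], [4, "b"]]  (object 3 stays staged for the next round)
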
@@ -331,6 +344,22 @@ module Bricolage
       end
     end

+    def log_created_tasks(task_ids)
+      created_task_num = task_ids.size
+      @logger.info "Number of task created: #{created_task_num}"
+      @logger.info "Created task ids: #{task_ids}" if created_task_num > 0
+    end
+
+    def warn_slow_task_generation(&block)
+      start_time = Time.now
+      yield
+      exec_time = (Time.now - start_time)
+      if exec_time > @task_generation_time_limit
+        @logger.warn "Long task generation time: #{exec_time}"
+        @task_generation_time_limit = @task_generation_time_limit * 1.1
+      end
+    end
+
   end

 end
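
warn_slow_task_generation times the block it wraps and, after each warning, raises its own threshold by 10%, so a persistently slow control DB produces occasional warnings instead of one per flush. The mechanism in isolation, as a runnable toy with sleep standing in for task generation and a deliberately tiny starting limit (the gem starts at 30 seconds):

    limit = 0.05  # seconds; toy value
    warn_slow = lambda do |&work|
      t0 = Time.now
      work.call
      elapsed = Time.now - t0
      if elapsed > limit
        puts "Long task generation time: #{elapsed}"
        limit *= 1.1            # back off: the next warning needs an even slower run
      end
    end

    warn_slow.call { sleep 0.06 }   # warns, and raises the limit to 0.055
    warn_slow.call { sleep 0.04 }   # below the raised limit: silent
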

data/lib/bricolage/streamingload/task.rb
CHANGED
@@ -35,7 +35,7 @@ module Bricolage
     def LoadTask.parse_sqs_record(msg, rec)
       {
         task_id: rec['taskId'],
-        force: rec['force']
+        force: (rec['force'].to_s == 'true')
       }
     end
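
The string comparison guards against the JSON field arriving as a boolean, a string, or not at all; under the old code a literal "false" string, being a non-empty (truthy) value, would have forced the load. A quick check in plain Ruby over hypothetical payload values:

    [true, 'true', false, 'false', nil].map {|v| v.to_s == 'true' }
    # => [true, true, false, false, false]
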
@@ -49,7 +49,7 @@ module Bricolage
       from
         strload_tasks tsk
         inner join strload_tables tbl
-          using(schema_name, table_name)
+          using(table_id)
       where
         task_id = #{task_id}
       ;
@@ -94,7 +94,11 @@ module Bricolage
       @disabled = disabled
     end

-    attr_reader :id
+    attr_reader :id
+
+    def force?
+      !!@force
+    end

     #
     # For writer only
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: bricolage-streamingload
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.6.0
 platform: ruby
 authors:
 - Minero Aoki
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-…
+date: 2016-09-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bricolage
@@ -17,42 +17,42 @@ dependencies:
     requirements:
     - - '='
     - !ruby/object:Gem::Version
-      version: 5.16.…
+      version: 5.16.9
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
     - !ruby/object:Gem::Version
-      version: 5.16.…
+      version: 5.16.9
 - !ruby/object:Gem::Dependency
   name: pg
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - …
+    - - '='
    - !ruby/object:Gem::Version
-      version: …
+      version: 0.18.4
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - …
+    - - '='
     - !ruby/object:Gem::Version
-      version: …
+      version: 0.18.4
 - !ruby/object:Gem::Dependency
   name: aws-sdk
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - …
+    - - '='
     - !ruby/object:Gem::Version
-      version: …
+      version: 2.5.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - …
+    - - '='
     - !ruby/object:Gem::Version
-      version: …
+      version: 2.5.6
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -138,7 +138,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
   - !ruby/object:Gem::Version
-    version: 2.…
+    version: 2.1.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="