bricolage-streamingload 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,139 @@
1
+ require 'bricolage/sqsdatasource'
2
+
3
+ module Bricolage
4
+
5
+ module StreamingLoad
6
+
7
+ class Event < SQSMessage
8
+
9
+ def Event.get_concrete_class(msg, rec)
10
+ case
11
+ when rec['eventName'] == 'shutdown' then ShutdownEvent
12
+ when rec['eventName'] == 'dispatch' then DispatchEvent
13
+ when rec['eventName'] == 'flush' then FlushEvent
14
+ when rec['eventSource'] == 'aws:s3'
15
+ S3ObjectEvent
16
+ else
17
+ raise "[FATAL] unknown SQS message record: eventSource=#{rec['eventSource']} event=#{rec['eventName']} message_id=#{msg.message_id}"
18
+ end
19
+ end
20
+
21
+ def message_type
22
+ raise "#{self.class}\#message_type must be implemented"
23
+ end
24
+
25
+ def data?
26
+ false
27
+ end
28
+
29
+ end
30
+
31
+
32
+ class ShutdownEvent < Event
33
+
34
+ def ShutdownEvent.create
35
+ super name: 'shutdown'
36
+ end
37
+
38
+ def ShutdownEvent.parse_sqs_record(msg, rec)
39
+ {}
40
+ end
41
+
42
+ alias message_type name
43
+
44
+ def init_message
45
+ end
46
+
47
+ end
48
+
49
+
50
+ class FlushEvent < Event
51
+
52
+ def FlushEvent.create(delay_seconds:, table_name:)
53
+ super name: 'flush', delay_seconds: delay_seconds, table_name: table_name
54
+ end
55
+
56
+ def FlushEvent.parse_sqs_record(msg, rec)
57
+ {
58
+ table_name: rec['tableName']
59
+ }
60
+ end
61
+
62
+ alias message_type name
63
+
64
+ def init_message(table_name:)
65
+ @table_name = table_name
66
+ end
67
+
68
+ attr_reader :table_name
69
+
70
+ def body
71
+ obj = super
72
+ obj['tableName'] = @table_name
73
+ obj
74
+ end
75
+
76
+ end
77
+
78
+ class DispatchEvent < Event
79
+
80
+ def DispatchEvent.create(delay_seconds:)
81
+ super name: 'dispatch', delay_seconds: delay_seconds
82
+ end
83
+
84
+ alias message_type name
85
+
86
+ def init_message(dummy)
87
+ end
88
+ end
89
+
90
+
91
+ class S3ObjectEvent < Event
92
+
93
+ def S3ObjectEvent.parse_sqs_record(msg, rec)
94
+ {
95
+ region: rec['awsRegion'],
96
+ bucket: rec['s3']['bucket']['name'],
97
+ key: rec['s3']['object']['key'],
98
+ size: rec['s3']['object']['size']
99
+ }
100
+ end
101
+
102
+ def message_type
103
+ 'data'
104
+ end
105
+
106
+ def init_message(region:, bucket:, key:, size:)
107
+ @region = region
108
+ @bucket = bucket
109
+ @key = key
110
+ @size = size
111
+ end
112
+
113
+ attr_reader :region
114
+ attr_reader :bucket
115
+ attr_reader :key
116
+ attr_reader :size
117
+
118
+ def url
119
+ "s3://#{@bucket}/#{@key}"
120
+ end
121
+
122
+ # override
123
+ def data?
124
+ true
125
+ end
126
+
127
+ def created?
128
+ !!(/\AObjectCreated:(?!Copy)/ =~ @name)
129
+ end
130
+
131
+ def loadable_object(url_patterns)
132
+ LoadableObject.new(self, url_patterns.match(url))
133
+ end
134
+
135
+ end
136
+
137
+ end
138
+
139
+ end
@@ -0,0 +1,144 @@
1
+ require 'bricolage/streamingload/loaderparams'
2
+ require 'bricolage/streamingload/manifest'
3
+ require 'bricolage/sqlutils'
4
+ require 'socket'
5
+ require 'json'
6
+
7
+ module Bricolage
8
+
9
+ module StreamingLoad
10
+
11
    class Loader

      include SQLUtils

      # Builds a Loader for +task+ by resolving its job parameters
      # (per-table .job file or defaults) through LoaderParams.
      def Loader.load_from_file(ctx, ctl_ds, task, logger:)
        params = LoaderParams.load(ctx, task)
        new(ctl_ds, params, logger: logger)
      end

      # ctl_ds - control DB data source holding the strload_* tables.
      # params - LoaderParams: target data source, table names, options.
      def initialize(ctl_ds, params, logger:)
        @ctl_ds = ctl_ds
        @params = params
        @logger = logger
        # Identifies this loader process in strload_jobs (host + PID).
        @process_id = "#{Socket.gethostname}-#{$$}"
      end

      # Claims the task and, if successfully claimed, runs the load on
      # the target data source.  Returns without doing anything when the
      # task was already claimed by another loader.
      def execute
        @job_id = assign_task
        return unless @job_id # task already executed by other loader
        @params.ds.open {|conn|
          @connection = conn
          do_load
        }
      end

      # Claims the task by inserting a 'running' row into strload_jobs.
      # The insert-select yields a row only when the task exists and
      # either was never run before or force is set, so concurrent
      # loaders cannot both claim the same task.  Returns the new
      # job_id, or nil when the task was not claimed.
      def assign_task
        @ctl_ds.open {|conn|
          job_id = conn.query_value(<<-EndSQL)
            insert into strload_jobs
                ( task_id
                , process_id
                , status
                , start_time
                )
            select
                task_id
                , #{s @process_id}
                , 'running'
                , current_timestamp
            from
                strload_tasks
            where
                task_id = #{@params.task_id}
                and (task_id not in (select task_id from strload_jobs) or #{@params.force})
            returning job_id
            ;
          EndSQL
          return job_id
        }
      end

      # Runs the COPY within a manifest-file scope.  With a work table:
      # truncate it, COPY into it, then run the merge SQL and record
      # success in one transaction.  Without: COPY straight into the
      # destination table and record success in one transaction.
      # Any failure is recorded in strload_jobs and re-raised.
      def do_load
        ManifestFile.create(
          @params.ctl_bucket,
          job_id: @job_id,
          object_urls: @params.object_urls,
          logger: @logger
        ) {|manifest|
          if @params.enable_work_table?
            prepare_work_table @params.work_table
            load_objects @params.work_table, manifest, @params.load_options_string
            @connection.transaction {
              commit_work_table @params
              commit_job_result
            }
          else
            @connection.transaction {
              load_objects @params.dest_table, manifest, @params.load_options_string
              commit_job_result
            }
          end
        }
      rescue JobFailure => ex
        write_job_error 'failure', ex.message
        raise
      rescue Exception => ex
        # Intentionally broad rescue: record the error in strload_jobs
        # before re-raising, so even fatal errors leave an audit trail.
        write_job_error 'error', ex.message
        raise
      end

      # Empties the work table before loading into it.
      def prepare_work_table(work_table)
        @connection.execute("truncate #{work_table}")
      end

      # Runs Redshift COPY from the manifest into +dest_table+.
      # statupdate/compupdate are disabled for speed; +options+ is the
      # pre-rendered load-options string from the job parameters.
      def load_objects(dest_table, manifest, options)
        @connection.execute(<<-EndSQL.strip.gsub(/\s+/, ' '))
            copy #{dest_table}
            from #{s manifest.url}
            credentials #{s manifest.credential_string}
            manifest
            statupdate false
            compupdate false
            #{options}
            ;
        EndSQL
        @logger.info "load succeeded: #{manifest.url}"
      end

      # Runs the user-supplied (or default) merge SQL from the work
      # table into the destination table.
      def commit_work_table(params)
        @connection.execute(params.sql_source)
        # keep work table records for later tracking
      end

      def commit_job_result
        @end_time = Time.now
        write_job_result 'success', ''
      end

      # strload_jobs.message column limit honored below.
      MAX_MESSAGE_LENGTH = 1000

      # Records a failed/errored job; only the first line of the
      # message is kept, truncated to the column limit.
      def write_job_error(status, message)
        @end_time = Time.now
        write_job_result status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
      end

      # Finalizes the strload_jobs row for this job with the given
      # status and message.
      def write_job_result(status, message)
        @ctl_ds.open {|conn|
          conn.execute(<<-EndSQL)
            update
                strload_jobs
            set
                (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
            where
                job_id = #{@job_id}
            ;
          EndSQL
        }
      end

    end
141
+
142
+ end
143
+
144
+ end
@@ -0,0 +1,153 @@
1
+ require 'bricolage/rubyjobclass'
2
+ require 'bricolage/psqldatasource'
3
+
4
+ module Bricolage
5
+
6
+ module StreamingLoad
7
+
8
+ class LoaderParams
9
+
10
+ def LoaderParams.load(ctx, task)
11
+ job = load_job(ctx, task)
12
+ schema = resolve_schema(ctx, task.schema)
13
+ job.provide_default 'dest-table', "#{schema}.#{task.table}"
14
+ #job.provide_sql_file_by_job_id # FIXME: provide only when exist
15
+ job.compile
16
+ new(task, job)
17
+ end
18
+
19
+ def LoaderParams.load_job(ctx, task)
20
+ if job_file = find_job_file(ctx, task.schema, task.table)
21
+ ctx.logger.debug "using .job file: #{job_file}"
22
+ Job.load_file(job_file, ctx.subsystem(task.schema))
23
+ else
24
+ ctx.logger.debug "using default job parameters (no .job file)"
25
+ Job.instantiate(task.table, 'streaming_load_v3', ctx).tap {|job|
26
+ job.bind_parameters({})
27
+ }
28
+ end
29
+ end
30
+
31
+ def LoaderParams.find_job_file(ctx, schema, table)
32
+ paths = Dir.glob("#{ctx.home_path}/#{schema}/#{table}.*")
33
+ paths.select {|path| File.extname(path) == '.job' }.sort.first
34
+ end
35
+
36
+ def LoaderParams.resolve_schema(ctx, schema)
37
+ ctx.global_variables["#{schema}_schema"] || schema
38
+ end
39
+ private_class_method :resolve_schema
40
+
41
+ def initialize(task, job)
42
+ @task = task
43
+ @job = job
44
+ @params = job.params
45
+ end
46
+
47
+ def task_id
48
+ @task.id
49
+ end
50
+
51
+ def task_id
52
+ @task.id
53
+ end
54
+
55
+ def schema
56
+ @task.schema
57
+ end
58
+
59
+ def table
60
+ @task.table
61
+ end
62
+
63
+ def force
64
+ @task.force
65
+ end
66
+
67
+ def object_urls
68
+ @task.object_urls
69
+ end
70
+
71
+ def ds
72
+ @params['redshift-ds']
73
+ end
74
+
75
+ def ctl_bucket
76
+ @params['ctl-ds']
77
+ end
78
+
79
+ def enable_work_table?
80
+ !!@params['work-table']
81
+ end
82
+
83
+ def work_table
84
+ @params['work-table']
85
+ end
86
+
87
+ def dest_table
88
+ @params['dest-table']
89
+ end
90
+
91
+ def load_options_string
92
+ @params['load-options'].to_s
93
+ end
94
+
95
+ def sql_source
96
+ sql = @params['sql-file']
97
+ sql ? sql.source : "insert into #{dest_table} select * from #{work_table};"
98
+ end
99
+
100
+ end
101
+
102
+
103
+ class LoaderJob < RubyJobClass
104
+
105
+ job_class_id 'streaming_load_v3'
106
+
107
+ def self.parameters(params)
108
+ params.add DestTableParam.new(optional: false)
109
+ params.add DestTableParam.new('work-table', optional: true)
110
+ params.add KeyValuePairsParam.new('load-options', 'OPTIONS', 'Loader options.',
111
+ optional: true, default: DEFAULT_LOAD_OPTIONS,
112
+ value_handler: lambda {|value, ctx, vars| PSQLLoadOptions.parse(value) })
113
+ params.add SQLFileParam.new('sql-file', 'PATH', 'SQL to insert rows from the work table to the target table.', optional: true)
114
+ params.add DataSourceParam.new('sql', 'redshift-ds', 'Target data source.')
115
+ params.add DataSourceParam.new('s3', 'ctl-ds', 'Manifest file data source.')
116
+ end
117
+
118
+ def self.default_load_options
119
+ end
120
+
121
+ # Use loosen options by default
122
+ default_options = [
123
+ ['json', 'auto'],
124
+ ['gzip', true],
125
+ ['timeformat', 'auto'],
126
+ ['dateformat', 'auto'],
127
+ ['acceptanydate', true],
128
+ ['acceptinvchars', ' '],
129
+ ['truncatecolumns', true],
130
+ ['trimblanks', true]
131
+ ]
132
+ opts = default_options.map {|name, value| PSQLLoadOptions::Option.new(name, value) }
133
+ DEFAULT_LOAD_OPTIONS = PSQLLoadOptions.new(opts)
134
+
135
+ def self.declarations(params)
136
+ Bricolage::Declarations.new(
137
+ 'dest_table' => nil,
138
+ )
139
+ end
140
+
141
+ def initialize(params)
142
+ @params = params
143
+ end
144
+
145
+ def bind(ctx, vars)
146
+ @params['sql-file'].bind(ctx, vars) if @params['sql-file']
147
+ end
148
+
149
+ end
150
+
151
+ end
152
+
153
+ end
@@ -0,0 +1,163 @@
1
+ require 'bricolage/sqsdatasource'
2
+ require 'bricolage/streamingload/task'
3
+ require 'bricolage/streamingload/loader'
4
+ require 'bricolage/logger'
5
+ require 'bricolage/exception'
6
+ require 'bricolage/version'
7
+ require 'optparse'
8
+
9
+ module Bricolage
10
+
11
+ module StreamingLoad
12
+
13
    class LoaderService

      # CLI entry point.  Reads the YAML config given as the sole rest
      # argument, builds the control/target/queue data sources, and runs
      # either a single task (--task-id) or the SQS polling loop
      # (optionally daemonized).
      def LoaderService.main
        opts = LoaderServiceOptions.new(ARGV)
        opts.parse
        unless opts.rest_arguments.size == 1
          $stderr.puts opts.usage
          exit 1
        end
        config_path, * = opts.rest_arguments
        config = YAML.load(File.read(config_path))
        # Explicit --log-file wins; otherwise Context supplies its own
        # default logger (nil here).
        logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
        ctx = Context.for_application('.', environment: opts.environment, logger: logger)
        redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds'))
        task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))

        service = new(
          context: ctx,
          control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
          data_source: redshift_ds,
          task_queue: task_queue,
          logger: ctx.logger
        )

        if opts.task_id
          # Single task mode
          service.execute_task_by_id opts.task_id
        else
          # Server mode
          Process.daemon(true) if opts.daemon?
          create_pid_file opts.pid_file_path if opts.pid_file_path
          service.event_loop
        end
      end

      # Builds a rotating file logger from the config defaults.
      def LoaderService.new_logger(path, config)
        Logger.new(
          device: path,
          rotation_period: config.fetch('log-rotation-period', 'daily'),
          rotation_size: config.fetch('log-rotation-size', nil)
        )
      end

      # Best-effort PID file creation; failures are deliberately
      # swallowed so a bad --pid-file path does not kill the server.
      def LoaderService.create_pid_file(path)
        File.open(path, 'w') {|f|
          f.puts $$
        }
      rescue
        # ignore
      end

      def initialize(context:, control_data_source:, data_source:, task_queue:, logger:)
        @ctx = context
        @ctl_ds = control_data_source
        @ds = data_source
        @task_queue = task_queue
        @logger = logger
      end

      # Blocks forever, dispatching queue messages to this object's
      # handle_* methods (e.g. handle_streaming_load_v3).
      def event_loop
        @task_queue.main_handler_loop(handlers: self, message_class: Task)
      end

      def execute_task_by_id(task_id)
        execute_task load_task(task_id)
      end

      # NOTE(review): default force: true means oneshot --task-id runs
      # re-execute already-executed tasks -- confirm this is intended.
      def load_task(task_id, force: true)
        @ctl_ds.open {|conn| LoadTask.load(conn, task_id, force: force) }
      end

      # 1. Load task detail from table
      # 2. Skip disabled (sqs message should not have disabled state since it will never be executed)
      # 3. Try execute
      #   - Skip if the task has already been executed AND force = false
      def handle_streaming_load_v3(task)
        loadtask = load_task(task.id, force: task.force)
        return if loadtask.disabled # skip if disabled, but don't delete sqs msg
        execute_task(loadtask)
        # Delete only after success; a raise above leaves the message
        # in the queue for retry.
        @task_queue.delete_message(task)
      end

      def execute_task(task)
        @logger.info "handling load task: table=#{task.qualified_name} task_id=#{task.id}"
        loader = Loader.load_from_file(@ctx, @ctl_ds, task, logger: @logger)
        loader.execute
      end

    end
102
+
103
+ class LoaderServiceOptions
104
+
105
+ def initialize(argv)
106
+ @argv = argv
107
+ @task_id = nil
108
+ @daemon = false
109
+ @log_file_path = nil
110
+ @pid_file_path = nil
111
+ @rest_arguments = nil
112
+
113
+ @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
114
+ opts.on('--task-id=ID', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
115
+ @task_id = task_id
116
+ }
117
+ opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{Context::DEFAULT_ENV}]") {|env|
118
+ @environment = env
119
+ }
120
+ opts.on('--daemon', 'Becomes daemon in server mode.') {
121
+ @daemon = true
122
+ }
123
+ opts.on('--log-file=PATH', 'Log file path') {|path|
124
+ @log_file_path = path
125
+ }
126
+ opts.on('--pid-file=PATH', 'Creates PID file.') {|path|
127
+ @pid_file_path = path
128
+ }
129
+ opts.on('--help', 'Prints this message and quit.') {
130
+ puts opts.help
131
+ exit 0
132
+ }
133
+ opts.on('--version', 'Prints version and quit.') {
134
+ puts "#{File.basename($0)} version #{VERSION}"
135
+ exit 0
136
+ }
137
+ end
138
+
139
+ def usage
140
+ @opts.help
141
+ end
142
+
143
+ def parse
144
+ @opts.parse!(@argv)
145
+ @rest_arguments = @argv.dup
146
+ rescue OptionParser::ParseError => err
147
+ raise OptionError, err.message
148
+ end
149
+
150
+ attr_reader :rest_arguments, :environment, :log_file_path
151
+ attr_reader :task_id
152
+
153
+ def daemon?
154
+ @daemon
155
+ end
156
+
157
+ attr_reader :pid_file_path
158
+
159
+ end
160
+
161
+ end
162
+
163
+ end