bricolage-streamingload 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,139 @@
1
+ require 'bricolage/sqsdatasource'
2
+
3
+ module Bricolage
4
+
5
+ module StreamingLoad
6
+
7
+ class Event < SQSMessage
8
+
9
+ def Event.get_concrete_class(msg, rec)
10
+ case
11
+ when rec['eventName'] == 'shutdown' then ShutdownEvent
12
+ when rec['eventName'] == 'dispatch' then DispatchEvent
13
+ when rec['eventName'] == 'flush' then FlushEvent
14
+ when rec['eventSource'] == 'aws:s3'
15
+ S3ObjectEvent
16
+ else
17
+ raise "[FATAL] unknown SQS message record: eventSource=#{rec['eventSource']} event=#{rec['eventName']} message_id=#{msg.message_id}"
18
+ end
19
+ end
20
+
21
+ def message_type
22
+ raise "#{self.class}\#message_type must be implemented"
23
+ end
24
+
25
+ def data?
26
+ false
27
+ end
28
+
29
+ end
30
+
31
+
32
+ class ShutdownEvent < Event
33
+
34
+ def ShutdownEvent.create
35
+ super name: 'shutdown'
36
+ end
37
+
38
+ def ShutdownEvent.parse_sqs_record(msg, rec)
39
+ {}
40
+ end
41
+
42
+ alias message_type name
43
+
44
+ def init_message
45
+ end
46
+
47
+ end
48
+
49
+
50
+ class FlushEvent < Event
51
+
52
+ def FlushEvent.create(delay_seconds:, table_name:)
53
+ super name: 'flush', delay_seconds: delay_seconds, table_name: table_name
54
+ end
55
+
56
+ def FlushEvent.parse_sqs_record(msg, rec)
57
+ {
58
+ table_name: rec['tableName']
59
+ }
60
+ end
61
+
62
+ alias message_type name
63
+
64
+ def init_message(table_name:)
65
+ @table_name = table_name
66
+ end
67
+
68
+ attr_reader :table_name
69
+
70
+ def body
71
+ obj = super
72
+ obj['tableName'] = @table_name
73
+ obj
74
+ end
75
+
76
+ end
77
+
78
+ class DispatchEvent < Event
79
+
80
+ def DispatchEvent.create(delay_seconds:)
81
+ super name: 'dispatch', delay_seconds: delay_seconds
82
+ end
83
+
84
+ alias message_type name
85
+
86
+ def init_message(dummy)
87
+ end
88
+ end
89
+
90
+
91
+ class S3ObjectEvent < Event
92
+
93
+ def S3ObjectEvent.parse_sqs_record(msg, rec)
94
+ {
95
+ region: rec['awsRegion'],
96
+ bucket: rec['s3']['bucket']['name'],
97
+ key: rec['s3']['object']['key'],
98
+ size: rec['s3']['object']['size']
99
+ }
100
+ end
101
+
102
+ def message_type
103
+ 'data'
104
+ end
105
+
106
+ def init_message(region:, bucket:, key:, size:)
107
+ @region = region
108
+ @bucket = bucket
109
+ @key = key
110
+ @size = size
111
+ end
112
+
113
+ attr_reader :region
114
+ attr_reader :bucket
115
+ attr_reader :key
116
+ attr_reader :size
117
+
118
+ def url
119
+ "s3://#{@bucket}/#{@key}"
120
+ end
121
+
122
+ # override
123
+ def data?
124
+ true
125
+ end
126
+
127
+ def created?
128
+ !!(/\AObjectCreated:(?!Copy)/ =~ @name)
129
+ end
130
+
131
+ def loadable_object(url_patterns)
132
+ LoadableObject.new(self, url_patterns.match(url))
133
+ end
134
+
135
+ end
136
+
137
+ end
138
+
139
+ end
@@ -0,0 +1,144 @@
1
+ require 'bricolage/streamingload/loaderparams'
2
+ require 'bricolage/streamingload/manifest'
3
+ require 'bricolage/sqlutils'
4
+ require 'socket'
5
+ require 'json'
6
+
7
+ module Bricolage
8
+
9
+ module StreamingLoad
10
+
11
+ class Loader
12
+
13
+ include SQLUtils
14
+
15
+ def Loader.load_from_file(ctx, ctl_ds, task, logger:)
16
+ params = LoaderParams.load(ctx, task)
17
+ new(ctl_ds, params, logger: logger)
18
+ end
19
+
20
+ def initialize(ctl_ds, params, logger:)
21
+ @ctl_ds = ctl_ds
22
+ @params = params
23
+ @logger = logger
24
+ @process_id = "#{Socket.gethostname}-#{$$}"
25
+ end
26
+
27
+ def execute
28
+ @job_id = assign_task
29
+ return unless @job_id # task already executed by other loader
30
+ @params.ds.open {|conn|
31
+ @connection = conn
32
+ do_load
33
+ }
34
+ end
35
+
36
+ def assign_task
37
+ @ctl_ds.open {|conn|
38
+ job_id = conn.query_value(<<-EndSQL)
39
+ insert into strload_jobs
40
+ ( task_id
41
+ , process_id
42
+ , status
43
+ , start_time
44
+ )
45
+ select
46
+ task_id
47
+ , #{s @process_id}
48
+ , 'running'
49
+ , current_timestamp
50
+ from
51
+ strload_tasks
52
+ where
53
+ task_id = #{@params.task_id}
54
+ and (task_id not in (select task_id from strload_jobs) or #{@params.force})
55
+ returning job_id
56
+ ;
57
+ EndSQL
58
+ return job_id
59
+ }
60
+ end
61
+
62
+ def do_load
63
+ ManifestFile.create(
64
+ @params.ctl_bucket,
65
+ job_id: @job_id,
66
+ object_urls: @params.object_urls,
67
+ logger: @logger
68
+ ) {|manifest|
69
+ if @params.enable_work_table?
70
+ prepare_work_table @params.work_table
71
+ load_objects @params.work_table, manifest, @params.load_options_string
72
+ @connection.transaction {
73
+ commit_work_table @params
74
+ commit_job_result
75
+ }
76
+ else
77
+ @connection.transaction {
78
+ load_objects @params.dest_table, manifest, @params.load_options_string
79
+ commit_job_result
80
+ }
81
+ end
82
+ }
83
+ rescue JobFailure => ex
84
+ write_job_error 'failure', ex.message
85
+ raise
86
+ rescue Exception => ex
87
+ write_job_error 'error', ex.message
88
+ raise
89
+ end
90
+
91
+ def prepare_work_table(work_table)
92
+ @connection.execute("truncate #{work_table}")
93
+ end
94
+
95
+ def load_objects(dest_table, manifest, options)
96
+ @connection.execute(<<-EndSQL.strip.gsub(/\s+/, ' '))
97
+ copy #{dest_table}
98
+ from #{s manifest.url}
99
+ credentials #{s manifest.credential_string}
100
+ manifest
101
+ statupdate false
102
+ compupdate false
103
+ #{options}
104
+ ;
105
+ EndSQL
106
+ @logger.info "load succeeded: #{manifest.url}"
107
+ end
108
+
109
+ def commit_work_table(params)
110
+ @connection.execute(params.sql_source)
111
+ # keep work table records for later tracking
112
+ end
113
+
114
+ def commit_job_result
115
+ @end_time = Time.now
116
+ write_job_result 'success', ''
117
+ end
118
+
119
+ MAX_MESSAGE_LENGTH = 1000
120
+
121
+ def write_job_error(status, message)
122
+ @end_time = Time.now
123
+ write_job_result status, message.lines.first.strip[0, MAX_MESSAGE_LENGTH]
124
+ end
125
+
126
+ def write_job_result(status, message)
127
+ @ctl_ds.open {|conn|
128
+ conn.execute(<<-EndSQL)
129
+ update
130
+ strload_jobs
131
+ set
132
+ (status, finish_time, message) = (#{s status}, current_timestamp, #{s message})
133
+ where
134
+ job_id = #{@job_id}
135
+ ;
136
+ EndSQL
137
+ }
138
+ end
139
+
140
+ end
141
+
142
+ end
143
+
144
+ end
@@ -0,0 +1,153 @@
1
+ require 'bricolage/rubyjobclass'
2
+ require 'bricolage/psqldatasource'
3
+
4
+ module Bricolage
5
+
6
+ module StreamingLoad
7
+
8
+ class LoaderParams
9
+
10
+ def LoaderParams.load(ctx, task)
11
+ job = load_job(ctx, task)
12
+ schema = resolve_schema(ctx, task.schema)
13
+ job.provide_default 'dest-table', "#{schema}.#{task.table}"
14
+ #job.provide_sql_file_by_job_id # FIXME: provide only when exist
15
+ job.compile
16
+ new(task, job)
17
+ end
18
+
19
+ def LoaderParams.load_job(ctx, task)
20
+ if job_file = find_job_file(ctx, task.schema, task.table)
21
+ ctx.logger.debug "using .job file: #{job_file}"
22
+ Job.load_file(job_file, ctx.subsystem(task.schema))
23
+ else
24
+ ctx.logger.debug "using default job parameters (no .job file)"
25
+ Job.instantiate(task.table, 'streaming_load_v3', ctx).tap {|job|
26
+ job.bind_parameters({})
27
+ }
28
+ end
29
+ end
30
+
31
+ def LoaderParams.find_job_file(ctx, schema, table)
32
+ paths = Dir.glob("#{ctx.home_path}/#{schema}/#{table}.*")
33
+ paths.select {|path| File.extname(path) == '.job' }.sort.first
34
+ end
35
+
36
+ def LoaderParams.resolve_schema(ctx, schema)
37
+ ctx.global_variables["#{schema}_schema"] || schema
38
+ end
39
+ private_class_method :resolve_schema
40
+
41
+ def initialize(task, job)
42
+ @task = task
43
+ @job = job
44
+ @params = job.params
45
+ end
46
+
47
+ def task_id
48
+ @task.id
49
+ end
50
+
51
+ def task_id
52
+ @task.id
53
+ end
54
+
55
+ def schema
56
+ @task.schema
57
+ end
58
+
59
+ def table
60
+ @task.table
61
+ end
62
+
63
+ def force
64
+ @task.force
65
+ end
66
+
67
+ def object_urls
68
+ @task.object_urls
69
+ end
70
+
71
+ def ds
72
+ @params['redshift-ds']
73
+ end
74
+
75
+ def ctl_bucket
76
+ @params['ctl-ds']
77
+ end
78
+
79
+ def enable_work_table?
80
+ !!@params['work-table']
81
+ end
82
+
83
+ def work_table
84
+ @params['work-table']
85
+ end
86
+
87
+ def dest_table
88
+ @params['dest-table']
89
+ end
90
+
91
+ def load_options_string
92
+ @params['load-options'].to_s
93
+ end
94
+
95
+ def sql_source
96
+ sql = @params['sql-file']
97
+ sql ? sql.source : "insert into #{dest_table} select * from #{work_table};"
98
+ end
99
+
100
+ end
101
+
102
+
103
+ class LoaderJob < RubyJobClass
104
+
105
+ job_class_id 'streaming_load_v3'
106
+
107
+ def self.parameters(params)
108
+ params.add DestTableParam.new(optional: false)
109
+ params.add DestTableParam.new('work-table', optional: true)
110
+ params.add KeyValuePairsParam.new('load-options', 'OPTIONS', 'Loader options.',
111
+ optional: true, default: DEFAULT_LOAD_OPTIONS,
112
+ value_handler: lambda {|value, ctx, vars| PSQLLoadOptions.parse(value) })
113
+ params.add SQLFileParam.new('sql-file', 'PATH', 'SQL to insert rows from the work table to the target table.', optional: true)
114
+ params.add DataSourceParam.new('sql', 'redshift-ds', 'Target data source.')
115
+ params.add DataSourceParam.new('s3', 'ctl-ds', 'Manifest file data source.')
116
+ end
117
+
118
+ def self.default_load_options
119
+ end
120
+
121
+ # Use loosen options by default
122
+ default_options = [
123
+ ['json', 'auto'],
124
+ ['gzip', true],
125
+ ['timeformat', 'auto'],
126
+ ['dateformat', 'auto'],
127
+ ['acceptanydate', true],
128
+ ['acceptinvchars', ' '],
129
+ ['truncatecolumns', true],
130
+ ['trimblanks', true]
131
+ ]
132
+ opts = default_options.map {|name, value| PSQLLoadOptions::Option.new(name, value) }
133
+ DEFAULT_LOAD_OPTIONS = PSQLLoadOptions.new(opts)
134
+
135
+ def self.declarations(params)
136
+ Bricolage::Declarations.new(
137
+ 'dest_table' => nil,
138
+ )
139
+ end
140
+
141
+ def initialize(params)
142
+ @params = params
143
+ end
144
+
145
+ def bind(ctx, vars)
146
+ @params['sql-file'].bind(ctx, vars) if @params['sql-file']
147
+ end
148
+
149
+ end
150
+
151
+ end
152
+
153
+ end
@@ -0,0 +1,163 @@
1
require 'optparse'
require 'yaml'
require 'bricolage/sqsdatasource'
require 'bricolage/streamingload/task'
require 'bricolage/streamingload/loader'
require 'bricolage/logger'
require 'bricolage/exception'
require 'bricolage/version'
8
+
9
+ module Bricolage
10
+
11
+ module StreamingLoad
12
+
13
+ class LoaderService
14
+
15
+ def LoaderService.main
16
+ opts = LoaderServiceOptions.new(ARGV)
17
+ opts.parse
18
+ unless opts.rest_arguments.size == 1
19
+ $stderr.puts opts.usage
20
+ exit 1
21
+ end
22
+ config_path, * = opts.rest_arguments
23
+ config = YAML.load(File.read(config_path))
24
+ logger = opts.log_file_path ? new_logger(opts.log_file_path, config) : nil
25
+ ctx = Context.for_application('.', environment: opts.environment, logger: logger)
26
+ redshift_ds = ctx.get_data_source('sql', config.fetch('redshift-ds'))
27
+ task_queue = ctx.get_data_source('sqs', config.fetch('task-queue-ds'))
28
+
29
+ service = new(
30
+ context: ctx,
31
+ control_data_source: ctx.get_data_source('sql', config.fetch('ctl-postgres-ds')),
32
+ data_source: redshift_ds,
33
+ task_queue: task_queue,
34
+ logger: ctx.logger
35
+ )
36
+
37
+ if opts.task_id
38
+ # Single task mode
39
+ service.execute_task_by_id opts.task_id
40
+ else
41
+ # Server mode
42
+ Process.daemon(true) if opts.daemon?
43
+ create_pid_file opts.pid_file_path if opts.pid_file_path
44
+ service.event_loop
45
+ end
46
+ end
47
+
48
+ def LoaderService.new_logger(path, config)
49
+ Logger.new(
50
+ device: path,
51
+ rotation_period: config.fetch('log-rotation-period', 'daily'),
52
+ rotation_size: config.fetch('log-rotation-size', nil)
53
+ )
54
+ end
55
+
56
+ def LoaderService.create_pid_file(path)
57
+ File.open(path, 'w') {|f|
58
+ f.puts $$
59
+ }
60
+ rescue
61
+ # ignore
62
+ end
63
+
64
+ def initialize(context:, control_data_source:, data_source:, task_queue:, logger:)
65
+ @ctx = context
66
+ @ctl_ds = control_data_source
67
+ @ds = data_source
68
+ @task_queue = task_queue
69
+ @logger = logger
70
+ end
71
+
72
+ def event_loop
73
+ @task_queue.main_handler_loop(handlers: self, message_class: Task)
74
+ end
75
+
76
+ def execute_task_by_id(task_id)
77
+ execute_task load_task(task_id)
78
+ end
79
+
80
+ def load_task(task_id, force: true)
81
+ @ctl_ds.open {|conn| LoadTask.load(conn, task_id, force: force) }
82
+ end
83
+
84
+ def handle_streaming_load_v3(task)
85
+ # 1. Load task detail from table
86
+ # 2. Skip disabled (sqs message should not have disabled state since it will never be exectuted)
87
+ # 3. Try execute
88
+ # - Skip if the task has already been executed AND force = false
89
+ loadtask = load_task(task.id, force: task.force)
90
+ return if loadtask.disabled # skip if disabled, but don't delete sqs msg
91
+ execute_task(loadtask)
92
+ @task_queue.delete_message(task)
93
+ end
94
+
95
+ def execute_task(task)
96
+ @logger.info "handling load task: table=#{task.qualified_name} task_id=#{task.id}"
97
+ loader = Loader.load_from_file(@ctx, @ctl_ds, task, logger: @logger)
98
+ loader.execute
99
+ end
100
+
101
+ end
102
+
103
+ class LoaderServiceOptions
104
+
105
+ def initialize(argv)
106
+ @argv = argv
107
+ @task_id = nil
108
+ @daemon = false
109
+ @log_file_path = nil
110
+ @pid_file_path = nil
111
+ @rest_arguments = nil
112
+
113
+ @opts = opts = OptionParser.new("Usage: #{$0} CONFIG_PATH")
114
+ opts.on('--task-id=ID', 'Execute oneshot load task (implicitly disables daemon mode).') {|task_id|
115
+ @task_id = task_id
116
+ }
117
+ opts.on('-e', '--environment=NAME', "Sets execution environment [default: #{Context::DEFAULT_ENV}]") {|env|
118
+ @environment = env
119
+ }
120
+ opts.on('--daemon', 'Becomes daemon in server mode.') {
121
+ @daemon = true
122
+ }
123
+ opts.on('--log-file=PATH', 'Log file path') {|path|
124
+ @log_file_path = path
125
+ }
126
+ opts.on('--pid-file=PATH', 'Creates PID file.') {|path|
127
+ @pid_file_path = path
128
+ }
129
+ opts.on('--help', 'Prints this message and quit.') {
130
+ puts opts.help
131
+ exit 0
132
+ }
133
+ opts.on('--version', 'Prints version and quit.') {
134
+ puts "#{File.basename($0)} version #{VERSION}"
135
+ exit 0
136
+ }
137
+ end
138
+
139
+ def usage
140
+ @opts.help
141
+ end
142
+
143
+ def parse
144
+ @opts.parse!(@argv)
145
+ @rest_arguments = @argv.dup
146
+ rescue OptionParser::ParseError => err
147
+ raise OptionError, err.message
148
+ end
149
+
150
+ attr_reader :rest_arguments, :environment, :log_file_path
151
+ attr_reader :task_id
152
+
153
+ def daemon?
154
+ @daemon
155
+ end
156
+
157
+ attr_reader :pid_file_path
158
+
159
+ end
160
+
161
+ end
162
+
163
+ end