redtrack 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
1
+ # Datatypes provides a run time bound implementation for validating passed data types
2
+ #
3
+ # Copyright (c) 2014 RedHotLabs, Inc.
4
+ # Licensed under the MIT License
5
+
6
+ module RedTrack
7
+ class DataTypes
8
+
9
+ TAG='RedTrack::DataTypes'
+
+ @logger = nil
10
+
11
+ # Constructor - non-static; we want a runtime-bound interface
12
+ def initialize(options)
13
+ if options && options[:logger] != nil
14
+ @logger = options[:logger]
15
+ else
16
+ @logger = Logger.new(STDOUT)
17
+ end
18
+ end
19
+
20
+ # @return [Array] Return an array of valid data types
21
+ def valid_data_types
22
+ result = %w(smallint integer bigint decimal real boolean char varchar date timestamp)
23
+ return result
24
+ end
25
+
26
+ # Check and clean value to ensure it conforms to the redshift data type
27
+ #
28
+ # @param [Object] value the value to set for the column
29
+ # @param [String] type_definition the type defined by the schema
30
+ # @param [String] column_name The name of the redshift column
31
+ # @return [Object] The value if it is valid
32
+ def check_smallint(value,type_definition=nil,column_name=nil)
33
+ if value.is_a?(Integer) == false
34
+ raise_exception(column_name,value,type_definition)
35
+ end
36
+ # TODO: Range / overflow check
37
+ return value
38
+ end
39
+
40
+ # Check and clean value to ensure it conforms to the redshift data type
41
+ #
42
+ # @param [Object] value the value to set for the column
43
+ # @param [String] type_definition the type defined by the schema
44
+ # @param [String] column_name The name of the redshift column
45
+ # @return [Object] The value if it is valid
46
+ def check_integer(value,type_definition=nil,column_name=nil)
47
+ if value.is_a?(Integer) == false
48
+ raise_exception(column_name,value,type_definition)
49
+ end
50
+ # TODO: range / overflow check
51
+ return value
52
+ end
53
+
54
+ # Check and clean value to ensure it conforms to the redshift data type
55
+ #
56
+ # @param [Object] value the value to set for the column
57
+ # @param [String] type_definition the type defined by the schema
58
+ # @param [String] column_name The name of the redshift column
59
+ # @return [Object] The value if it is valid
60
+ def check_bigint(value,type_definition=nil,column_name=nil)
61
+ if value.is_a?(Integer) == false
62
+ raise_exception(column_name,value,type_definition)
63
+ end
64
+ # TODO: range /overflow check
65
+ return value
66
+ end
67
+
68
+ # Check and clean value to ensure it conforms to the redshift data type
69
+ #
70
+ # @param [Object] value the value to set for the column
71
+ # @param [String] type_definition the type defined by the schema
72
+ # @param [String] column_name The name of the redshift column
73
+ # @return [Object] The value if it is valid
74
+ def check_decimal(value,type_definition=nil,column_name=nil)
75
+ if value.is_a?(String) == false || is_numeric(value) == false
76
+ raise_exception(column_name,value,type_definition)
77
+ #raise ""
78
+ end
79
+
80
+ return value
81
+ end
82
+
83
+ # Check and clean value to ensure it conforms to the redshift data type
84
+ #
85
+ # @param [Object] value the value to set for the column
86
+ # @param [String] type_definition the type defined by the schema
87
+ # @param [String] column_name The name of the redshift column
88
+ # @return [Object] The value if it is valid
89
+ def check_real(value,type_definition=nil,column_name=nil)
90
+ if is_numeric(value) == false
91
+ raise_exception(column_name,value,type_definition)
92
+ end
93
+
94
+ return value
95
+ end
96
+
97
+ # Check and clean value to ensure it conforms to the redshift data type
98
+ #
99
+ # @param [Object] value the value to set for the column
100
+ # @param [String] type_definition the type defined by the schema
101
+ # @param [String] column_name The name of the redshift column
102
+ # @return [Object] The value if it is valid - truncated if it is too long
103
+ def check_char(value,type_definition=nil,column_name=nil)
104
+ if value.is_a?(String) == false
105
+ raise_exception(column_name,value,type_definition)
106
+ end
107
+ # Truncate values that are too long
108
+ value = truncate_string(column_name,value,type_definition)
109
+ return value
110
+ end
111
+
112
+ # Check and clean value to ensure it conforms to the redshift data type
113
+ #
114
+ # @param [Object] value the value to set for the column
115
+ # @param [String] type_definition the type defined by the schema
116
+ # @param [String] column_name The name of the redshift column
117
+ # @return [Object] The value if it is valid - truncated if too long
118
+ def check_varchar(value,type_definition=nil,column_name=nil)
119
+ if value.is_a?(String) == false
120
+ raise_exception(column_name,value,type_definition)
121
+ end
122
+ # Truncate values that are too long
123
+ value = truncate_string(column_name,value,type_definition)
124
+ return value
125
+ end
126
+
127
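+ # Check and clean value to ensure it conforms to the redshift data type (date validation not yet implemented)
+ #
+ # @param [Object] value the value to set for the column
+ # @param [String] type_definition the type defined by the schema
+ # @param [String] column_name The name of the redshift column
+ # @return [Object] The value, currently passed through unvalidated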
+ def check_date(value,type_definition=nil,column_name=nil)
128
+ return value # TODO: validate date format
129
+ end
130
+
131
+ # Check and clean value to ensure it conforms to the redshift data type
132
+ #
133
+ # @param [Object] value the value to set for the column
134
+ # @param [String] type_definition the type defined by the schema
135
+ # @param [String] column_name The name of the redshift column
136
+ # @return [Object] The value if it is valid
137
+ def check_timestamp(value,type_definition=nil,column_name=nil)
138
+ if value.is_a?(String) == false || value[/\A\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\z/] == nil
139
+ raise_exception(column_name,value,type_definition)
140
+ end
141
+ return value
142
+ end
143
+
144
+ private
145
+
146
+ # Helper function, raise a general exception message
147
+ #
148
+ # @param [String] column_name The name of the redshift column
149
+ # @param [Object] value the value to set for the column
150
+ # @param [String] type_definition the the type defined by the schema
151
+ def raise_exception(column_name,value,type_definition)
152
+ raise "Value for column #{column_name}, #{value.to_s}, does not conform to type '#{type_definition}'"
153
+ end
154
+
155
+ # Determine whether the passed value is a valid number (e.g., a numeric string)
156
+ #
157
+ # @param [Numeric] value The value to check as valid numeric
158
+ # @return [Boolean] Whether or not the value is a numeric
159
+ def is_numeric(value)
160
+ Float(value) != nil rescue false
161
+ end
162
+
163
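+ # Truncate a string to the length declared in the column's type definition, eg 'varchar(256)'
+ #
+ # @param [String] column_name The name of the redshift column
+ # @param [String] value The value to truncate
+ # @param [String] type_definition The type defined by the schema, eg 'varchar(256)'
+ # @return [String] The value, truncated to the declared length if it was too long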
+ def truncate_string(column_name,value,type_definition)
164
+ num_chars = type_definition[/\((\d*)\)/,1].to_i
165
+ puts "Num chars: #{num_chars}"
166
+ if(value.length > num_chars)
167
+ @logger.warn("#{TAG} Data for column #{column_name} is too long (#{value.length} characters) for column type and will be truncated to #{num_chars} characters: '#{value}'")
168
+ return value[0..num_chars-1]
169
+ else
170
+ return value
171
+ end
172
+ end
173
+
174
+ end
175
+ end
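For context, a minimal usage sketch of the RedTrack::DataTypes class above (illustrative only, not part of the published package; it assumes the gem is loaded via require 'redtrack'):

    require 'logger'
    require 'redtrack'

    types = RedTrack::DataTypes.new(:logger => Logger.new(STDOUT))

    # Conforming values are returned unchanged
    name = types.check_varchar('alice', 'varchar(64)', 'user_name')

    # Strings longer than the declared length are truncated with a warning
    note = types.check_varchar('a' * 300, 'varchar(256)', 'note')

    # Non-conforming values raise, naming the column, value, and type
    begin
      types.check_integer('not-a-number', 'integer', 'age')
    rescue => e
      puts e.message
    end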
@@ -0,0 +1,238 @@
1
+ # The KinesisClient provides an application interface to aws kinesis as a data broker
2
+ #
3
+ # Copyright (c) 2014 RedHotLabs, Inc.
4
+ # Licensed under the MIT License
5
+
6
+ module RedTrack
7
+ class KinesisClient
8
+
9
+ @verbose = false
10
+
11
+ TAG='RedTrack::KinesisClient'
12
+
13
+ DEFAULT_MAX_RECORDS=1000000
14
+ DEFAULT_MAX_REQUESTS=100
15
+
16
+ # Setup instance variables for kinesis access
17
+ #
18
+ # @param [Hash] options Expects :redshift_cluster_name, :redshift_dbname. Optionally :verbose
19
+ # @return [Boolean] Success
20
+ def initialize(options)
21
+ @verbose = options[:verbose] || false
22
+ @logger = options[:logger]
23
+ if @logger == nil
24
+ @logger = Logger.new(STDOUT)
25
+ end
26
+ @options=options
27
+ end
28
+
29
+ # Name of the stream in the data broker (This is a Kinesis stream name)
30
+ #
31
+ # @param [String] redshift_table Name of the redshift table
32
+ # @return [String] Name of the stream in Kinesis
33
+ def stream_name(redshift_table)
34
+ if @options[:redshift_cluster_name] == nil || @options[:redshift_dbname] == nil
35
+ raise 'Need to specify :redshift_cluster_name and :redshift_dbname as options'
36
+ end
37
+ result= @options[:redshift_cluster_name] + '.' + @options[:redshift_dbname] + ".#{redshift_table}"
38
+ return result
39
+ end
40
+
41
+ # Get descriptions of all of the stream's shards from describe_stream
42
+ #
43
+ # @param [String] stream_name The name of the kinesis stream
44
+ # @return [Array] Information regarding the stream shards
45
+ def get_shard_descriptions(stream_name)
46
+ describe_response = AWS.kinesis.client.describe_stream({:stream_name => stream_name})
47
+
48
+ result = nil
49
+ if describe_response != nil && describe_response[:stream_description] != nil
50
+ result = describe_response[:stream_description][:shards]
51
+ end
52
+ return result
53
+ end
54
+
55
+ # Get hash describing the shard from describe_stream
56
+ #
57
+ # @param [String] stream_name The name of the kinesis stream
58
+ # @param [Integer] stream_shard_index The index of the shard in the array of shards
59
+ # @return [Hash] Information regarding the stream shard, from AWS kinesis
60
+ def get_shard_description(stream_name,stream_shard_index)
61
+ describe_response = AWS.kinesis.client.describe_stream({:stream_name => stream_name})
62
+
63
+ if describe_response != nil && describe_response[:stream_description] != nil
64
+ result = describe_response[:stream_description][:shards][stream_shard_index]
65
+ result[:success] = true
66
+ result[:stream_description] = describe_response[:stream_description]
67
+ else
68
+ result = {
69
+ success: false,
70
+ describe_response: describe_response
71
+ }
72
+ end
73
+ return result
74
+ end
75
+
76
+ # Create a kinesis stream for the redshift table
77
+ #
78
+ # @param [String] table The name of the table
79
+ # @param [integer] shard_count The number of shards in the stream
80
+ def create_kinesis_stream_for_table(table,shard_count=1)
81
+ options = {
82
+ :stream_name => stream_name(table),
83
+ :shard_count => shard_count
84
+ }
85
+ result = AWS.kinesis.client.create_stream(options)
86
+ return result
87
+ end
88
+
89
+ # Get the shard iterator given a checkpointed sequence number. If no checkpoint, start reading from the start of the shard
90
+ #
91
+ # @param [String] stream_name The name of the stream to get a shard iterator for
92
+ # @param [Hash] shard_description Result from describe stream request
93
+ # @param [String] starting_sequence_number The sequence number to get a shard iterator for; if it doesn't exist, get one for the start of the shard
94
+ # @return [String] The shard iterator
95
+ def get_shard_iterator_from_sequence_number(stream_name,shard_description,starting_sequence_number=nil)
96
+
97
+ ## Get shard iterator
98
+ get_shard_iterator_options = {
99
+ :stream_name => stream_name,
100
+ :shard_id => shard_description[:shard_id]
101
+ }
102
+
103
+ ## Options based on starting sequence number
104
+ if starting_sequence_number != nil
105
+ get_shard_iterator_options[:shard_iterator_type] = 'AFTER_SEQUENCE_NUMBER'
106
+ get_shard_iterator_options[:starting_sequence_number] = starting_sequence_number
107
+ else
108
+ @logger.warn("Shard '#{shard_description[:shard_id]}' has no starting sequence number, use TRIM_HORIZON shard iterator")
109
+ get_shard_iterator_options[:shard_iterator_type] = 'TRIM_HORIZON'
110
+ end
111
+
112
+ get_shard_iterator_response = AWS.kinesis.client.get_shard_iterator(get_shard_iterator_options)
113
+ shard_iterator = get_shard_iterator_response[:shard_iterator]
114
+ return shard_iterator
115
+ end
116
+
117
+ # Read from a kinesis shard into a set of files
118
+ #
119
+ # @param [String] shard_iterator The shard iterator to start reading from - result of get_shard_iterator
120
+ # @param [Array] files Array of files to write data into
121
+ # @param [Hash] options Optional. Can specify :max_records, :max_requests
122
+ # @return [Hash] Hash of the starting and ending sequence numbers, the number of records read, and the next shard iterator
123
+ def stream_read_from_shard_iterator_into_files(shard_iterator, files, options={})
124
+
125
+ max_records = options[:max_records] || DEFAULT_MAX_RECORDS
126
+ max_requests = options[:max_requests] || DEFAULT_MAX_REQUESTS
127
+
128
+ start_sequence_number=nil
129
+ end_sequence_number=nil
130
+ records = 0
131
+ num_files = files.length
132
+
133
+ for i in 1..max_requests
134
+
135
+ # Execute get_records against AWS Kinesis
136
+ get_records_response = AWS.kinesis.client.get_records({:shard_iterator => shard_iterator})
137
+
138
+ # Process records
139
+ if get_records_response != nil && get_records_response.data != nil && get_records_response.data[:records] != nil && get_records_response.data[:records].count > 0
140
+ get_records_response.data[:records].each do |record|
141
+
142
+ data_payload = JSON.parse(record[:data])
143
+ data = data_payload['data']
144
+
145
+ # rotate which file we write into
146
+ files[records % num_files].puts data + "\n"
147
+
148
+ # Sequence numbers
149
+ if (start_sequence_number == nil)
150
+ start_sequence_number = record[:sequence_number].to_i
151
+ end
152
+ if (end_sequence_number == nil || record[:sequence_number].to_i > end_sequence_number)
153
+ end_sequence_number = record[:sequence_number].to_i
154
+ else
155
+ @logger.warn("#{TAG} Out of order sequence number: #{end_sequence_number.to_s}")
156
+ end
157
+
158
+ # Increment records read; check exit condition
159
+ records+=1
160
+ if (records >= max_records)
161
+ break
162
+ end
163
+ end
164
+ end
165
+
166
+ # set shard iterator for next request from payload
167
+ shard_iterator=get_records_response.data[:next_shard_iterator]
168
+
169
+ # Check exit conditions
170
+ if(shard_iterator == nil || records >= max_records)
171
+ break
172
+ end
173
+ end
174
+
175
+ result = {
176
+ starting_sequence_number: start_sequence_number.to_s,
177
+ ending_sequence_number: end_sequence_number.to_s,
178
+ next_shard_iterator: shard_iterator,
179
+ records: records
180
+ }
181
+ return result
182
+ end
183
+
184
+ # Write data to a stream. This expects the data to be a serialized string
185
+ #
186
+ # @param [String] stream_name The name of the stream
187
+ # @param [String] data_string String of data to write
188
+ # @param [String] partition_key How to keep the data partitioned in kinesis. See http://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html#Kinesis-PutRecord-request-PartitionKey
189
+ # @return [Boolean] True - the write to the stream succeeded
190
+ def stream_write(stream_name,data_string,partition_key=nil)
191
+ result=false
192
+
193
+ partition_key = partition_key || rand(100).to_s
194
+
195
+ put_data = {
196
+ :data => data_string
197
+ }
198
+
199
+ put_options = {
200
+ :stream_name => stream_name,
201
+ :partition_key => partition_key,
202
+ :data => put_data.to_json
203
+ }
204
+
205
+ @logger.debug("#{TAG} write to #{stream_name} stream with data #{data_string}")
206
+
207
+ # Write to kinesis; 3 attempts
208
+ attempt_count=3
209
+ last_exception=nil
210
+ while attempt_count > 0 && !result
211
+ begin
212
+ put_record_result = AWS.kinesis.client.put_record(put_options)
213
214
+ @logger.debug("#{TAG} put_record result #{put_record_result.to_json}")
215
+ if put_record_result.http_response.status < 299
216
+ result = true
217
+ else
218
+ @logger.warn("#{TAG} put_record response: HTTP #{put_record_result.http_response.status}: #{put_record_result.http_response.body}")
+ attempt_count-=1
219
+ end
220
+ rescue Exception => e
221
+
222
+ # log exception and retry with 1 second backoff
223
+ @logger.warn("#{TAG} put_record Exception caught #{e.class}: #{e.message}\n\t#{e.backtrace.join("\n\t")}")
224
+ attempt_count-=1
225
+ last_exception=e
226
+ end
227
+ end
228
+
229
+ # If failure after 3 retries, raise the last exception
230
+ if !result
231
+ raise last_exception
232
+ end
233
+
234
+ return result
235
+ end
236
+
237
+ end
238
+ end
@@ -0,0 +1,650 @@
1
+ # Redshift s3 loader. Reads events from the broker, copies them into s3, and then copies them into Redshift
2
+ # in a way that avoids loading duplicates.
3
+ #
4
+ # Copyright (c) 2014 RedHotLabs, Inc.
5
+ # Licensed under The MIT License
6
+
7
+ require 'tempfile'
8
+
9
+ module RedTrack
10
+
11
+
12
+ class LoaderException < Exception
13
+
14
+ attr_reader :information
15
+ def initialize(information)
16
+ @information = information
17
+ end
18
+
19
+ end
20
+
21
+ class Loader
22
+
23
+ TAG='RedTrack::Loader'
24
+
25
+ # S3 parameters
26
+ @broker=nil
27
+ @s3_bucket = nil
28
+ @redshift_conn = nil
29
+ @client = nil
30
+
31
+ @options = nil
32
+
33
+ @max_error = nil
34
+
35
+ @load_start_time=nil
36
+
37
+ DEFAULT_MAX_ERROR=2 # Set max error > 0 in case of "cosmic ray" events
38
+
39
+ # Setup class variables for redshift & s3 access
40
+ #
41
+ # @param [Hash] options expects access_key_id, secret_access_key, region, redshift_host, redshift_port, redshift_dbname, redshift_user, redshift_password, s3_bucket
42
+ # @param [RedTrack::KinesisClient] broker The broker client, created by the RedTrack::Client object
43
+ # @param [PG::Connection] redshift_conn The redshift connection used for loading data
44
+ # @return [Boolean] Success
45
+ def initialize(options,broker=nil,redshift_conn=nil)
46
+
47
+ # Broker
48
+ if broker
49
+ @broker = broker
50
+ else
51
+ raise 'Need to pass broker client to the loader'
52
+ end
53
+
54
+ # Check for redshift connection; it must be passed in
55
+ if redshift_conn
56
+ @redshift_conn = redshift_conn
57
+ else
58
+ raise 'Need to pass redshift connection to the loader'
59
+ end
60
+
61
+ options[:max_error] ||= DEFAULT_MAX_ERROR
62
+
63
+ @logger = options[:logger]
64
+ if @logger == nil
65
+ @logger = Logger.new(STDOUT)
66
+ end
67
+
68
+ @options = options
69
+
70
+ # Create S3 connection for bucket
71
+ @s3_bucket = AWS::S3.new.buckets[options[:s3_bucket]]
72
+
73
+ end
74
+
75
+ # Write a profiling message to the logger
76
+ #
77
+ # @param [String] message The message to write to the logger
78
+ def loader_profile(message)
79
+ elapsed_time=(Time.now-@load_start_time).round(2)
80
+ @logger.info("#{TAG} (#{elapsed_time}s elapsed) #{message}")
81
+ end
82
+
83
+ # High level function - read data from broker, upload data to s3, perform COPY command to load data into Redshift
84
+ #
85
+ # @param [String] redshift_table The name of the table in redshift to load
86
+ def load_redshift_from_broker(redshift_table)
87
+
88
+ # Start time - use this for profiling messages
89
+ @load_start_time = Time.now
90
+
91
+ # Get metadata about the kinesis stream and its shards
92
+ stream_name=@broker.stream_name(redshift_table)
93
+ shards = @broker.get_shard_descriptions(stream_name)
94
+ if shards == nil
95
+ information = {
96
+ :redshift_table => redshift_table,
97
+ :stream_name => stream_name
98
+ }
99
+ raise RedTrack::LoaderException.new(information), 'Could not get shard description'
100
+ end
101
+ loader_profile('Get stream metadata complete')
102
+
103
+ # Get metadata about the redshift cluster, specifically the number of slices
104
+ num_slices = get_number_of_slices(@options[:redshift_cluster_name])
105
+ loader_profile('Get redshift metadata complete')
106
+
107
+ # Get last loads for each shard - do this pre-fork in order to avoid re-establishing Redshift connections post-fork
108
+ last_shard_loads = get_last_shard_loads(redshift_table,stream_name,shards)
109
+ loader_profile('Get last shard loads complete')
110
+
111
+ # Determine where to upload files to s3 and the number of files each shard should produce
112
+ load_s3_location = s3_prefix(redshift_table, Time.new.utc.to_date, "load-#{@load_start_time.to_i}")
113
+
114
+ # For each shard, fork a process for stream read & s3 upload; create a pipe to communicate result back
115
+ pids = {}
116
+ result_readers = {}
117
+ shards.each do |shard|
118
+ result_reader,result_writer = IO.pipe
119
+ pid = fork do
120
+ loader_profile("#{shard[:shard_id]} fork - start")
121
+ last_shard_load = last_shard_loads[shard[:shard_id]]
122
+ begin
123
+ result = read_shard_and_upload_to_s3(shard,last_shard_load,load_s3_location,stream_name,num_slices)
124
+ result_writer.puts result.to_json
125
+ rescue Exception => e
126
+ @logger.warn("#{TAG} #{shard[:shard_id]} fork - Exception caught: #{e.class}: #{e.message}\n\t#{e.backtrace.join("\n\t")}")
127
+ result = {
128
+ :shard_id => shard[:shard_id],
129
+ :exception => e
130
+ }
131
+ result_writer.puts result.to_json
132
+ end
133
+ loader_profile("#{shard[:shard_id]} fork - read shard & upload done")
134
+ end
135
+ pids[shard[:shard_id]] = pid
136
+ result_readers[shard[:shard_id]] = result_reader
137
+ end
138
+
139
+ # Wait for the forked processes to finish and read the corresponding result pipe
140
+ fork_results = []
141
+ shards.each do |shard|
142
+ Thread.new { Process.waitpid(pids[shard[:shard_id]]) }.join
143
+ result_from_fork = result_readers[shard[:shard_id]].gets
144
+ if result_from_fork != nil && result_from_fork != 'nil'
145
+ fork_results << JSON.parse(result_from_fork, {symbolize_names: true})
146
+ else
147
+ fork_results << nil
148
+ end
149
+ end
150
+ loader_profile('All shard read & upload forks complete')
151
+
152
+ @logger.info("Fork Results: #{YAML::dump(fork_results)}")
153
+
154
+ # Build manifest and check results for shards to load into redshift
155
+ shards_to_load = []
156
+ manifest = {
157
+ :entries => []
158
+ }
159
+ fork_results.each do |fork_result|
160
+ if fork_result[:exception] != nil
161
+ raise "Exception in #{fork_result[:shard_id]} fork: #{fork_result[:exception]}"
162
+ end
163
+ if fork_result[:records] > 0
164
+ fork_result[:s3_urls].each do |s3_url|
165
+ entry = {
166
+ url: s3_url,
167
+ mandatory: true
168
+ }
169
+ manifest[:entries].push(entry)
170
+ end
171
+ shards_to_load << fork_result
172
+ end
173
+ end
174
+
175
+ # Check for exit condition - no shards have anything to load
176
+ if shards_to_load.length == 0
177
+ @logger.warn("#{TAG} No events read from any shards. Exiting.")
178
+ result = {
179
+ :success => true,
180
+ :records => 0,
181
+ :information => {
182
+ :redshift_table => redshift_table,
183
+ :shards => shards,
184
+ :last_shard_loads => last_shard_loads,
185
+ :fork_results => fork_results,
186
+ }
187
+ }
188
+ return result
189
+ end
190
+
191
+ # upload manifest to s3
192
+ manifest_s3_object = @s3_bucket.objects[load_s3_location + "manifest.json"]
193
+ manifest_s3_object.write(manifest.to_json)
194
+ manifest_s3_url = "s3://#{manifest_s3_object.bucket.name}/#{manifest_s3_object.key}"
195
+ loader_profile("manifest s3 upload complete #{manifest_s3_url}.")
196
+
197
+ # reconnect to redshift
198
+ @redshift_conn = PG.connect(
199
+ :host => @options[:redshift_host],
200
+ :port => @options[:redshift_port],
201
+ :dbname => @options[:redshift_dbname],
202
+ :user => @options[:redshift_user],
203
+ :password => @options[:redshift_password])
204
+
205
+ # Load the files into redshift via manifest
206
+ load_result = load_shards_manifest_into_redshift(manifest_s3_url,redshift_table,stream_name,shards_to_load,last_shard_loads)
207
+ loader_profile("Load kinesis shard into Redshift complete (#{load_result[:records]} events)")
208
+
209
+ information = {
210
+ :redshift_table => redshift_table,
211
+ :shards => shards,
212
+ :last_shard_loads => last_shard_loads,
213
+ :fork_results => fork_results,
214
+ :shards_to_load => shards_to_load,
215
+ :manifest => manifest,
216
+ :load_result => load_result
217
+ }
218
+
219
+ if(load_result[:success] == false)
220
+ raise RedTrack::LoaderException.new(information), 'COPY into redshift failed'
221
+ end
222
+
223
+ return {
224
+ :success => true,
225
+ :records => load_result[:records],
226
+ :addtl_information => information
227
+ }
228
+ end
229
+
230
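+ # Read a kinesis shard from its last checkpoint and upload the data to s3 (run inside a per-shard fork)
+ #
+ # @param [Hash] shard_description Description of the shard from describe_stream
+ # @param [Hash] last_shard_load The last recorded load for this shard (nil if none)
+ # @param [String] load_s3_location The s3 prefix to upload files under
+ # @param [String] stream_name The name of the kinesis stream
+ # @param [Integer] num_slices The number of files to produce (one per redshift slice)
+ # @return [Hash] The shard id, number of records read, sequence number range, and s3 urls of uploaded files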
+ def read_shard_and_upload_to_s3(shard_description,last_shard_load,load_s3_location,stream_name,num_slices)
231
+
232
+ # Create local files to store data
233
+ files = []
234
+ (1..num_slices).each do |i|
235
+ file = Tempfile.new("#{shard_description[:shard_id]}")
236
+ files.push(file)
237
+ end
238
+
239
+ # Start the read from the last loaded sequence number
240
+ if last_shard_load != nil
241
+ starting_sequence_number = last_shard_load['ending_sequence_number']
242
+ else
243
+ starting_sequence_number = nil
244
+ end
245
+
246
+ # Get shard iterator for the sequence number
247
+ shard_iterator = @broker.get_shard_iterator_from_sequence_number(stream_name, shard_description, starting_sequence_number)
248
+
249
+ # Read records after shard_iterator into file
250
+ stream_read_result = @broker.stream_read_from_shard_iterator_into_files(shard_iterator, files)
251
+ loader_profile("#{shard_description[:shard_id]} fork - kinesis read complete. #{stream_read_result[:records]} events read.")
252
+
253
+ files.each do |file|
254
+ file.close
255
+ end
256
+
257
+ # if we read anything from kinesis, upload files to s3
258
+ if(stream_read_result[:records] > 0)
259
+
260
+ s3_urls = []
261
+
262
+ # Sequentially, compress each file and upload to s3
263
+ files.each do |file|
264
+
265
+ # Compress file (-f in case the file already is there for whatever reason)
266
+ system("gzip -f #{file.path}")
267
+ file_name="#{file.path}.gz"
268
+
269
+ # Upload file to s3
270
+ s3_url = upload_to_s3(file_name,load_s3_location)
271
+
272
+ # Check result
273
+ if !s3_url
274
+ raise RedTrack::LoaderException.new({:shard_id => shard_description[:shard_id], :s3_location => load_s3_location}), 'Upload to S3 failed'
275
+ end
276
+
277
+ # Delete local file
278
+ file.unlink
279
+
280
+ s3_urls << s3_url
281
+ loader_profile("#{shard_description[:shard_id]} fork - s3 upload complete #{s3_url}.")
282
+ end
283
+
284
+ result = {
285
+ :shard_id => shard_description[:shard_id],
286
+ :records => stream_read_result[:records],
287
+ :starting_sequence_number => stream_read_result[:starting_sequence_number],
288
+ :ending_sequence_number => stream_read_result[:ending_sequence_number],
289
+ :s3_urls => s3_urls
290
+ }
291
+
292
+ else
293
+ # If stream_read_result didn't read any events, simply return that
294
+ result = {
295
+ :shard_id => shard_description[:shard_id],
296
+ :records => stream_read_result[:records]
297
+ }
298
+ end
299
+
300
+ return result
301
+ end
302
+
303
+ # Calculate the number of slices for the redshift cluster
304
+ #
305
+ # @param [string] cluster_name The name of the cluster to get # of slices for
306
+ # @return [Integer] The number of slices in the cluster
307
+ def get_number_of_slices(cluster_name)
308
+ result = 0
309
+
310
+ describe_clusters_response = AWS.redshift.client.describe_clusters
311
+
312
+ describe_clusters_response[:clusters].each do |cluster|
313
+ if cluster[:cluster_identifier] == cluster_name
314
+ number_of_nodes = cluster[:number_of_nodes]
315
+
316
+ # Slices per node is equal to number of vCPUs
317
+ slices_per_node = 1
318
+ case cluster[:node_type]
319
+ when 'dw2.large','dw1.xlarge'
320
+ slices_per_node = 2
321
+ when 'dw1.8xlarge'
322
+ slices_per_node = 16
323
+ when 'dw2.8xlarge'
324
+ slices_per_node = 32
325
+ else
326
+ raise "Unrecognized node type: #{cluster[:node_type]}"
327
+ end
328
+
329
+ result = number_of_nodes * slices_per_node
330
+
331
+ puts "Result #{result}, number_of_nodes: #{number_of_nodes}, node_type: #{cluster[:node_type]}, slices_per_node: #{slices_per_node}"
332
+
333
+ break
334
+ end
335
+ end
336
+
337
+ # Raise outside the loop if no cluster matched the given name
+ if result == 0
338
+ raise "Did not find cluster with name #{cluster_name}"
339
+ end
340
+ return result
341
+ end
342
+
343
+ # Uploads file to s3
344
+ #
345
+ # @param [String] file_name The file to upload
346
+ # @param [String] load_s3_prefix The location to upload to in s3
347
+ # @return [String] The s3 url of the uploaded file
348
+ def upload_to_s3(file_name,load_s3_prefix)
349
+
350
+ # determine s3 key
351
+ s3_key = load_s3_prefix + File.basename(file_name)
352
+
353
+ # Upload file to s3
354
+ object = @s3_bucket.objects[s3_key]
355
+ s3_write_result = object.write(Pathname.new(file_name))
356
+
357
+ # Verify the file size in s3 matches the local file size - s3 is eventually consistent.
358
+ local_file_size = File.size(file_name)
359
+ s3_file_size=nil
360
+ attempt_count=3
361
+ success=false
362
+ while attempt_count > 0 && !success
363
+ s3_file_size = object.content_length
364
+ if (local_file_size == s3_file_size)
365
+ success=true
366
+ break
367
+ else
368
+ sleep 5
369
+ attempt_count-=1
370
+ end
371
+ end
372
+
373
+ # If not successful at verifying file size, log a warning
374
+ if !success
375
+ @logger.warn("File size mismatch. Local: #{local_file_size}. S3: #{s3_file_size}")
376
+ end
377
+
378
+ return "s3://#{@s3_bucket.name}/#{s3_key}"
379
+ end
380
+
381
+ # Cleans up entries in s3 for a particular date.
382
+ # Note: A simpler way to do this is to set a lifecycle policy for S3 objects
383
+ #
384
+ # @param [String] redshift_table The table for which we are cleaning up s3
385
+ # @param [Date] date The date for which to clean up the s3 load files
386
+ def cleanup_s3_loads(redshift_table,date)
387
+ @s3_bucket.objects.with_prefix(s3_prefix(redshift_table,date)).delete_all
388
+ end
389
+
390
+
391
+ # Get the last load kinesis -> redshift for a set of shards
392
+ #
393
+ # @param [String] redshift_table The name of the table to get last shard loads for
394
+ # @param [String] stream_name The name of the kinesis stream
395
+ # @param [Array] shards description of the shard from describe_stream
396
+ # @return [Hash] Information about the last load for each kinesis shard, keyed by shard id
397
+ def get_last_shard_loads(redshift_table,stream_name,shards)
398
+
399
+ last_loads = {}
400
+ shards.each do |shard|
401
+ last_loads[shard[:shard_id]] = get_last_shard_load(redshift_table,stream_name,shard)
402
+ end
403
+
404
+ @logger.info("Last Shard Loads: #{YAML::dump(last_loads)}")
405
+
406
+ return last_loads
407
+ end
408
+
409
+ # Get the last load kinesis -> redshift for any given shard
410
+ #
411
+ # @param [String] redshift_table The name of the table to get last shard load for
412
+ # @param [String] stream_name The name of the kinesis stream
413
+ # @param [Hash] shard_description Description of the shard from describe_stream
414
+ # @return [Hash] Information about the last load for this kinesis shard
415
+ def get_last_shard_load(redshift_table,stream_name,shard_description)
416
+
417
+ query = "SELECT * FROM kinesis_loads WHERE table_name='#{redshift_table}' AND stream_name='#{stream_name}' AND shard_id='#{shard_description[:shard_id]}' ORDER BY load_timestamp DESC LIMIT 1;"
418
+ result_set = exec(query)
419
+
420
+ if result_set.ntuples == 1
421
+ result = {}
422
+ result_set.each do |row|
423
+ row.each do |hash_key,hash_value|
424
+ result[hash_key] = hash_value
425
+ end
426
+ end
427
+ elsif result_set.ntuples == 0
428
+ result = nil
429
+ else
430
+ raise 'Invalid number of rows'
431
+ end
432
+ return result
433
+ end
434
+
435
+ # Checks to see if we've already loaded this sequence range into redshift; if not, performs the redshift load.
436
+ # Inspired by https://github.com/awslabs/amazon-kinesis-connectors/blob/master/src/main/java/com/amazonaws/services/kinesis/connectors/redshift/RedshiftManifestEmitter.java
437
+ #
438
+ # @param [String] manifest_s3_url The Url of the manifest file in s3
439
+ # @param [String] redshift_table The name of the redshift table to load
440
+ # @param [String] stream_name The name of the stream where the events are loaded from
441
+ # @param [Array] shards_to_load Shards we are loading in this load - all shards with > 0 events
442
+ # @param [Hash] last_shard_loads Set of last shard loads, keyed by shard id
443
+ def load_shards_manifest_into_redshift(manifest_s3_url,redshift_table,stream_name,shards_to_load,last_shard_loads)
444
+
445
+ begin
446
+
447
+ exec('BEGIN')
448
+
449
+ # Check that there hasn't been a load since the loader started running
450
+ shards_to_load.each do |shard|
451
+ last_shard_load = last_shard_loads[shard[:shard_id]]
452
+ new_last_shard_load = get_last_shard_load(redshift_table,stream_name,shard)
453
+ if new_last_shard_load != nil && new_last_shard_load['ending_sequence_number'] != last_shard_load['ending_sequence_number']
454
+ @logger.warn("A new Redtrack load has occurred for shard #{shard[:shard_id]} since starting the loader")
455
+ exec('ROLLBACK')
456
+ result = {
457
+ :success => false,
458
+ :load_error => 'A new Redtrack load has occurred for this shard since starting the loader',
459
+ :expected_last_shard_load => last_shard_load,
460
+ :new_last_shard_load => new_last_shard_load
461
+ }
462
+
463
+ return result
464
+ end
465
+ end
466
+
467
+ # Check that there aren't overlapping loaded sequence ranges.
468
+ # Since sequence numbers are 56 digits and Redshift numerics handle 38 digits max - store as strings and compare in ruby locally
469
+ # http://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html
470
+ # http://patshaughnessy.net/2014/1/9/how-big-is-a-bignum
471
+ shard_ids = []
472
+ shards_to_load.each do |shard_to_load|
473
+ shard_ids << "'#{shard_to_load[:shard_id]}'"
474
+ end
475
+
476
+ # TODO: this needs to be converted over!
477
+ query = 'SELECT * FROM kinesis_loads' +
478
+ " WHERE table_name='#{redshift_table}' AND stream_name='#{stream_name}' AND shard_id in (#{shard_ids.join(',')})" +
479
+ ' ORDER BY shard_id, load_timestamp DESC'
480
+
481
+ loads_result_set = exec(query)
482
+ loads_result_set.each do |row|
483
+ row_starting_sequence_number=row['starting_sequence_number'].to_i
484
+ row_ending_sequence_number=row['ending_sequence_number'].to_i
485
+
486
+ # Get the sequence number range from the shard that's going to be loaded
487
+ starting_sequence_number = nil
488
+ ending_sequence_number = nil
489
+ shards_to_load.each do |shard_to_load|
490
+ if shard_to_load[:shard_id] == row['shard_id']
491
+ starting_sequence_number=shard_to_load[:starting_sequence_number].to_i
492
+ ending_sequence_number=shard_to_load[:ending_sequence_number].to_i
493
+ break
494
+ end
495
+ end
496
+
497
+ # Ranges are loaded with the previous ending_sequence_number equaling the next load's starting_sequence_number
498
+ if ( (row_starting_sequence_number < starting_sequence_number && starting_sequence_number < row_ending_sequence_number) ||
499
+ (row_starting_sequence_number < ending_sequence_number && ending_sequence_number < row_ending_sequence_number) ||
500
+ (starting_sequence_number <= row_starting_sequence_number && row_ending_sequence_number <= ending_sequence_number) )
501
+
502
+ @logger.warn("#{TAG} Overlapping load of #{redshift_table} at #{row['load_timestamp']}: Kinesis stream=#{row['stream_name']}, " +
503
+ "shard=#{row['shard_id']}. Sequence from #{row['starting_sequence_number']} to #{row['ending_sequence_number']}")
504
+
505
+ # Abort the transaction
506
+ exec('ROLLBACK')
507
+ result = {
508
+ :success => false,
509
+ :load_error => 'Duplicated kinesis range',
510
+ :duplicated_load => row
511
+ }
512
+
513
+ return result
514
+ end
515
+ end
516
+
517
+ # Insert entry for load
518
+ insert_query = 'INSERT INTO kinesis_loads VALUES '
519
+ insert_values = []
520
+ shards_to_load.each do |shard_to_load|
521
+ insert_values << "('#{stream_name}','#{shard_to_load[:shard_id]}','#{redshift_table}','#{shard_to_load[:starting_sequence_number]}','#{shard_to_load[:ending_sequence_number]}',getdate())"
522
+ end
523
+ insert_query += insert_values.join(",") + ';'
524
+ exec(insert_query)
525
+
526
+ # Load manifest into redshift & commit transaction if successful
527
+ load_file_result=load_file_into_redshift(redshift_table,manifest_s3_url)
528
+ if load_file_result[:success] == true
529
+ exec('COMMIT')
530
+ else
531
+ @logger.warn("Load file returned a failure: #{load_file_result[:load_error]}")
532
+ exec('ROLLBACK')
533
+ end
534
+
535
+ result = load_file_result
536
+
537
+ rescue Exception => e
538
+
539
+ # Catch exceptions & Abort transaction
540
+ @logger.warn("#{TAG} Exception caught: #{e.class}: #{e.message}\n\t#{e.backtrace.join("\n\t")}")
541
+ exec('ROLLBACK')
542
+
543
+ result = {
544
+ :success => false,
545
+ :exception => e
546
+ }
547
+ end
548
+
549
+ return result
550
+ end
551
+
552
+ # Load a file into redshift
553
+ #
554
+ # @param [String] redshift_table The table to load the data into
555
+ # @param [String] s3_url The s3 file to load into redshift
556
+ # @param [Boolean] manifest Whether this is a COPY of a manifest file
557
+ def load_file_into_redshift(redshift_table,s3_url,manifest=true)
558
+
559
+ ## Run the copy command to load data from s3 to Redshift. This is cleaner than doing the ssh method
560
+ cmd="COPY #{redshift_table} from '#{s3_url}' with " +
561
+ "credentials 'aws_access_key_id=#{@options[:access_key_id]};aws_secret_access_key=#{@options[:secret_access_key]}' " +
562
+ "json 'auto' timeformat 'auto' GZIP MAXERROR #{@options[:max_error]}"
563
+ if manifest
564
+ cmd += ' manifest'
565
+ end
566
+ cmd += ';'
567
+ records=nil
568
+
569
+ # Set receiver to check how many rows are loaded (via the INFO)
570
+ @redshift_conn.set_notice_receiver {|result|
571
+ matches=/.*,.(\d+).record.*/.match(result.error_message)
572
+ records = matches[1].to_i if matches
573
+ }
574
+
575
+ begin
576
+ exec(cmd)
577
+ result = {
578
+ :success => true,
579
+ :records => records
580
+ }
581
+ rescue Exception => e
582
+ # Catch a copy command exception & Get information about the error
583
+ @logger.warn("#{TAG} Exception caught: #{e.class}: #{e.message}\n\t#{e.backtrace.join("\n\t")}")
584
+ load_error = get_last_load_errors(redshift_table,s3_url)
585
+ result = {
586
+ :success => false,
587
+ :load_error => load_error
588
+ }
589
+ end
590
+
591
+ return result
592
+ end
593
+
594
+ # Get the last load errors for a specific redshift table
595
+ #
596
+ # @param [String] redshift_table The name of the redshift table
597
+ # @param [String] s3_url The s3 url that was attempted to be loaded into redshift
598
+ def get_last_load_errors(redshift_table,s3_url)
599
+
600
+ # Query to get recent load errors matching table and s3 url
601
+ cmd = 'select sl.query, tbl, trim(name) as table_name, starttime, filename, line_number, raw_line,' +
602
+ 'colname, raw_field_value, err_code, trim(err_reason) as reason ' +
603
+ 'from stl_load_errors sl, stv_tbl_perm sp ' +
604
+ "where sl.tbl = sp.id AND sp.name='#{redshift_table}' AND sl.filename='#{s3_url}' " +
605
+ 'ORDER BY starttime DESC LIMIT 20;'
606
+ result_set=exec(cmd)
607
+
608
+ # Collect the results, assume the first matching query id in stl_load_errors is the one that failed.
609
+ result = []
610
+ query=nil
611
+ result_set.each do |row|
612
+ if query == nil
613
+ query=row['query']
614
+ end
615
+
616
+ if query != row['query']
617
+ break
618
+ end
619
+ result.push(row)
620
+ end
621
+ return result
622
+ end
623
+
624
+ private
625
+
626
+ # Determine the s3 prefix for loading into s3 - files are organized by date
627
+ #
628
+ # @param [String] redshift_table The table in redshift
629
+ # @param [Date] date The date when the data is loaded
630
+ # @param [String] load_identifier Identifier for the load
631
+ # @return [String] The relative s3 location in a bucket
632
+ def s3_prefix(redshift_table,date,load_identifier=nil)
633
+ dateyyyymmdd=date.strftime('%Y%m%d')
634
+ result = "redtrack/#{@options[:redshift_cluster_name]}/#{@options[:redshift_dbname]}/#{redshift_table}/#{dateyyyymmdd}/"
635
+ if load_identifier != nil
636
+ result += load_identifier + "/"
637
+ end
638
+ return result
639
+ end
640
+
641
+ # Run a command against the redshift cluster (todo, should this be done differently?)
642
+ #
643
+ # @param [String] cmd The sql command to run
644
+ def exec(cmd)
645
+ @logger.debug(cmd)
646
+ @redshift_conn.exec(cmd)
647
+ end
648
+
649
+ end
650
+ end
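For context, a hedged sketch of how the loader above might be driven end to end (illustrative only, not part of the published package; all option values are placeholders, the kinesis_loads bookkeeping table is assumed to exist in Redshift, and the gem may also expose a higher-level RedTrack::Client wrapper that is not shown in this diff):

    require 'aws-sdk'
    require 'pg'
    require 'redtrack'

    AWS.config(:access_key_id => 'YOUR_KEY', :secret_access_key => 'YOUR_SECRET', :region => 'us-east-1')

    options = {
      :access_key_id => 'YOUR_KEY', :secret_access_key => 'YOUR_SECRET', :region => 'us-east-1',
      :redshift_cluster_name => 'analytics', :redshift_dbname => 'events',
      :redshift_host => 'analytics.example.us-east-1.redshift.amazonaws.com',
      :redshift_port => 5439, :redshift_user => 'loader', :redshift_password => 'CHANGEME',
      :s3_bucket => 'redtrack-staging'
    }

    broker = RedTrack::KinesisClient.new(options)
    conn = PG.connect(:host => options[:redshift_host], :port => options[:redshift_port],
                      :dbname => options[:redshift_dbname], :user => options[:redshift_user],
                      :password => options[:redshift_password])

    loader = RedTrack::Loader.new(options, broker, conn)
    result = loader.load_redshift_from_broker('page_views')
    puts "Loaded #{result[:records]} records" if result[:success]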