redtrack 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +173 -0
- data/Rakefile +2 -0
- data/lib/redtrack.rb +16 -0
- data/lib/redtrack_client.rb +286 -0
- data/lib/redtrack_datatypes.rb +175 -0
- data/lib/redtrack_kinesisclient.rb +238 -0
- data/lib/redtrack_loader.rb +650 -0
- data/lib/redtrack_local_file_stream.rb +126 -0
- data/redtrack.gemspec +17 -0
- metadata +99 -0

data/lib/redtrack_datatypes.rb
@@ -0,0 +1,175 @@
# Datatypes provides a run time bound implementation for validating passed data types
#
# Copyright (c) 2014 RedHotLabs, Inc.
# Licensed under the MIT License

module RedTrack
  class DataTypes

    @logger = nil

    # Constructor - non-static... Want runtime bound interface
    def initialize(options)
      if options && options[:logger] != nil
        @logger = options[:logger]
      else
        @logger = Logger.new(STDOUT)
      end
    end

    # @return [Array] Return an array of valid data types
    def valid_data_types
      result = %w(smallint integer bigint decimal real boolean char varchar date timestamp)
      return result
    end

    # Check and clean value to ensure it conforms to the redshift data type
    #
    # @param [Object] value the value to set for the column
    # @param [String] type_definition the type defined by the schema
    # @param [String] column_name The name of the redshift column
    # @return [Object] The value if it is valid
    def check_smallint(value,type_definition=nil,column_name=nil)
      if value.is_a?(Integer) == false
        raise_exception(column_name,value,type_definition)
      end
      # TODO: Range / overflow check
      return value
    end

    # Check and clean value to ensure it conforms to the redshift data type
    #
    # @param [Object] value the value to set for the column
    # @param [String] type_definition the type defined by the schema
    # @param [String] column_name The name of the redshift column
    # @return [Object] The value if it is valid
    def check_integer(value,type_definition=nil,column_name=nil)
      if value.is_a?(Integer) == false
        raise_exception(column_name,value,type_definition)
      end
      # TODO: range / overflow check
      return value
    end

    # Check and clean value to ensure it conforms to the redshift data type
    #
    # @param [Object] value the value to set for the column
    # @param [String] type_definition the type defined by the schema
    # @param [String] column_name The name of the redshift column
    # @return [Object] The value if it is valid
    def check_bigint(value,type_definition=nil,column_name=nil)
      if value.is_a?(Integer) == false
        raise_exception(column_name,value,type_definition)
      end
      # TODO: range / overflow check
      return value
    end

    # Check and clean value to ensure it conforms to the redshift data type
    #
    # @param [Object] value the value to set for the column
    # @param [String] type_definition the type defined by the schema
    # @param [String] column_name The name of the redshift column
    # @return [Object] The value if it is valid
    def check_decimal(value,type_definition=nil,column_name=nil)
      if value.is_a?(String) == false || is_numeric(value) == false
        raise_exception(column_name,value,type_definition)
        #raise ""
      end

      return value
    end

    # Check and clean value to ensure it conforms to the redshift data type
    #
    # @param [Object] value the value to set for the column
    # @param [String] type_definition the type defined by the schema
    # @param [String] column_name The name of the redshift column
    # @return [Object] The value if it is valid
    def check_real(value,type_definition=nil,column_name=nil)
      if is_numeric(value) == false
        raise_exception(column_name,value,type_definition)
      end

      return value
    end

    # Check and clean value to ensure it conforms to the redshift data type
    #
    # @param [Object] value the value to set for the column
    # @param [String] type_definition the type defined by the schema
    # @param [String] column_name The name of the redshift column
    # @return [Object] The value if it is valid - truncated if it is too long
    def check_char(value,type_definition=nil,column_name=nil)
      if value.is_a?(String) == false
        raise_exception(column_name,value,type_definition)
      end
      # Truncate values that are too long
      value = truncate_string(column_name,value,type_definition)
      return value
    end

    # Check and clean value to ensure it conforms to the redshift data type
    #
    # @param [Object] value the value to set for the column
    # @param [String] type_definition the type defined by the schema
    # @param [String] column_name The name of the redshift column
    # @return [Object] The value if it is valid - truncated if too long
    def check_varchar(value,type_definition=nil,column_name=nil)
      if value.is_a?(String) == false
        raise_exception(column_name,value,type_definition)
      end
      # Truncate values that are too long
      value = truncate_string(column_name,value,type_definition)
      return value
    end

    def check_date(column_name,value,type_definition)

    end

    # Check and clean value to ensure it conforms to the redshift data type
    #
    # @param [Object] value the value to set for the column
    # @param [String] type_definition the type defined by the schema
    # @param [String] column_name The name of the redshift column
    # @return [Object] The value if it is valid
    def check_timestamp(value,type_definition=nil,column_name=nil)
      if value.is_a?(String) == false || value[/\A\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\z/] == nil
        raise_exception(column_name,value,type_definition)
      end
      return value
    end

    private

    # Helper function, raise a general exception message
    #
    # @param [String] column_name The name of the redshift column
    # @param [Object] value the value to set for the column
    # @param [String] type_definition the type defined by the schema
    def raise_exception(column_name,value,type_definition)
      raise "Value for column #{column_name}, #{value.to_s}, does not conform to type '#{type_definition}'"
    end

    # Determine whether the passed value is a legit number (e.g. a numeric string)
    #
    # @param [Numeric] value The value to check as valid numeric
    # @return [Boolean] Whether or not the value is a numeric
    def is_numeric(value)
      Float(value) != nil rescue false
    end

    def truncate_string(column_name,value,type_definition)
      num_chars = type_definition[/\((\d*)\)/,1].to_i
      puts "Num chars: #{num_chars}"
      if(value.length > num_chars)
        @logger.warn("#{TAG} Data for column #{column_name} is too long (#{value.length} characters) for column type and will be truncated to #{num_chars} characters: '#{value}'")
        return value[0..num_chars-1]
      else
        return value
      end
    end

  end
end
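
For orientation, here is a minimal usage sketch of the RedTrack::DataTypes checker added above. It assumes require 'redtrack' loads the gem entry point (data/lib/redtrack.rb, whose body is not shown in this section); the column names and type definitions are illustrative only.

    require 'logger'
    require 'redtrack'   # assumed gem entry point

    types = RedTrack::DataTypes.new(:logger => Logger.new(STDOUT))

    # Conforming values are returned unchanged
    types.check_integer(42, 'integer', 'user_id')                            # => 42
    types.check_varchar('hello', 'varchar(256)', 'message')                  # => "hello"
    types.check_timestamp('2014-07-01 12:00:00', 'timestamp', 'created_at')  # => "2014-07-01 12:00:00"

    # Non-conforming values raise the RuntimeError built by raise_exception
    begin
      types.check_smallint('not a number', 'smallint', 'age')
    rescue RuntimeError => e
      puts e.message
    end
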
data/lib/redtrack_kinesisclient.rb
@@ -0,0 +1,238 @@
# The KinesisClient provides an application interface to aws kinesis as a data broker
#
# Copyright (c) 2014 RedHotLabs, Inc.
# Licensed under the MIT License

module RedTrack
  class KinesisClient

    @verbose = false

    TAG='RedTrack::KinesisClient'

    DEFAULT_MAX_RECORDS=1000000
    DEFAULT_MAX_REQUESTS=100

    # Setup instance variables for kinesis access
    #
    # @param [Hash] options Expects :redshift_cluster_name, :redshift_dbname. Optionally :verbose
    # @return [Boolean] Success
    def initialize(options)
      @verbose = options[:verbose] || false
      @logger = options[:logger]
      if @logger == nil
        @logger = Logger.new(STDOUT)
      end
      @options=options
    end

    # Name of the stream in the data broker (This is a Kinesis stream name)
    #
    # @param [String] redshift_table Name of the redshift table
    # @return [String] Name of the stream in Kinesis
    def stream_name(redshift_table)
      if @options[:redshift_cluster_name] == nil || @options[:redshift_dbname] == nil
        raise 'Need to specify :redshift_cluster_name and :redshift_dbname as options'
      end
      result= @options[:redshift_cluster_name] + '.' + @options[:redshift_dbname] + ".#{redshift_table}"
      return result
    end
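
    # Editor's note, not part of the package source: with the illustrative options
    # :redshift_cluster_name => 'analytics' and :redshift_dbname => 'events_db',
    # stream_name('pageviews') returns 'analytics.events_db.pageviews', so each
    # Redshift table maps to its own Kinesis stream.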

    # Get hash describing the shard from describe_stream
    #
    # @param [String] stream_name The name of the kinesis stream
    # @return [Hash] Information regarding the stream shards
    def get_shard_descriptions(stream_name)
      describe_response = AWS.kinesis.client.describe_stream({:stream_name => stream_name})

      result = nil
      if describe_response != nil && describe_response[:stream_description] != nil
        result = describe_response[:stream_description][:shards]
      end
      return result
    end

    # Get hash describing the shard from describe_stream
    #
    # @param [String] stream_name The name of the kinesis stream
    # @param [Integer] stream_shard_index The index of the shard in the array of shards
    # @return [Hash] Information regarding the stream shard, from AWS kinesis
    def get_shard_description(stream_name,stream_shard_index)
      describe_response = AWS.kinesis.client.describe_stream({:stream_name => stream_name})

      if describe_response != nil && describe_response[:stream_description] != nil
        result = describe_response[:stream_description][:shards][stream_shard_index]
        result[:success] = true
        result[:stream_description] = describe_response[:stream_description]
      else
        result = {
          success: false,
          describe_response: describe_response
        }
      end
      return result
    end

    # Create a kinesis stream for the redshift table
    #
    # @param [String] table The name of the table
    # @param [integer] shard_count The number of shards in the stream
    def create_kinesis_stream_for_table(table,shard_count=1)
      options = {
        :stream_name => stream_name(table),
        :shard_count => shard_count
      }
      result = AWS.kinesis.client.create_stream(options)
      return result
    end

    # Get the shard iterator given a checkpointed sequence number. If no checkpoint, start to read from start of shard
    #
    # @param [String] stream_name The name of the stream to get a shard iterator for
    # @param [Hash] shard_description Result from describe stream request
    # @param [String] starting_sequence_number The sequence number to get a shard iterator for, if doesn't exist, get one for start of shard
    # @return [String] The shard iterator
    def get_shard_iterator_from_sequence_number(stream_name,shard_description,starting_sequence_number=nil)

      ## Get shard iterator
      get_shard_iterator_options = {
        :stream_name => stream_name,
        :shard_id => shard_description[:shard_id]
      }

      ## Options based on starting sequence number
      if starting_sequence_number != nil
        get_shard_iterator_options[:shard_iterator_type] = 'AFTER_SEQUENCE_NUMBER'
        get_shard_iterator_options[:starting_sequence_number] = starting_sequence_number
      else
        @logger.warn("Shard '#{shard_description[:shard_id]}' has no starting sequence number, use TRIM_HORIZON shard iterator")
        get_shard_iterator_options[:shard_iterator_type] = 'TRIM_HORIZON'
      end

      get_shard_iterator_response = AWS.kinesis.client.get_shard_iterator(get_shard_iterator_options)
      shard_iterator = get_shard_iterator_response[:shard_iterator]
      return shard_iterator
    end

    # Read from kinesis shard into a file
    #
    # @param [String] shard_iterator The shard iterator to start reading from - result of get_shard_iterator
    # @param [String] files Array of files to write data into
    # @param [Hash] options Optional. Can specify :max_records, :max_requests
    # @return [Hash] Hash of # of records read and the sequence number of the last read record, number of records, and shard iterator
    def stream_read_from_shard_iterator_into_files(shard_iterator, files, options={})

      max_records = options[:max_records] || DEFAULT_MAX_RECORDS
      max_requests = options[:max_requests] || DEFAULT_MAX_REQUESTS

      start_sequence_number=nil
      end_sequence_number=nil
      records = 0
      num_files = files.length

      for i in 0..max_requests

        # Execute get_records against AWS Kinesis
        get_records_response = AWS.kinesis.client.get_records({:shard_iterator => shard_iterator})

        # Process records
        if get_records_response != nil && get_records_response.data != nil && get_records_response.data[:records] != nil && get_records_response.data[:records].count > 0
          get_records_response.data[:records].each do |record|

            data_payload = JSON.parse(record[:data])
            data = data_payload['data']

            # rotate which file we write into
            files[records % num_files].puts data + "\n"

            # Sequence numbers
            if (start_sequence_number == nil)
              start_sequence_number = record[:sequence_number].to_i
            end
            if (end_sequence_number == nil || record[:sequence_number].to_i > end_sequence_number)
              end_sequence_number = record[:sequence_number].to_i
            else
              @logger.warn("#{TAG} Out of order sequence number: #{end_sequence_number.to_s}")
            end

            # Increment records read; check exit condition
            records+=1
            if (records >= max_records)
              break
            end
          end
        end

        # set shard iterator for next request from payload
        shard_iterator=get_records_response.data[:next_shard_iterator]

        # Check exit conditions
        if(shard_iterator == nil || records >= max_records)
          break
        end
      end

      result = {
        starting_sequence_number: start_sequence_number.to_s,
        ending_sequence_number: end_sequence_number.to_s,
        next_shard_iterator: shard_iterator,
        records: records
      }
      return result
    end

    # Write data to a stream. This expects the data to be a serialized string
    #
    # @param [String] stream_name The name of the stream
    # @param [String] data_string String of data to write
    # @param [String] partition_key How to keep the data partitioned in kinesis. See http://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html#Kinesis-PutRecord-request-PartitionKey
    # @return [Boolean] True - the write to the stream succeeded
    def stream_write(stream_name,data_string,partition_key=nil)
      result=false

      partition_key = partition_key || rand(100).to_s

      put_data = {
        :data => data_string
      }

      put_options = {
        :stream_name => stream_name,
        :partition_key => partition_key,
        :data => put_data.to_json
      }

      @logger.debug("#{TAG} write to #{stream_name} stream with data #{data_string}")

      # Write to kinesis; 3 attempts
      attempt_count=3
      last_exception=nil
      while attempt_count > 0 && !result
        begin
          put_record_result = AWS.kinesis.client.put_record(put_options)
          puts put_record_result.to_json
          @logger.warn("put record result #{put_record_result.to_json}")
          if put_record_result.http_response.status < 299
            result = true
          else
            @logger.warn("#{TAG} put_record response: HTTP #{put_record_result.http_response.status}: #{put_record_result.http_response.body}")
          end
        rescue Exception => e

          # log exception and retry with 1 second backoff
          @logger.warn("#{TAG} put_record Exception caught #{e.class}: #{e.message}\n\t#{e.backtrace.join("\n\t")}")
          attempt_count-=1
          last_exception=e
        end
      end

      # If failure after 3 retries, raise the last exception
      if !result
        raise last_exception
      end

      return result
    end

  end
end
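
A producer-side sketch of the KinesisClient added above. It leans on the same aws-sdk v1 style AWS.kinesis interface the class itself calls and assumes AWS credentials are already configured; the cluster, database, and table names are illustrative placeholders.

    require 'logger'
    require 'json'
    require 'redtrack'   # assumed gem entry point

    broker = RedTrack::KinesisClient.new(
      :redshift_cluster_name => 'analytics',   # illustrative values
      :redshift_dbname       => 'events_db',
      :logger                => Logger.new(STDOUT)
    )

    stream = broker.stream_name('pageviews')   # => "analytics.events_db.pageviews"

    # stream_write wraps the payload in a {"data": "..."} JSON envelope before put_record;
    # stream_read_from_shard_iterator_into_files parses that envelope back out on the read side.
    event = { 'user_id' => 42, 'path' => '/home' }.to_json
    broker.stream_write(stream, event)
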
data/lib/redtrack_loader.rb
@@ -0,0 +1,650 @@
# Redshift s3 loader. Can copy events into s3. Can also copy events into Redshift. Copies events into Redshift so
# as to avoid duplication.
#
# Copyright (c) 2014 RedHotLabs, Inc.
# Licensed under The MIT License

require 'tempfile'

module RedTrack


  class LoaderException < Exception

    attr_reader :information
    def initialize(information)
      @information = information
    end

  end

  class Loader

    TAG='RedTrack::Loader'

    # S3 parameters
    @broker=nil
    @s3_bucket = nil
    @redshift_conn = nil
    @client = nil

    @options = nil

    @max_error = nil

    @load_start_time=nil

    DEFAULT_MAX_ERROR=2 # Set max error > 0 in case of "cosmic ray" events

    # Setup class variables for redshift & s3 access
    #
    # @param [Hash] options expects access_key_id, secret_access_key, region, redshift_host, redshift_port, redshift_dbname, redshift_user, redshift_password, s3_bucket
    # @param [RedTrack::KinesisClient] broker The broker client, created by the RedTrack::Client object
    # @param [PG::Connection] redshift_conn The redshift connection used for loading data
    # @return [Boolean] Success
    def initialize(options,broker=nil,redshift_conn=nil)

      # Broker
      if broker
        @broker = broker
      else
        raise 'Needs to pass broker client to the loader'
      end

      # Check for redshift connection; otherwise create one
      if redshift_conn
        @redshift_conn = redshift_conn
      else
        raise 'Need to pass redshift connection to the loader'
      end

      options[:max_error] ||= DEFAULT_MAX_ERROR

      @logger = options[:logger]
      if @logger == nil
        @logger = Logger.new(STDOUT)
      end

      @options = options

      # Create S3 connection for bucket
      @s3_bucket = AWS::S3.new.buckets[options[:s3_bucket]]

    end

    # Write a profiling message to the logger
    #
    # @param [String] message The message to write to the logger
    def loader_profile(message)
      elapsed_time=(Time.now-@load_start_time).round(2)
      @logger.info("#{TAG} (#{elapsed_time}s elapsed) #{message}")
    end

    # High level function - read data from broker, upload data to s3, perform COPY command to load data into Redshift
    #
    # @param [String] redshift_table The name of the table in redshift to load
    def load_redshift_from_broker(redshift_table)

      # Start time - use this for profiling messages
      @load_start_time = Time.now

      # Get metadata about the kinesis stream and its shards
      stream_name=@broker.stream_name(redshift_table)
      shards = @broker.get_shard_descriptions(stream_name)
      if shards == nil
        information = {
          :redshift_table => redshift_table,
          :stream_name => stream_name
        }
        raise RedTrack::LoaderException.new(information), 'Could not get shard description'
      end
      loader_profile('Get stream metadata complete')

      # Get metadata about the redshift cluster, specifically the number of slices
      num_slices = get_number_of_slices(@options[:redshift_cluster_name])
      loader_profile('Get redshift metadata complete')

      # Get last loads for each shard - do this pre-fork in order to avoid re-establishing Redshift connections post-fork
      last_shard_loads = get_last_shard_loads(redshift_table,stream_name,shards)
      loader_profile('Get last shard loads complete')

      # Determine where to upload files to s3 and number of files each shard should produce
      load_s3_location = s3_prefix(redshift_table, Time.new.utc.to_date, "load-#{@load_start_time.to_i}")

      # For each shard, fork a process for stream read & s3 upload; create a pipe to communicate result back
      pids = {}
      result_readers = {}
      shards.each do |shard|
        result_reader,result_writer = IO.pipe
        pid = fork do
          loader_profile("#{shard[:shard_id]} fork - start")
          last_shard_load = last_shard_loads[shard[:shard_id]]
          begin
            result = read_shard_and_upload_to_s3(shard,last_shard_load,load_s3_location,stream_name,num_slices)
            result_writer.puts result.to_json
          rescue Exception => e
            @logger.warn("#{TAG} #{shard[:shard_id]} fork - Exception caught: #{e.class}: #{e.message}\n\t#{e.backtrace.join("\n\t")}")
            result = {
              :shard_id => shard[:shard_id],
              :exception => e
            }
            result_writer.puts result.to_json
          end
          loader_profile("#{shard[:shard_id]} fork - read shard & upload done")
        end
        pids[shard[:shard_id]] = pid
        result_readers[shard[:shard_id]] = result_reader
      end

      # Wait for the forked processes to finish and read the corresponding result pipe
      fork_results = []
      shards.each do |shard|
        Thread.new { Process.waitpid(pids[shard[:shard_id]]) }.join
        result_from_fork = result_readers[shard[:shard_id]].gets
        if result_from_fork != nil && result_from_fork != 'nil'
          fork_results << JSON.parse(result_from_fork, {symbolize_names: true})
        else
          fork_results << nil
        end
      end
      loader_profile('All shard read & upload forks complete')

      @logger.info("Fork Results: #{YAML::dump(fork_results)}")

      # Build manifest and check results for shards to load into redshift
      shards_to_load = []
      manifest = {
        :entries => []
      }
      fork_results.each do |fork_result|
        if fork_result[:exception] != nil
          raise "Exception in #{fork_result[:shard_id]} fork: #{fork_result[:exception]}"
        end
        if fork_result[:records] > 0
          fork_result[:s3_urls].each do |s3_url|
            entry = {
              url: s3_url,
              mandatory: true
            }
            manifest[:entries].push(entry)
          end
          shards_to_load << fork_result
        end
      end

      # Check for exit condition - no shards have anything to load
      if shards_to_load.length == 0
        @logger.warn("#{TAG} No events read from any shards. Exiting.")
        result = {
          :success => true,
          :records => 0,
          :information => {
            :redshift_table => redshift_table,
            :shards => shards,
            :last_shard_loads => last_shard_loads,
            :fork_results => fork_results,
          }
        }
        return result
      end

      # upload manifest to s3
      manifest_s3_object = @s3_bucket.objects[load_s3_location + "manifest.json"]
      manifest_s3_object.write(manifest.to_json)
      manifest_s3_url = "s3://#{manifest_s3_object.bucket.name}/#{manifest_s3_object.key}"
      loader_profile("manifest s3 upload complete #{manifest_s3_url}.")

      # reconnect to redshift
      @redshift_conn = PG.connect(
        :host => @options[:redshift_host],
        :port => @options[:redshift_port],
        :dbname => @options[:redshift_dbname],
        :user => @options[:redshift_user],
        :password => @options[:redshift_password])

      # Load the files into redshift via manifest
      load_result = load_shards_manifest_into_redshift(manifest_s3_url,redshift_table,stream_name,shards_to_load,last_shard_loads)
      loader_profile("Load kinesis shard into Redshift complete (#{load_result[:records]} events)")

      information = {
        :redshift_table => redshift_table,
        :shards => shards,
        :last_shard_loads => last_shard_loads,
        :fork_results => fork_results,
        :shards_to_load => shards_to_load,
        :manifest => manifest,
        :load_result => load_result
      }

      if(load_result[:success] == false)
        raise RedTrack::LoaderException.new(information), 'COPY into redshift failed'
      end

      return {
        :success => true,
        :records => load_result[:records],
        :addtl_information => information
      }
    end

    def read_shard_and_upload_to_s3(shard_description,last_shard_load,load_s3_location,stream_name,num_slices)

      # Create local files to store data
      files = []
      (1..num_slices).each do |i|
        file = Tempfile.new("#{shard_description[:shard_id]}")
        files.push(file)
      end

      # Start the read from the last loaded sequence number
      if last_shard_load != nil
        starting_sequence_number = last_shard_load['ending_sequence_number']
      else
        starting_sequence_number = nil
      end

      # Get shard iterator for the sequence number
      shard_iterator = @broker.get_shard_iterator_from_sequence_number(stream_name, shard_description, starting_sequence_number)

      # Read records after shard_iterator into file
      stream_read_result = @broker.stream_read_from_shard_iterator_into_files(shard_iterator, files)
      loader_profile("#{shard_description[:shard_id]} fork - kinesis read complete. #{stream_read_result[:records]} events read.")

      files.each do |file|
        file.close
      end

      # if we read anything from kinesis, upload files to s3
      if(stream_read_result[:records] > 0)

        s3_urls = []

        # Sequentially, compress each file and upload to s3
        files.each do |file|

          # Compress file (-f in case the file already is there for whatever reason)
          system("gzip -f #{file.path}")
          file_name="#{file.path}.gz"

          # Upload file to s3
          s3_url = upload_to_s3(file_name,load_s3_location)

          # Check result,
          if !s3_url
            raise RedTrack::LoaderException.new(information), 'Upload to S3 failed'
          end

          # Delete local file
          file.unlink

          s3_urls << s3_url
          loader_profile("#{shard_description[:shard_id]} fork - s3 upload complete #{s3_url}.")
        end

        result = {
          :shard_id => shard_description[:shard_id],
          :records => stream_read_result[:records],
          :starting_sequence_number => stream_read_result[:starting_sequence_number],
          :ending_sequence_number => stream_read_result[:ending_sequence_number],
          :s3_urls => s3_urls
        }

      else
        # If stream_read_result didn't read any events, return simply that
        result = {
          :shard_id => shard_description[:shard_id],
          :records => stream_read_result[:records]
        }
      end

      return result
    end

    # Calculate the number of slices for the redshift cluster
    #
    # @param [string] cluster_name The name of the cluster to get # of slices for
    # @return [Integer] The number of slices in the cluster
    def get_number_of_slices(cluster_name)
      result = 0

      describe_clusters_response = AWS.redshift.client.describe_clusters

      describe_clusters_response[:clusters].each do |cluster|
        if cluster[:cluster_identifier] == cluster_name
          number_of_nodes = cluster[:number_of_nodes]

          # Slices per node is equal to number of vCPUs
          slices_per_node = 1
          case cluster[:node_type]
          when 'dw2.large','dw1.xlarge'
            slices_per_node = 2
          when 'dw1.8xlarge'
            slices_per_node = 16
          when 'dw2.8xlarge'
            slices_per_node = 32
          else
            raise "Unrecognized node type: #{cluster[:node_type]}"
          end

          result = number_of_nodes * slices_per_node

          puts "Result #{result}, number_of_nodes: #{number_of_nodes}, node_type: #{cluster[:node_type]}, slices_per_node: #{slices_per_node}"

          break
        end

        if result == 0
          raise "Did not find cluster with name #{cluster_name}"
        end
      end
      return result
    end
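
    # Editor's note, not part of the package source: get_number_of_slices drives how many
    # temp files each shard fork writes (read_shard_and_upload_to_s3 creates one Tempfile
    # per slice), so the later COPY can spread its work across slices. For example, a
    # hypothetical 4-node dw2.large cluster yields 4 * 2 = 8 slices and therefore 8 files
    # per shard.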

    # Uploads file to s3
    #
    # @param [String] file_name The file to upload
    # @param [String] load_s3_prefix The location to upload to in s3
    # @return [Hash] Information about the upload, including :success which is whether the file was uploaded/right size in S3
    def upload_to_s3(file_name,load_s3_prefix)

      # determine s3 key
      s3_key = load_s3_prefix + File.basename(file_name)

      # Upload file to s3
      object = @s3_bucket.objects[s3_key]
      s3_write_result = object.write(Pathname.new(file_name))

      # Verify the file size in s3 matches local file size - s3 is eventually consistent.
      local_file_size = File.size(file_name)
      s3_file_size=nil
      attempt_count=3
      success=false
      while attempt_count > 0 && !success
        s3_file_size = object.content_length
        if (local_file_size == s3_file_size)
          success=true
          break
        else
          sleep 5
          attempt_count-=1
        end
      end

      # If not successful at verifying file size, raise exception
      if !success
        @logger.warn("File size mismatch. Local: #{local_file_size}. S3: #{s3_file_size}")
      end

      return "s3://#{@s3_bucket.name}/#{s3_key}"
    end

    # Cleans up entries in s3 for a particular date.
    # Note: A simpler way to do this is to set a lifecycle policy for S3 objects
    #
    # @param [String] redshift_table The table for which we are cleaning up s3
    # @param [Date] date The date for which to clean up the s3 loads
    def cleanup_s3_loads(redshift_table,date)
      @bucket.objects.with_prefix(s3_prefix(redshift_table,date)).delete_all
    end


    # Get the last load kinesis -> redshift for a set of shards
    #
    # @param [String] redshift_table The name of the table to get last shard load for
    # @param [String] stream_name The name of the kinesis stream
    # @param [Array] shards description of the shard from describe_stream
    # @return [Array] Information about the last load for this kinesis shard
    def get_last_shard_loads(redshift_table,stream_name,shards)

      last_loads = {}
      shards.each do |shard|
        last_loads[shard[:shard_id]] = get_last_shard_load(redshift_table,stream_name,shard)
      end

      @logger.info("Last Shard Loads: #{YAML::dump(last_loads)}")

      return last_loads
    end

    # Get the last load kinesis -> redshift for any given shard
    #
    # @param [String] redshift_table The name of the table to get last shard load for
    # @param [String] stream_name The name of the kinesis stream
    # @param [Hash] shard_description Description of the shard from describe_stream
    # @return [Hash] Information about the last load for this kinesis shard
    def get_last_shard_load(redshift_table,stream_name,shard_description)

      query = "SELECT * FROM kinesis_loads WHERE table_name='#{redshift_table}' AND stream_name='#{stream_name}' AND shard_id='#{shard_description[:shard_id]}' ORDER BY load_timestamp DESC LIMIT 1;"
      result_set = exec(query)

      if result_set.ntuples == 1
        result = {}
        result_set.each do |row|
          row.each do |hash_key,hash_value|
            result[hash_key] = hash_value
          end
        end
      elsif result_set.ntuples == 0
        result = nil
      else
        raise 'Invalid number of rows'
      end
      return result
    end

    # Checks to see if we've already loaded this sequence range into redshift, if not, performs redshift load.
    # Inspired by https://github.com/awslabs/amazon-kinesis-connectors/blob/master/src/main/java/com/amazonaws/services/kinesis/connectors/redshift/RedshiftManifestEmitter.java
    #
    # @param [String] manifest_s3_url The Url of the manifest file in s3
    # @param [String] redshift_table The name of the redshift table to load
    # @param [String] stream_name The name of the stream where the events are loaded from
    # @param [Hash] shards_to_load Shards we are loading in this loader - all shards with > 0 events
    # @param [String] last_shard_loads Set of last shard loads
    def load_shards_manifest_into_redshift(manifest_s3_url,redshift_table,stream_name,shards_to_load,last_shard_loads)

      begin

        exec('BEGIN')

        # Check that there hasn't been a load since the loader started running
        shards_to_load.each do |shard|
          last_shard_load = last_shard_loads[shard[:shard_id]]
          new_last_shard_load = get_last_shard_load(redshift_table,stream_name,shard)
          if new_last_shard_load != nil && new_last_shard_load['ending_sequence_number'] != last_shard_load['ending_sequence_number']
            @logger.warn("A new Redtrack load has occurred for shard #{shard[:shard_id]} since starting the loader")
            exec('ROLLBACK')
            result = {
              :success => false,
              :load_error => 'A new Redtrack load has occurred for this shard since starting the loader',
              :expected_last_shard_load => last_shard_load,
              :new_last_shard_load => new_last_shard_load
            }

            return result
          end
        end

        # Check that there aren't overlapped loaded sequences.
        # Since sequence numbers are 56 digits and Redshift handles 38 digits max - store as strings and compare in ruby locally
        # http://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html
        # http://patshaughnessy.net/2014/1/9/how-big-is-a-bignum
        shard_ids = []
        shards_to_load.each do |shard_to_load|
          shard_ids << "'#{shard_to_load[:shard_id]}'"
        end

        # TODO: this needs to be converted over!
        query = 'SELECT * FROM kinesis_loads' +
          " WHERE table_name='#{redshift_table}' AND stream_name='#{stream_name}' AND shard_id in (#{shard_ids.join(',')})" +
          ' ORDER BY shard_id, load_timestamp DESC'

        loads_result_set = exec(query)
        loads_result_set.each do |row|
          row_starting_sequence_number=row['starting_sequence_number'].to_i
          row_ending_sequence_number=row['ending_sequence_number'].to_i

          # Get the sequence number range from the shard that's going to be loaded
          starting_sequence_number = nil
          ending_sequence_number = nil
          shards_to_load.each do |shard_to_load|
            if shard_to_load[:shard_id] == row['shard_id']
              starting_sequence_number=shard_to_load[:starting_sequence_number].to_i
              ending_sequence_number=shard_to_load[:ending_sequence_number].to_i
              break
            end
          end

          # Ranges are loaded with previous ending_sequence_number equaling next load's starting_sequence_number
          if ( (row_starting_sequence_number < starting_sequence_number && starting_sequence_number < row_ending_sequence_number) ||
               (row_starting_sequence_number < ending_sequence_number && ending_sequence_number < row_ending_sequence_number) ||
               (starting_sequence_number <= row_starting_sequence_number && row_ending_sequence_number <= ending_sequence_number) )

            @logger.warn("#{TAG} Overlapping load of #{redshift_table} at #{row['load_timestamp']}: Kinesis stream=#{row['stream_name']}, " +
              "shard=#{row['shard_id']}. Sequence from #{row['starting_sequence_number']} to #{row['ending_sequence_number']}")

            # Abort the transaction
            exec('ROLLBACK')
            result = {
              :success => false,
              :load_error => 'Duplicated kinesis range',
              :duplicated_load => row
            }

            return result
          end
        end

        # Insert entry for load
        insert_query = 'INSERT INTO kinesis_loads VALUES '
        insert_values = []
        shards_to_load.each do |shard_to_load|
          insert_values << "('#{stream_name}','#{shard_to_load[:shard_id]}','#{redshift_table}','#{shard_to_load[:starting_sequence_number]}','#{shard_to_load[:ending_sequence_number]}',getdate())"
        end
        insert_query += insert_values.join(",") + ';'
        exec(insert_query)

        # Load manifest into redshift & commit transaction if successful
        load_file_result=load_file_into_redshift(redshift_table,manifest_s3_url)
        if load_file_result[:success] == true
          exec('COMMIT')
        else
          @logger.warn("Load file returned a failure: #{load_file_result[:load_error]}")
          exec('ROLLBACK')
        end

        result = load_file_result

      rescue Exception => e

        # Catch exceptions & Abort transaction
        @logger.warn("#{TAG} Exception caught: #{e.class}: #{e.message}\n\t#{e.backtrace.join("\n\t")}")
        exec('ROLLBACK')

        result = {
          :success => false,
          :exception => e
        }
      end

      return result
    end

    # Load a file into redshift
    #
    # @param [String] redshift_table The table to load the data into
    # @param [String] s3_url The s3 file to load into redshift
    # @param [Boolean] manifest Whether this is a COPY of a manifest file
    def load_file_into_redshift(redshift_table,s3_url,manifest=true)

      ## Run the copy command to load data from s3 to Redshift. This is cleaner than doing the ssh method
      cmd="COPY #{redshift_table} from '#{s3_url}' with " +
        "credentials 'aws_access_key_id=#{@options[:access_key_id]};aws_secret_access_key=#{@options[:secret_access_key]}' " +
        "json 'auto' timeformat 'auto' GZIP MAXERROR #{@options[:max_error]}"
      if manifest
        cmd += ' manifest'
      end
      cmd += ';'
      records=nil

      # Set receiver to check how many rows are loaded (via the INFO)
      @redshift_conn.set_notice_receiver {|result|
        matches=/.*,.(\d+).record.*/.match(result.error_message)
        records = matches[1].to_i
      }

      begin
        exec(cmd)
        result = {
          :success => true,
          :records => records
        }
      rescue Exception => e
        # Catch a copy command exception & Get information about the error
        @logger.warn("#{TAG} Exception caught: #{e.class}: #{e.message}\n\t#{e.backtrace.join("\n\t")}")
        load_error = get_last_load_errors(redshift_table,s3_url)
        result = {
          :success => false,
          :load_error => load_error
        }
      end

      return result
    end
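
    # Editor's note, not part of the package source: for a manifest load, the command built
    # above takes roughly this shape (placeholders used for the table, bucket, and keys):
    #
    #   COPY pageviews from 's3://my-redtrack-bucket/redtrack/.../manifest.json' with
    #   credentials 'aws_access_key_id=...;aws_secret_access_key=...'
    #   json 'auto' timeformat 'auto' GZIP MAXERROR 2 manifest;
    #
    # MAXERROR comes from options[:max_error], which initialize defaults to DEFAULT_MAX_ERROR (2).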

    # Print the last load error for a specific redshift table
    #
    # @param [String] redshift_table The name of the redshift table
    # @param [String] s3_url The s3 url that was attempted to be loaded into redshift
    def get_last_load_errors(redshift_table,s3_url)

      # Query to get recent load errors matching table and s3 url
      cmd = 'select tbl, trim(name) as table_name, starttime, filename, line_number, raw_line,' +
        'colname, raw_field_value, err_code, trim(err_reason) as reason ' +
        'from stl_load_errors sl, stv_tbl_perm sp ' +
        "where sl.tbl = sp.id AND sp.name='#{redshift_table}' AND sl.filename='#{s3_url}' " +
        'ORDER BY starttime DESC LIMIT 20;'
      result_set=exec(cmd)

      # Collect the results, assume the first matching query id in stl_load_errors is the one that failed.
      result = []
      query=nil
      result_set.each do |row|
        if query == nil
          query=row['query']
        end

        if query != row['query']
          break
        end
        result.push(row)
      end
      return result
    end

    private

    # Determine the s3 prefix for loading into s3 - files are organized by date
    #
    # @param [String] redshift_table The table in redshift
    # @param [Date] date The date when the data is loaded
    # @param [String] load_identifier Identifier for the load
    # @return [String] The relative s3 location in a bucket
    def s3_prefix(redshift_table,date,load_identifier=nil)
      dateyyyymmdd=date.strftime('%Y%m%d')
      result = "redtrack/#{@options[:redshift_cluster_name]}/#{@options[:redshift_dbname]}/#{redshift_table}/#{dateyyyymmdd}/"
      if load_identifier != nil
        result += load_identifier + "/"
      end
      return result
    end
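
    # Editor's note, not part of the package source: with the illustrative options used in
    # the notes above, a load of table 'pageviews' started at 2014-07-01 12:00:00 UTC
    # (epoch 1404216000) would place its gzipped shard files and manifest under:
    #
    #   redtrack/analytics/events_db/pageviews/20140701/load-1404216000/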

    # Run a command against the redshift cluster (todo, should this be done differently?)
    #
    # @param [String] cmd The sql command to run
    def exec(cmd)
      @logger.debug(cmd)
      @redshift_conn.exec(cmd)
    end

  end
end
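
To tie the three files together, here is a hedged end-to-end sketch that drives the loader directly. The gem's RedTrack::Client wrapper (data/lib/redtrack_client.rb) is not shown in this section, so the pieces are wired by hand; it assumes the pg gem and the aws-sdk v1 interface the code above relies on, a Kinesis stream already created for the table (see create_kinesis_stream_for_table), and a pre-existing kinesis_loads bookkeeping table in Redshift. All names and credentials are placeholders.

    require 'logger'
    require 'pg'
    require 'redtrack'   # assumed gem entry point

    options = {
      :access_key_id         => 'AKIA...',                 # placeholder credentials
      :secret_access_key     => '...',
      :s3_bucket             => 'my-redtrack-bucket',
      :redshift_cluster_name => 'analytics',
      :redshift_dbname       => 'events_db',
      :redshift_host         => 'analytics.example.us-east-1.redshift.amazonaws.com',
      :redshift_port         => 5439,
      :redshift_user         => 'loader',
      :redshift_password     => 'secret',
      :logger                => Logger.new(STDOUT)
    }

    broker = RedTrack::KinesisClient.new(options)

    redshift = PG.connect(
      :host     => options[:redshift_host],
      :port     => options[:redshift_port],
      :dbname   => options[:redshift_dbname],
      :user     => options[:redshift_user],
      :password => options[:redshift_password]
    )

    loader = RedTrack::Loader.new(options, broker, redshift)
    result = loader.load_redshift_from_broker('pageviews')
    puts "Loaded #{result[:records]} records" if result[:success]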