salesforce_bulk_query-edge 0.2.1

@@ -0,0 +1,140 @@
+ require 'xmlsimple'
+ require 'net/http'
+
+ module SalesforceBulkQuery
+
+   # Connection to the Salesforce API,
+   # shared by all classes that make requests
+   class Connection
+     def initialize(client, api_version, logger=nil, filename_prefix=nil, ssl_version=nil)
+       @client = client
+       @logger = logger
+       @filename_prefix = filename_prefix
+       @ssl_version = ssl_version
+
+       @@API_VERSION = api_version
+       @@PATH_PREFIX = "/services/async/#{@@API_VERSION}/"
+     end
+
+     attr_reader :client
+
+     XML_REQUEST_HEADER = {'Content-Type' => 'application/xml; charset=utf-8'}
+     CSV_REQUEST_HEADER = {'Content-Type' => 'text/csv; charset=UTF-8'}
+
+     def session_header
+       {'X-SFDC-Session' => @client.options[:oauth_token]}
+     end
+
+     def parse_xml(xml)
+       parsed = nil
+       begin
+         parsed = XmlSimple.xml_in(xml)
+       rescue => e
+         @logger.error "Error parsing xml: #{xml}\n#{e}\n#{e.backtrace}" if @logger
+         raise
+       end
+
+       return parsed
+     end
+
+     def post_xml(path, xml, options={})
+       path = "#{@@PATH_PREFIX}#{path}"
+       headers = options[:csv_content_type] ? CSV_REQUEST_HEADER : XML_REQUEST_HEADER
+
+       response = nil
+       # do the request
+       with_retries do
+         begin
+           response = @client.post(path, xml, headers.merge(session_header))
+         rescue JSON::ParserError => e
+           if e.message.index('ExceededQuota')
+             raise "You've run out of sfdc batch api quota. Original error: #{e}\n #{e.backtrace}"
+           end
+           raise e
+         end
+       end
+
+       return parse_xml(response.body)
+     end
+
+     def get_xml(path, options={})
+       path = "#{@@PATH_PREFIX}#{path}"
+       headers = XML_REQUEST_HEADER
+
+       response = nil
+       with_retries do
+         response = @client.get(path, {}, headers.merge(session_header))
+       end
+
+       return options[:skip_parsing] ? response.body : parse_xml(response.body)
+     end
+
+     def get_to_file(path, filename)
+       path = "#{@@PATH_PREFIX}#{path}"
+       uri = URI.parse(@client.options[:instance_url])
+       # plain Net::HTTP connection so the response can be streamed to a file
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = true
+       http.ssl_version = @ssl_version if !@ssl_version.nil?
+       headers = XML_REQUEST_HEADER.merge(session_header)
+       @logger.info "Doing GET to #{path}, headers #{headers}" if @logger
+
+       if @filename_prefix
+         filename = "#{@filename_prefix}_#{filename}"
+       end
+
+       # do the request
+       http.request_get(path, headers) do |res|
+         @logger.info "Got response #{res.inspect}, reading response body by chunks and writing to #{filename}" if @logger
+
+         File.open(filename, 'w') do |file|
+           # write the body to the file by chunks
+           res.read_body do |segment|
+             file.write(segment.encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => "?"))
+           end
+         end
+       end
+     end
+
+     def with_retries
+       i = 0
+       begin
+         yield
+       rescue => e
+         i += 1
+         if i < 3
+           @logger.warn "Retrying, got error: #{e}, #{e.backtrace}" if @logger
+           retry
+         else
+           @logger.error "Failed 3 times, last error: #{e}, #{e.backtrace}" if @logger
+           raise
+         end
+       end
+     end
+
+     def query_count(sobject, date_field, from, to)
+       # do it with retries; if it doesn't succeed, return nil, don't fail
+       soql = "SELECT COUNT() FROM #{sobject} WHERE #{date_field} >= #{from} AND #{date_field} < #{to}"
+       begin
+         with_retries do
+           q = @client.query(soql)
+           return q.size
+         end
+       rescue Faraday::Error::TimeoutError => e
+         @logger.warn "Timeout getting count: #{soql}. Error: #{e}. Taking it as failed verification" if @logger
+         return nil
+       end
+     end
+
+     def to_log
+       return {
+         :client => "Restforce",
+         :filename_prefix => @filename_prefix,
+         :api_version => @@API_VERSION,
+         :path_prefix => @@PATH_PREFIX
+       }
+     end
+   end
+ end
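
A minimal usage sketch for the Connection class above (not part of the diff): the Restforce credentials, the logger, and the API version '39.0' are placeholder assumptions.

    require 'restforce'
    require 'logger'
    require 'salesforce_bulk_query/connection'

    # hypothetical credentials; any Restforce client exposing options[:oauth_token]
    # and options[:instance_url] should work here
    restforce = Restforce.new(
      :username       => ENV['SFDC_USERNAME'],
      :password       => ENV['SFDC_PASSWORD'],
      :security_token => ENV['SFDC_TOKEN'],
      :client_id      => ENV['SFDC_CLIENT_ID'],
      :client_secret  => ENV['SFDC_CLIENT_SECRET']
    )

    logger = Logger.new(STDOUT)

    # '39.0' is an assumed Bulk API version; it only feeds the /services/async/<version>/ prefix
    connection = SalesforceBulkQuery::Connection.new(restforce, '39.0', logger)
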
@@ -0,0 +1,199 @@
+ require "salesforce_bulk_query/batch"
+
+ module SalesforceBulkQuery
+
+   # Represents a Salesforce Bulk API job; contains multiple batches.
+   # Many jobs are contained in a Query.
+   class Job
+     @@operation = 'query'
+     @@xml_header = '<?xml version="1.0" encoding="utf-8" ?>'
+     JOB_TIME_LIMIT = 15 * 60
+     BATCH_COUNT = 15
+
+     def initialize(sobject, connection, options={})
+       @sobject = sobject
+       @connection = connection
+       @logger = options[:logger]
+       @job_time_limit = options[:job_time_limit] || JOB_TIME_LIMIT
+       @date_field = options[:date_field] or fail "date_field must be given when creating a job"
+       @batch_count = options[:batch_count] || BATCH_COUNT
+
+       # all batches (static)
+       @batches = []
+
+       # unfinished batches as of the last get_available_results call
+       @unfinished_batches = []
+
+       # filenames for the already downloaded and verified batches
+       @filenames = []
+     end
+
+     attr_reader :job_id
+
+     # Do the API request
+     def create_job(csv=true)
+       content_type = csv ? "CSV" : "XML"
+       xml = "#{@@xml_header}<jobInfo xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\">"
+       xml += "<operation>#{@@operation}</operation>"
+       xml += "<object>#{@sobject}</object>"
+       xml += "<contentType>#{content_type}</contentType>"
+       xml += "</jobInfo>"
+
+       response_parsed = @connection.post_xml("job", xml)
+       @job_id = response_parsed['id'][0]
+     end
+
+     def get_extended_soql(soql, from, to)
+       return "#{soql} WHERE #{@date_field} >= #{from} AND #{@date_field} < #{to}"
+     end
+
+     def generate_batches(soql, start, stop, single_batch=false)
+       # if just one batch is wanted, add it and we're done
+       if single_batch
+         soql_extended = get_extended_soql(soql, start, stop)
+         @logger.info "Adding soql #{soql_extended} as a batch to job" if @logger
+
+         add_query(soql_extended,
+           :start => start,
+           :stop => stop
+         )
+         return
+       end
+
+       # otherwise generate the time intervals and a batch for each
+       step_size = (stop - start) / @batch_count
+
+       interval_beginnings = start.step(stop - step_size, step_size).to_a
+       interval_ends = interval_beginnings.clone
+       interval_ends.shift
+       interval_ends.push(stop)
+
+       interval_beginnings.zip(interval_ends).each do |from, to|
+         soql_extended = get_extended_soql(soql, from, to)
+         @logger.info "Adding soql #{soql_extended} as a batch to job" if @logger
+
+         add_query(soql_extended,
+           :start => from,
+           :stop => to
+         )
+       end
+     end
+
+     def add_query(query, options={})
+       # create a batch
+       batch = SalesforceBulkQuery::Batch.new(
+         :sobject => @sobject,
+         :soql => query,
+         :job_id => @job_id,
+         :connection => @connection,
+         :start => options[:start],
+         :stop => options[:stop],
+         :logger => @logger,
+         :date_field => @date_field
+       )
+       batch.create
+
+       # add the batch to the list
+       @batches.push(batch)
+       @unfinished_batches.push(batch)
+     end
+
+     def close_job
+       xml = "#{@@xml_header}<jobInfo xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\">"
+       xml += "<state>Closed</state>"
+       xml += "</jobInfo>"
+
+       path = "job/#{@job_id}"
+
+       response_parsed = @connection.post_xml(path, xml)
+       @job_closed_time = Time.now
+     end
+
+     def check_status
+       path = "job/#{@job_id}"
+       response_parsed = @connection.get_xml(path)
+       @completed_count = Integer(response_parsed["numberBatchesCompleted"][0])
+       @succeeded = @completed_count == Integer(response_parsed["numberBatchesTotal"][0])
+
+       return {
+         :succeeded => @succeeded,
+         :some_records_failed => Integer(response_parsed["numberRecordsFailed"][0]) > 0,
+         :some_batches_failed => Integer(response_parsed["numberBatchesFailed"][0]) > 0,
+         :response => response_parsed
+       }
+     end
+
+     def over_limit?
+       (Time.now - @job_closed_time) > @job_time_limit
+     end
+
+     # downloads whatever is available, returns as unfinished whatever is not
+     def get_available_results(options={})
+       downloaded_filenames = []
+       unfinished_batches = []
+       verification_fail_batches = []
+       failed_batches = []
+
+       # get the result for each batch in the job
+       @unfinished_batches.each do |batch|
+         batch_status = batch.check_status
+
+         # if the result is ready
+         if batch_status[:succeeded]
+           # each finished batch should go here only once
+
+           # download the result
+           result = batch.get_result(options)
+           @logger.info "get_result result: #{result}" if @logger
+
+           # if the verification failed, put the batch aside;
+           # we will never ask about this one again
+           if result[:verification] == false
+             verification_fail_batches << batch
+           else
+             # verification ok and finished, keep the filename
+             downloaded_filenames << result[:filename]
+           end
+         elsif batch_status[:failed]
+           # collect it and raise an error at the end
+           failed_batches << batch
+         else
+           # otherwise it's still unfinished
+           unfinished_batches << batch
+         end
+       end
+
+       unless failed_batches.empty?
+         details = failed_batches.map { |b| "#{b.batch_id}: #{b.fail_message}" }.join("\n")
+         fail ArgumentError, "#{failed_batches.length} batches failed. Details: #{details}"
+       end
+
+       # cache the unfinished batches till the next run
+       @unfinished_batches = unfinished_batches
+
+       # accumulate filenames
+       @filenames += downloaded_filenames
+
+       @logger.info "unfinished batches: #{unfinished_batches}\nverification_fail_batches: #{verification_fail_batches}" if @logger
+
+       return {
+         :finished => @unfinished_batches.empty?,
+         :filenames => @filenames,
+         :unfinished_batches => @unfinished_batches,
+         :verification_fail_batches => verification_fail_batches
+       }
+     end
+
+     def to_log
+       return {
+         :sobject => @sobject,
+         :connection => @connection.to_log,
+         :batches => @batches.map {|b| b.to_log},
+         :unfinished_batches => @unfinished_batches.map {|b| b.to_log}
+       }
+     end
+   end
+ end
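
A hedged sketch of driving a Job directly with the methods above; the sObject, SOQL, date range, and the :directory_path option are illustrative assumptions (Batch#get_result, which consumes the options, is not shown in this diff).

    require 'date'

    job = SalesforceBulkQuery::Job.new(
      'Opportunity',                       # assumed sObject
      connection,                          # the Connection from the earlier sketch
      :logger => logger,
      :date_field => 'CreatedDate'         # required, see the fail in #initialize
    )

    job.create_job                         # POST to "job", remembers the job id
    job.generate_batches(
      'SELECT Id, Name FROM Opportunity',  # WHERE clauses on the date field are appended per batch
      DateTime.parse('2015-01-01T00:00:00.000Z'),
      DateTime.now
    )
    job.close_job

    # poll until all batches are downloaded; :directory_path is assumed to be
    # understood by Batch#get_result
    until (res = job.get_available_results(:directory_path => '/tmp'))[:finished]
      sleep 30
    end
    res[:filenames]                        # CSVs written via Connection#get_to_file
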
@@ -0,0 +1,44 @@
+ require 'forwardable'
+ require 'faraday'
+
+ module SalesforceBulkQuery
+   # Custom logger for Restforce that doesn't log tons of data.
+   class Logger < Faraday::Response::Middleware
+     extend Forwardable
+
+     MAX_LOG_LENGTH = 2000
+
+     def initialize(app, logger, options)
+       super(app)
+       @options = options
+       @logger = logger || begin
+         require 'logger'
+         ::Logger.new(STDOUT)
+       end
+     end
+
+     def_delegators :@logger, :debug, :info, :warn, :error, :fatal
+
+     def call(env)
+       debug('request') do
+         dump :url => env[:url].to_s,
+              :method => env[:method],
+              :headers => env[:request_headers],
+              :body => env[:body] ? env[:body][0..MAX_LOG_LENGTH] : nil
+       end
+       super
+     end
+
+     def on_complete(env)
+       debug('response') do
+         dump :status => env[:status].to_s,
+              :headers => env[:response_headers],
+              :body => env[:body] ? env[:body][0..MAX_LOG_LENGTH] : nil
+       end
+     end
+
+     def dump(hash)
+       "\n" + hash.map { |k, v| " #{k}: #{v.inspect}" }.join("\n")
+     end
+   end
+ end
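
A hedged sketch of plugging this middleware into the client's Faraday stack; it assumes a Restforce client that exposes its middleware builder via #middleware, and the empty options hash is a placeholder.

    # restforce and logger are the objects from the earlier sketches
    restforce.middleware.use SalesforceBulkQuery::Logger, logger, {}
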
@@ -0,0 +1,192 @@
+ require 'salesforce_bulk_query/job'
+ require 'date'
+
+ module SalesforceBulkQuery
+
+   # Abstraction of a single user-given query. It contains multiple jobs and is tied to a specific connection.
+   class Query
+
+     # if no date_to is given, we use the current time with this offset subtracted
+     # (to make sure the freshest changes, which can be inconsistent, aren't included).
+     # It's in minutes.
+     OFFSET_FROM_NOW = 10
+
+     DEFAULT_DATE_FIELD = 'CreatedDate'
+
+     def initialize(sobject, soql, connection, options={})
+       @sobject = sobject
+       @soql = soql
+       @connection = connection
+       @logger = options[:logger]
+       @date_field = options[:date_field] || DEFAULT_DATE_FIELD
+       @date_from = options[:date_from] || options[:created_from]
+       @date_to = options[:date_to] || options[:created_to]
+       @single_batch = options[:single_batch]
+
+       # jobs currently running
+       @jobs_in_progress = []
+
+       # successfully finished jobs with no batches to split
+       @jobs_done = []
+
+       # finished or timed-out jobs with some batches split into other jobs
+       @jobs_restarted = []
+
+       @finished_batch_filenames = []
+       @restarted_subqueries = []
+     end
+
+     attr_reader :jobs_in_progress, :jobs_restarted, :jobs_done
+
+     DEFAULT_MIN_CREATED = "1999-01-01T00:00:00.000Z"
+
+     # Creates the first job, divides the query into subqueries, and adds all the subqueries as batches to the job
+     def start(options={})
+       # WHERE and ORDER BY are not allowed
+       if (!@single_batch) && (@soql =~ / WHERE /i || @soql =~ /ORDER BY/i)
+         raise "You can't have WHERE or ORDER BY in your soql. If you want to download just a specific date range, use date_from / date_to"
+       end
+
+       # create the first job
+       job = SalesforceBulkQuery::Job.new(
+         @sobject,
+         @connection,
+         {:logger => @logger, :date_field => @date_field}.merge(options)
+       )
+       job.create_job
+
+       # get the date when it should start
+       min_date = get_min_date
+
+       # generate intervals
+       start = nil
+       if min_date.instance_of?(Time)
+         start = DateTime.parse(min_date.to_s)
+       else
+         start = DateTime.parse(min_date)
+       end
+
+       stop = nil
+       if @date_to.nil?
+         stop = DateTime.now - Rational(options[:offset_from_now] || OFFSET_FROM_NOW, 1440)
+       else
+         if @date_to.instance_of?(Time)
+           stop = DateTime.parse(@date_to.to_s)
+         else
+           stop = DateTime.parse(@date_to)
+         end
+       end
+       job.generate_batches(@soql, start, stop, @single_batch)
+
+       job.close_job
+
+       @jobs_in_progress.push(job)
+     end
+
+     # Get results for all finished jobs. If there are some unfinished batches, skip them and return them as unfinished.
+     #
+     # @param options[:directory_path]
+     def get_available_results(options={})
+       unfinished_subqueries = []
+       jobs_in_progress = []
+       jobs_restarted = []
+       jobs_done = []
+
+       # check all job statuses and split what should be split
+       @jobs_in_progress.each do |job|
+
+         # download what's available
+         job_results = job.get_available_results(options)
+
+         job_over_limit = job.over_limit?
+         job_done = job_results[:finished] || job_over_limit
+
+         @logger.debug "job_results: #{job_results}" if @logger
+
+         unfinished_batches = job_results[:unfinished_batches]
+         verification_fail_batches = job_results[:verification_fail_batches]
+
+         unfinished_subqueries += unfinished_batches.map {|b| b.soql}
+
+         # split into subqueries what needs to be split
+         to_split = verification_fail_batches
+         to_split += unfinished_batches if job_over_limit
+
+         # delete files associated with batches that failed verification
+         verification_fail_batches.each do |b|
+           @logger.info "Deleting #{b.filename}, verification failed." if @logger
+           File.delete(b.filename)
+         end
+
+         to_split.each do |batch|
+           # for each batch to split, create a new job and add it to the new jobs
+           @logger.info "The following subquery didn't end in time / failed verification: #{batch.soql}. Dividing into multiple batches and running again" if @logger
+           new_job = SalesforceBulkQuery::Job.new(
+             @sobject,
+             @connection,
+             {:logger => @logger, :date_field => @date_field}.merge(options)
+           )
+           new_job.create_job
+           new_job.generate_batches(@soql, batch.start, batch.stop)
+           new_job.close_job
+           jobs_in_progress.push(new_job)
+         end
+
+         # decide what to do with the current job:
+         # finished, possibly with some batches restarted
+         if job_done
+           if to_split.empty?
+             # done, nothing left
+             jobs_done.push(job)
+
+             @logger.info "#{job.job_id} finished. Nothing to split. unfinished_batches: #{unfinished_batches}, verification_fail_batches: #{verification_fail_batches}" if @logger
+           else
+             # done, but some batches needed to be restarted
+             jobs_restarted.push(job)
+           end
+
+           # store the filenames and the restarted subqueries
+           @finished_batch_filenames += job_results[:filenames]
+           @restarted_subqueries += to_split.map {|b| b.soql}
+         else
+           # still in progress
+           jobs_in_progress.push(job)
+         end
+       end
+
+       # remove the finished jobs from the in-progress list and add the new ones
+       @jobs_in_progress = jobs_in_progress
+       @jobs_done += jobs_done
+
+       # we're done if there are no jobs in progress
+       return {
+         :succeeded => @jobs_in_progress.empty?,
+         :filenames => @finished_batch_filenames,
+         :unfinished_subqueries => unfinished_subqueries,
+         :jobs_done => @jobs_done.map { |j| j.job_id }
+       }
+     end
+
+     private
+
+     def get_min_date
+       if @date_from
+         return @date_from
+       end
+
+       # get the date when the first record was created
+       min_created = nil
+       begin
+         min_created_resp = @connection.client.query("SELECT #{@date_field} FROM #{@sobject} ORDER BY #{@date_field} LIMIT 1")
+         min_created_resp.each {|s| min_created = s[@date_field.to_sym]}
+       rescue Faraday::Error::TimeoutError => e
+         @logger.warn "Timeout getting the oldest object for #{@sobject}. Error: #{e}. Using the default value" if @logger
+         min_created = DEFAULT_MIN_CREATED
+       rescue Faraday::Error::ClientError => e
+         fail ArgumentError, "Error when trying to get the oldest record according to #{@date_field}, looks like #{@date_field} is not on #{@sobject}. Original error: #{e}\n #{e.message} \n #{e.backtrace}"
+       end
+       min_created
+     end
+   end
+ end
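
Finally, a hedged end-to-end sketch built on the Query class above. In the gem a Query is normally constructed by a higher-level API class that is not part of this diff, so the constructor call, the polling interval, and :directory_path are assumptions.

    query = SalesforceBulkQuery::Query.new(
      'Account',
      'SELECT Id, Name FROM Account',      # no WHERE / ORDER BY unless :single_batch is used
      connection,
      :logger => logger,
      :date_field => 'CreatedDate'
    )
    query.start

    results = nil
    loop do
      results = query.get_available_results(:directory_path => '/tmp')
      break if results[:succeeded]
      sleep 60                             # poll until all jobs report back
    end

    results[:filenames]                    # downloaded CSVs across all jobs
    query.jobs_done.map(&:job_id)          # ids of the finished Bulk API jobs
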