salesforce_bulk_query-edge 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
+ require 'xmlsimple'
+ require 'net/http'
+ require 'json'
+
+ module SalesforceBulkQuery
+
+   # Connection to the Salesforce API,
+   # shared by all classes that make requests.
+   class Connection
+     def initialize(client, api_version, logger=nil, filename_prefix=nil, ssl_version=nil)
+       @client = client
+       @logger = logger
+       @filename_prefix = filename_prefix
+       @ssl_version = ssl_version
+
+       @@API_VERSION = api_version
+       @@PATH_PREFIX = "/services/async/#{@@API_VERSION}/"
+     end
+
+     attr_reader :client
+
+     XML_REQUEST_HEADER = {'Content-Type' => 'application/xml; charset=utf-8'}
+     CSV_REQUEST_HEADER = {'Content-Type' => 'text/csv; charset=UTF-8'}
+
+     def session_header
+       {'X-SFDC-Session' => @client.options[:oauth_token]}
+     end
+
+     def parse_xml(xml)
+       parsed = nil
+       begin
+         parsed = XmlSimple.xml_in(xml)
+       rescue => e
+         @logger.error "Error parsing xml: #{xml}\n#{e}\n#{e.backtrace}" if @logger
+         raise
+       end
+
+       return parsed
+     end
+
+     def post_xml(path, xml, options={})
+       path = "#{@@PATH_PREFIX}#{path}"
+       headers = options[:csv_content_type] ? CSV_REQUEST_HEADER : XML_REQUEST_HEADER
+
+       response = nil
+       # do the request
+       with_retries do
+         begin
+           response = @client.post(path, xml, headers.merge(session_header))
+         rescue JSON::ParserError => e
+           if e.message.index('ExceededQuota')
+             raise "You've run out of sfdc batch api quota. Original error: #{e}\n #{e.backtrace}"
+           end
+           raise e
+         end
+       end
+
+       return parse_xml(response.body)
+     end
+
+     def get_xml(path, options={})
+       path = "#{@@PATH_PREFIX}#{path}"
+       headers = XML_REQUEST_HEADER
+
+       response = nil
+       with_retries do
+         response = @client.get(path, {}, headers.merge(session_header))
+       end
+
+       return options[:skip_parsing] ? response.body : parse_xml(response.body)
+     end
+
+     def get_to_file(path, filename)
+       path = "#{@@PATH_PREFIX}#{path}"
+       uri = URI.parse(@client.options[:instance_url])
+       # open an http connection to the Salesforce instance
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = true
+       http.ssl_version = @ssl_version unless @ssl_version.nil?
+       headers = XML_REQUEST_HEADER.merge(session_header)
+       @logger.info "Doing GET to #{path}, headers #{headers}" if @logger
+
+       if @filename_prefix
+         filename = "#{@filename_prefix}_#{filename}"
+       end
+
+       # do the request
+       http.request_get(path, headers) do |res|
+
+         @logger.info "Got response #{res.inspect}, reading response body by chunks and writing to #{filename}" if @logger
+
+         File.open(filename, 'w') do |file|
+           # write the body to the file by chunks
+           res.read_body do |segment|
+             file.write(segment.encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => "?"))
+           end
+         end
+       end
+     end
+
+     def with_retries
+       i = 0
+       begin
+         yield
+       rescue => e
+         i += 1
+         if i < 3
+           @logger.warn "Retrying, got error: #{e}, #{e.backtrace}" if @logger
+           retry
+         else
+           @logger.error "Failed 3 times, last error: #{e}, #{e.backtrace}" if @logger
+           raise
+         end
+       end
+     end
+
+     def query_count(sobject, date_field, from, to)
+       # run with retries; if it doesn't succeed, return nil rather than failing
+       soql = "SELECT COUNT() FROM #{sobject} WHERE #{date_field} >= #{from} AND #{date_field} < #{to}"
+       begin
+         with_retries do
+           q = @client.query(soql)
+           return q.size
+         end
+       rescue Faraday::Error::TimeoutError => e
+         @logger.warn "Timeout getting count: #{soql}. Error: #{e}. Taking it as failed verification" if @logger
+         return nil
+       end
+     end
+
+     def to_log
+       return {
+         :client => "Restforce api",
+         :filename_prefix => @filename_prefix,
+         :api_version => @@API_VERSION,
+         :path_prefix => @@PATH_PREFIX
+       }
+     end
+   end
+ end
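
For orientation, a minimal usage sketch of the Connection class above, assuming a Restforce client configured elsewhere; the credentials and API version below are illustrative placeholders, not part of this package:

  require 'restforce'
  require 'logger'
  require 'salesforce_bulk_query'

  # hypothetical Restforce client; all credentials are placeholders
  restforce = Restforce.new(
    :username       => 'user@example.com',
    :password       => 'password',
    :security_token => 'token',
    :client_id      => 'client_id',
    :client_secret  => 'client_secret',
    :api_version    => '30.0'
  )

  # the Connection wraps the Restforce client and is shared by Query, Job and Batch
  connection = SalesforceBulkQuery::Connection.new(restforce, '30.0', Logger.new(STDOUT))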
@@ -0,0 +1,199 @@
+ require "salesforce_bulk_query/batch"
+
+ module SalesforceBulkQuery
+
+   # Represents a Salesforce Bulk API job, contains multiple batches.
+   # Many jobs are contained in a Query.
+   class Job
+     @@operation = 'query'
+     @@xml_header = '<?xml version="1.0" encoding="utf-8" ?>'
+     JOB_TIME_LIMIT = 15 * 60
+     BATCH_COUNT = 15
+
+
+     def initialize(sobject, connection, options={})
+       @sobject = sobject
+       @connection = connection
+       @logger = options[:logger]
+       @job_time_limit = options[:job_time_limit] || JOB_TIME_LIMIT
+       @date_field = options[:date_field] or fail "date_field must be given when creating a job"
+       @batch_count = options[:batch_count] || BATCH_COUNT
+
+       # all batches (static)
+       @batches = []
+
+       # unfinished batches as of the last get_available_results call
+       @unfinished_batches = []
+
+       # filenames for the already downloaded and verified batches
+       @filenames = []
+     end
+
+     attr_reader :job_id
+
+     # Do the API request
+     def create_job(csv=true)
+       content_type = csv ? "CSV" : "XML"
+       xml = "#{@@xml_header}<jobInfo xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\">"
+       xml += "<operation>#{@@operation}</operation>"
+       xml += "<object>#{@sobject}</object>"
+       xml += "<contentType>#{content_type}</contentType>"
+       xml += "</jobInfo>"
+
+       response_parsed = @connection.post_xml("job", xml)
+       @job_id = response_parsed['id'][0]
+     end
+
+     def get_extended_soql(soql, from, to)
+       return "#{soql} WHERE #{@date_field} >= #{from} AND #{@date_field} < #{to}"
+     end
+
+     def generate_batches(soql, start, stop, single_batch=false)
+       # if just one batch is wanted, add it and we're done
+       if single_batch
+         soql_extended = get_extended_soql(soql, start, stop)
+         @logger.info "Adding soql #{soql_extended} as a batch to job" if @logger
+
+         add_query(soql_extended,
+           :start => start,
+           :stop => stop
+         )
+         return
+       end
+
+       # otherwise generate the time intervals and create a batch for each
+       step_size = (stop - start) / @batch_count
+
+       interval_beginnings = start.step(stop - step_size, step_size).to_a
+       interval_ends = interval_beginnings.clone
+       interval_ends.shift
+       interval_ends.push(stop)
+
+       interval_beginnings.zip(interval_ends).each do |from, to|
+
+         soql_extended = get_extended_soql(soql, from, to)
+         @logger.info "Adding soql #{soql_extended} as a batch to job" if @logger
+
+         add_query(soql_extended,
+           :start => from,
+           :stop => to
+         )
+       end
+     end
+
+     def add_query(query, options={})
+       # create a batch
+       batch = SalesforceBulkQuery::Batch.new(
+         :sobject => @sobject,
+         :soql => query,
+         :job_id => @job_id,
+         :connection => @connection,
+         :start => options[:start],
+         :stop => options[:stop],
+         :logger => @logger,
+         :date_field => @date_field
+       )
+       batch.create
+
+       # add the batch to the list
+       @batches.push(batch)
+       @unfinished_batches.push(batch)
+     end
+
+     def close_job
+       xml = "#{@@xml_header}<jobInfo xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\">"
+       xml += "<state>Closed</state>"
+       xml += "</jobInfo>"
+
+       path = "job/#{@job_id}"
+
+       response_parsed = @connection.post_xml(path, xml)
+       @job_closed_time = Time.now
+     end
+
+     def check_status
+       path = "job/#{@job_id}"
+       response_parsed = @connection.get_xml(path)
+       @completed_count = Integer(response_parsed["numberBatchesCompleted"][0])
+       @succeeded = @completed_count == Integer(response_parsed["numberBatchesTotal"][0])
+
+       return {
+         :succeeded => @succeeded,
+         :some_records_failed => Integer(response_parsed["numberRecordsFailed"][0]) > 0,
+         :some_batches_failed => Integer(response_parsed["numberBatchesFailed"][0]) > 0,
+         :response => response_parsed
+       }
+     end
+
+     def over_limit?
+       (Time.now - @job_closed_time) > @job_time_limit
+     end
+
+     # downloads whatever is available, returns as unfinished whatever is not
+
+     def get_available_results(options={})
+       downloaded_filenames = []
+       unfinished_batches = []
+       verification_fail_batches = []
+       failed_batches = []
+
+       # get the result for each batch in the job
+       @unfinished_batches.each do |batch|
+         batch_status = batch.check_status
+
+         # if the result is ready
+         if batch_status[:succeeded]
+           # each finished batch should go here only once
+
+           # download the result
+           result = batch.get_result(options)
+           @logger.info "get_result result: #{result}" if @logger
+
+           # if the verification failed, put the batch aside;
+           # we will never ask about this one again.
+           if result[:verification] == false
+             verification_fail_batches << batch
+           else
+             # verification ok and finished, keep the filename
+             downloaded_filenames << result[:filename]
+           end
+         elsif batch_status[:failed]
+           # put it to failed and raise an error at the end
+           failed_batches << batch
+         else
+           # otherwise put it to unfinished
+           unfinished_batches << batch
+         end
+       end
+
+       unless failed_batches.empty?
+         details = failed_batches.map{ |b| "#{b.batch_id}: #{b.fail_message}"}.join("\n")
+         fail ArgumentError, "#{failed_batches.length} batches failed. Details: #{details}"
+       end
+
+       # cache the unfinished batches till the next run
+       @unfinished_batches = unfinished_batches
+
+       # accumulate filenames
+       @filenames += downloaded_filenames
+
+       @logger.info "unfinished batches: #{unfinished_batches}\nverification_fail_batches: #{verification_fail_batches}" if @logger
+
+       return {
+         :finished => @unfinished_batches.empty?,
+         :filenames => @filenames,
+         :unfinished_batches => @unfinished_batches,
+         :verification_fail_batches => verification_fail_batches
+       }
+     end
+
+     def to_log
+       return {
+         :sobject => @sobject,
+         :connection => @connection.to_log,
+         :batches => @batches.map {|b| b.to_log},
+         :unfinished_batches => @unfinished_batches.map {|b| b.to_log}
+       }
+     end
+   end
+ end
1
+ require 'forwardable'
2
+ require 'faraday'
3
+
4
+ module SalesforceBulkQuery
5
+ # Custom logger for Restforce that doesn't log tons of data.
6
+ class Logger < Faraday::Response::Middleware
7
+ extend Forwardable
8
+
9
+ MAX_LOG_LENGTH = 2000
10
+
11
+ def initialize(app, logger, options)
12
+ super(app)
13
+ @options = options
14
+ @logger = logger || begin
15
+ require 'logger'
16
+ ::Logger.new(STDOUT)
17
+ end
18
+ end
19
+
20
+ def_delegators :@logger, :debug, :info, :warn, :error, :fatal
21
+
22
+ def call(env)
23
+ debug('request') do
24
+ dump :url => env[:url].to_s,
25
+ :method => env[:method],
26
+ :headers => env[:request_headers],
27
+ :body => env[:body] ? env[:body][0..MAX_LOG_LENGTH] : nil
28
+ end
29
+ super
30
+ end
31
+
32
+ def on_complete(env)
33
+ debug('response') do
34
+ dump :status => env[:status].to_s,
35
+ :headers => env[:response_headers],
36
+ :body => env[:body] ? env[:body][0..MAX_LOG_LENGTH] : nil
37
+ end
38
+ end
39
+
40
+ def dump(hash)
41
+ "\n" + hash.map { |k, v| " #{k}: #{v.inspect}" }.join("\n")
42
+ end
43
+ end
44
+ end
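
Since this Logger is a Faraday response middleware, it can be plugged into any Faraday middleware stack; a minimal sketch with a plain Faraday connection (inside the gem it is presumably inserted into the Restforce client's own stack):

  require 'faraday'
  require 'logger'

  conn = Faraday.new(:url => 'https://example.my.salesforce.com') do |builder|
    # arguments after the class are passed on to Logger#initialize(app, logger, options)
    builder.use SalesforceBulkQuery::Logger, ::Logger.new(STDOUT), {}
    builder.adapter Faraday.default_adapter
  end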
@@ -0,0 +1,192 @@
+ require 'salesforce_bulk_query/job'
+ require 'date'
+
+ module SalesforceBulkQuery
+
+   # Abstraction of a single user-given query. It contains multiple jobs and is tied to a specific connection.
+   class Query
+
+     # If no date_to is given, we use the current time minus this offset
+     # (in minutes), so that the freshest changes, which can still be
+     # inconsistent, are excluded.
+     OFFSET_FROM_NOW = 10
+
+     DEFAULT_DATE_FIELD = 'CreatedDate'
+
+     def initialize(sobject, soql, connection, options={})
+       @sobject = sobject
+       @soql = soql
+       @connection = connection
+       @logger = options[:logger]
+       @date_field = options[:date_field] || DEFAULT_DATE_FIELD
+       @date_from = options[:date_from] || options[:created_from]
+       @date_to = options[:date_to] || options[:created_to]
+       @single_batch = options[:single_batch]
+
+       # jobs currently running
+       @jobs_in_progress = []
+
+       # successfully finished jobs with no batches to split
+       @jobs_done = []
+
+       # finished or timed-out jobs with some batches split into other jobs
+       @jobs_restarted = []
+
+       @finished_batch_filenames = []
+       @restarted_subqueries = []
+     end
+
+     attr_reader :jobs_in_progress, :jobs_restarted, :jobs_done
+
+     DEFAULT_MIN_CREATED = "1999-01-01T00:00:00.000Z"
+
+     # Creates the first job, divides the query into subqueries and adds them to the job as batches
+     def start(options={})
+       # WHERE and ORDER BY are not allowed (unless running as a single batch)
+       if (!@single_batch) && (@soql =~ / WHERE /i || @soql =~ /ORDER BY/i)
+         raise "You can't have WHERE or ORDER BY in your soql. If you want to download just a specific date range, use date_from / date_to"
+       end
+
+       # create the first job
+       job = SalesforceBulkQuery::Job.new(
+         @sobject,
+         @connection,
+         {:logger => @logger, :date_field => @date_field}.merge(options)
+       )
+       job.create_job
+
+       # get the date when it should start
+       min_date = get_min_date
+
+       # generate intervals
+       start = nil
+       if min_date.instance_of?(Time)
+         start = DateTime.parse(min_date.to_s)
+       else
+         start = DateTime.parse(min_date)
+       end
+
+       stop = nil
+       if @date_to.nil?
+         stop = DateTime.now - Rational(options[:offset_from_now] || OFFSET_FROM_NOW, 1440)
+       else
+         if @date_to.instance_of?(Time)
+           stop = DateTime.parse(@date_to.to_s)
+         else
+           stop = DateTime.parse(@date_to)
+         end
+       end
+       job.generate_batches(@soql, start, stop, @single_batch)
+
+       job.close_job
+
+       @jobs_in_progress.push(job)
+     end
+
+     # Get results for all finished jobs. If there are some unfinished batches, skip them and return them as unfinished.
+     #
+     # @param options[:directory_path]
+     def get_available_results(options={})
+
+       unfinished_subqueries = []
+       jobs_in_progress = []
+       jobs_restarted = []
+       jobs_done = []
+
+       # check the status of all jobs and split what should be split
+       @jobs_in_progress.each do |job|
+
+         # download what's available
+         job_results = job.get_available_results(options)
+
+         job_over_limit = job.over_limit?
+         job_done = job_results[:finished] || job_over_limit
+
+         @logger.debug "job_results: #{job_results}" if @logger
+
+         unfinished_batches = job_results[:unfinished_batches]
+         verification_fail_batches = job_results[:verification_fail_batches]
+
+         unfinished_subqueries += unfinished_batches.map {|b| b.soql}
+
+         # split into subqueries whatever needs to be split
+         to_split = verification_fail_batches
+         to_split += unfinished_batches if job_over_limit
+
+         # delete files associated with batches that failed verification
+         verification_fail_batches.each do |b|
+           @logger.info "Deleting #{b.filename}, verification failed." if @logger
+           File.delete(b.filename)
+         end
+
+         to_split.each do |batch|
+           # for each batch that needs splitting, create a new job and add it to the new jobs
+           @logger.info "The following subquery didn't end in time / failed verification: #{batch.soql}. Dividing into multiple and running again" if @logger
+           new_job = SalesforceBulkQuery::Job.new(
+             @sobject,
+             @connection,
+             {:logger => @logger, :date_field => @date_field}.merge(options)
+           )
+           new_job.create_job
+           new_job.generate_batches(@soql, batch.start, batch.stop)
+           new_job.close_job
+           jobs_in_progress.push(new_job)
+         end
+
+         # decide what to do with the current job:
+         # either it's done (possibly with some batches restarted) or still in progress
+         if job_done
+           if to_split.empty?
+             # done, nothing left
+             jobs_done.push(job)
+
+             @logger.info "#{job.job_id} finished. Nothing to split. unfinished_batches: #{unfinished_batches}, verification_fail_batches: #{verification_fail_batches}" if @logger
+           else
+             # done, but some batches needed to be restarted
+             jobs_restarted.push(job)
+           end
+
+           # store the filenames and the restarted subqueries
+           @finished_batch_filenames += job_results[:filenames]
+           @restarted_subqueries += to_split.map {|b| b.soql}
+         else
+           # still in progress
+           jobs_in_progress.push(job)
+         end
+       end
+
+       # remove the finished jobs from progress and add the new ones
+       @jobs_in_progress = jobs_in_progress
+       @jobs_done += jobs_done
+
+       # we're done if there are no jobs in progress
+       return {
+         :succeeded => @jobs_in_progress.empty?,
+         :filenames => @finished_batch_filenames,
+         :unfinished_subqueries => unfinished_subqueries,
+         :jobs_done => @jobs_done.map { |j| j.job_id }
+       }
+     end
+
+     private
+
+     def get_min_date
+       if @date_from
+         return @date_from
+       end
+
+       # get the date when the first record was created
+       min_created = nil
+       begin
+         min_created_resp = @connection.client.query("SELECT #{@date_field} FROM #{@sobject} ORDER BY #{@date_field} LIMIT 1")
+         min_created_resp.each {|s| min_created = s[@date_field.to_sym]}
+       rescue Faraday::Error::TimeoutError => e
+         @logger.warn "Timeout getting the oldest object for #{@sobject}. Error: #{e}. Using the default value" if @logger
+         min_created = DEFAULT_MIN_CREATED
+       rescue Faraday::Error::ClientError => e
+         fail ArgumentError, "Error when trying to get the oldest record according to #{@date_field}, looks like #{@date_field} is not on #{@sobject}. Original error: #{e}\n #{e.message} \n #{e.backtrace}"
+       end
+       min_created
+     end
+   end
+ end
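
Putting it together, a hedged sketch of the polling loop a caller would run against Query; the connection, logger and SOQL are illustrative, and the result keys match the hash returned by get_available_results above:

  query = SalesforceBulkQuery::Query.new(
    'Opportunity',
    'SELECT Id, Name FROM Opportunity',   # no WHERE / ORDER BY, see start
    connection,
    :logger => logger, :date_field => 'CreatedDate'
  )
  query.start

  results = nil
  loop do
    results = query.get_available_results(:directory_path => '/tmp')
    break if results[:succeeded]
    sleep 60                              # poll until all jobs and their batches are done
  end

  puts results[:filenames]                # CSV files downloaded so far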