salesforce_bulk_query-edge 0.2.1

@@ -0,0 +1,140 @@
+ require 'xmlsimple'
+ require 'net/http'
+
+ module SalesforceBulkQuery
+
+   # Connection to the Salesforce API,
+   # shared by all classes that make requests
+   class Connection
+     def initialize(client, api_version, logger=nil, filename_prefix=nil, ssl_version=nil)
+       @client = client
+       @logger = logger
+       @filename_prefix = filename_prefix
+       @ssl_version = ssl_version
+
+       @@API_VERSION = api_version
+       @@PATH_PREFIX = "/services/async/#{@@API_VERSION}/"
+     end
+
+     attr_reader :client
+
+     XML_REQUEST_HEADER = {'Content-Type' => 'application/xml; charset=utf-8'}
+     CSV_REQUEST_HEADER = {'Content-Type' => 'text/csv; charset=UTF-8'}
+
+     def session_header
+       {'X-SFDC-Session' => @client.options[:oauth_token]}
+     end
+
+     def parse_xml(xml)
+       parsed = nil
+       begin
+         parsed = XmlSimple.xml_in(xml)
+       rescue => e
+         @logger.error "Error parsing xml: #{xml}\n#{e}\n#{e.backtrace}" if @logger
+         raise
+       end
+
+       return parsed
+     end
+
+     def post_xml(path, xml, options={})
+       path = "#{@@PATH_PREFIX}#{path}"
+       headers = options[:csv_content_type] ? CSV_REQUEST_HEADER : XML_REQUEST_HEADER
+
+       response = nil
+       # do the request
+       with_retries do
+         begin
+           response = @client.post(path, xml, headers.merge(session_header))
+         rescue JSON::ParserError => e
+           if e.message.index('ExceededQuota')
+             raise "You've run out of sfdc batch api quota. Original error: #{e}\n #{e.backtrace}"
+           end
+           raise e
+         end
+       end
+
+       return parse_xml(response.body)
+     end
+
+     def get_xml(path, options={})
+       path = "#{@@PATH_PREFIX}#{path}"
+       headers = XML_REQUEST_HEADER
+
+       response = nil
+       with_retries do
+         response = @client.get(path, {}, headers.merge(session_header))
+       end
+
+       return options[:skip_parsing] ? response.body : parse_xml(response.body)
+     end
+
+     def get_to_file(path, filename)
+       path = "#{@@PATH_PREFIX}#{path}"
+       uri = URI.parse(@client.options[:instance_url])
+       # plain Net::HTTP connection so the response can be streamed to a file
+       http = Net::HTTP.new(uri.host, uri.port)
+       http.use_ssl = true
+       http.ssl_version = @ssl_version if !@ssl_version.nil?
+       headers = XML_REQUEST_HEADER.merge(session_header)
+       @logger.info "Doing GET to #{path}, headers #{headers}" if @logger
+
+       if @filename_prefix
+         filename = "#{@filename_prefix}_#{filename}"
+       end
+
+       # do the request
+       http.request_get(path, headers) do |res|
+         @logger.info "Got response #{res.inspect}, reading response body by chunks and writing to #{filename}" if @logger
+
+         File.open(filename, 'w') do |file|
+           # write the body to the file by chunks
+           res.read_body do |segment|
+             file.write(segment.encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => "?"))
+           end
+         end
+       end
+     end
+
+     def with_retries
+       i = 0
+       begin
+         yield
+       rescue => e
+         i += 1
+         if i < 3
+           @logger.warn "Retrying, got error: #{e}, #{e.backtrace}" if @logger
+           retry
+         else
+           @logger.error "Failed 3 times, last error: #{e}, #{e.backtrace}" if @logger
+           raise
+         end
+       end
+     end
+
+     def query_count(sobject, date_field, from, to)
+       # do it with retries; if it doesn't succeed, return nil, don't fail
+       soql = "SELECT COUNT() FROM #{sobject} WHERE #{date_field} >= #{from} AND #{date_field} < #{to}"
+       begin
+         with_retries do
+           q = @client.query(soql)
+           return q.size
+         end
+       rescue Faraday::Error::TimeoutError => e
+         @logger.warn "Timeout getting count: #{soql}. Error: #{e}. Taking it as failed verification" if @logger
+         return nil
+       end
+     end
+
+     def to_log
+       return {
+         :client => "Restforce",
+         :filename_prefix => @filename_prefix,
+         :api_version => @@API_VERSION,
+         :path_prefix => @@PATH_PREFIX
+       }
+     end
+   end
+ end
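
A minimal usage sketch for the Connection class above (not part of the diff): the Restforce credentials, the logger, and the API version '39.0' are placeholder assumptions.

    require 'restforce'
    require 'logger'
    require 'salesforce_bulk_query/connection'

    # hypothetical credentials; any Restforce client exposing options[:oauth_token]
    # and options[:instance_url] should work here
    restforce = Restforce.new(
      :username       => ENV['SFDC_USERNAME'],
      :password       => ENV['SFDC_PASSWORD'],
      :security_token => ENV['SFDC_TOKEN'],
      :client_id      => ENV['SFDC_CLIENT_ID'],
      :client_secret  => ENV['SFDC_CLIENT_SECRET']
    )

    logger = Logger.new(STDOUT)

    # '39.0' is an assumed Bulk API version; it only feeds the /services/async/<version>/ prefix
    connection = SalesforceBulkQuery::Connection.new(restforce, '39.0', logger)
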
@@ -0,0 +1,199 @@
+ require "salesforce_bulk_query/batch"
+
+ module SalesforceBulkQuery
+
+   # Represents a Salesforce Bulk API job; contains multiple batches.
+   # Many jobs are contained in a Query.
+   class Job
+     @@operation = 'query'
+     @@xml_header = '<?xml version="1.0" encoding="utf-8" ?>'
+     JOB_TIME_LIMIT = 15 * 60
+     BATCH_COUNT = 15
+
+     def initialize(sobject, connection, options={})
+       @sobject = sobject
+       @connection = connection
+       @logger = options[:logger]
+       @job_time_limit = options[:job_time_limit] || JOB_TIME_LIMIT
+       @date_field = options[:date_field] or fail "date_field must be given when creating a job"
+       @batch_count = options[:batch_count] || BATCH_COUNT
+
+       # all batches (static)
+       @batches = []
+
+       # unfinished batches as of the last get_available_results call
+       @unfinished_batches = []
+
+       # filenames for the already downloaded and verified batches
+       @filenames = []
+     end
+
+     attr_reader :job_id
+
+     # Do the API request
+     def create_job(csv=true)
+       content_type = csv ? "CSV" : "XML"
+       xml = "#{@@xml_header}<jobInfo xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\">"
+       xml += "<operation>#{@@operation}</operation>"
+       xml += "<object>#{@sobject}</object>"
+       xml += "<contentType>#{content_type}</contentType>"
+       xml += "</jobInfo>"
+
+       response_parsed = @connection.post_xml("job", xml)
+       @job_id = response_parsed['id'][0]
+     end
+
+     def get_extended_soql(soql, from, to)
+       return "#{soql} WHERE #{@date_field} >= #{from} AND #{@date_field} < #{to}"
+     end
+
+     def generate_batches(soql, start, stop, single_batch=false)
+       # if just one batch is wanted, add it and we're done
+       if single_batch
+         soql_extended = get_extended_soql(soql, start, stop)
+         @logger.info "Adding soql #{soql_extended} as a batch to job" if @logger
+
+         add_query(soql_extended,
+           :start => start,
+           :stop => stop
+         )
+         return
+       end
+
+       # otherwise generate the time intervals and a batch for each
+       step_size = (stop - start) / @batch_count
+
+       interval_beginnings = start.step(stop - step_size, step_size).to_a
+       interval_ends = interval_beginnings.clone
+       interval_ends.shift
+       interval_ends.push(stop)
+
+       interval_beginnings.zip(interval_ends).each do |from, to|
+         soql_extended = get_extended_soql(soql, from, to)
+         @logger.info "Adding soql #{soql_extended} as a batch to job" if @logger
+
+         add_query(soql_extended,
+           :start => from,
+           :stop => to
+         )
+       end
+     end
+
+     def add_query(query, options={})
+       # create a batch
+       batch = SalesforceBulkQuery::Batch.new(
+         :sobject => @sobject,
+         :soql => query,
+         :job_id => @job_id,
+         :connection => @connection,
+         :start => options[:start],
+         :stop => options[:stop],
+         :logger => @logger,
+         :date_field => @date_field
+       )
+       batch.create
+
+       # add the batch to the list
+       @batches.push(batch)
+       @unfinished_batches.push(batch)
+     end
+
+     def close_job
+       xml = "#{@@xml_header}<jobInfo xmlns=\"http://www.force.com/2009/06/asyncapi/dataload\">"
+       xml += "<state>Closed</state>"
+       xml += "</jobInfo>"
+
+       path = "job/#{@job_id}"
+
+       response_parsed = @connection.post_xml(path, xml)
+       @job_closed_time = Time.now
+     end
+
+     def check_status
+       path = "job/#{@job_id}"
+       response_parsed = @connection.get_xml(path)
+       @completed_count = Integer(response_parsed["numberBatchesCompleted"][0])
+       @succeeded = @completed_count == Integer(response_parsed["numberBatchesTotal"][0])
+
+       return {
+         :succeeded => @succeeded,
+         :some_records_failed => Integer(response_parsed["numberRecordsFailed"][0]) > 0,
+         :some_batches_failed => Integer(response_parsed["numberBatchesFailed"][0]) > 0,
+         :response => response_parsed
+       }
+     end
+
+     def over_limit?
+       (Time.now - @job_closed_time) > @job_time_limit
+     end
+
+     # downloads whatever is available, returns as unfinished whatever is not
+     def get_available_results(options={})
+       downloaded_filenames = []
+       unfinished_batches = []
+       verification_fail_batches = []
+       failed_batches = []
+
+       # get the result for each batch in the job
+       @unfinished_batches.each do |batch|
+         batch_status = batch.check_status
+
+         # if the result is ready
+         if batch_status[:succeeded]
+           # each finished batch should go here only once
+
+           # download the result
+           result = batch.get_result(options)
+           @logger.info "get_result result: #{result}" if @logger
+
+           # if the verification failed, put the batch aside;
+           # we will never ask about this one again
+           if result[:verification] == false
+             verification_fail_batches << batch
+           else
+             # verification ok and finished, keep the filename
+             downloaded_filenames << result[:filename]
+           end
+         elsif batch_status[:failed]
+           # collect it and raise an error at the end
+           failed_batches << batch
+         else
+           # otherwise it's still unfinished
+           unfinished_batches << batch
+         end
+       end
+
+       unless failed_batches.empty?
+         details = failed_batches.map { |b| "#{b.batch_id}: #{b.fail_message}" }.join("\n")
+         fail ArgumentError, "#{failed_batches.length} batches failed. Details: #{details}"
+       end
+
+       # cache the unfinished batches till the next run
+       @unfinished_batches = unfinished_batches
+
+       # accumulate filenames
+       @filenames += downloaded_filenames
+
+       @logger.info "unfinished batches: #{unfinished_batches}\nverification_fail_batches: #{verification_fail_batches}" if @logger
+
+       return {
+         :finished => @unfinished_batches.empty?,
+         :filenames => @filenames,
+         :unfinished_batches => @unfinished_batches,
+         :verification_fail_batches => verification_fail_batches
+       }
+     end
+
+     def to_log
+       return {
+         :sobject => @sobject,
+         :connection => @connection.to_log,
+         :batches => @batches.map {|b| b.to_log},
+         :unfinished_batches => @unfinished_batches.map {|b| b.to_log}
+       }
+     end
+   end
+ end
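
A hedged sketch of driving a Job directly with the methods above; the sObject, SOQL, date range, and the :directory_path option are illustrative assumptions (Batch#get_result, which consumes the options, is not shown in this diff).

    require 'date'

    job = SalesforceBulkQuery::Job.new(
      'Opportunity',                       # assumed sObject
      connection,                          # the Connection from the earlier sketch
      :logger => logger,
      :date_field => 'CreatedDate'         # required, see the fail in #initialize
    )

    job.create_job                         # POST to "job", remembers the job id
    job.generate_batches(
      'SELECT Id, Name FROM Opportunity',  # WHERE clauses on the date field are appended per batch
      DateTime.parse('2015-01-01T00:00:00.000Z'),
      DateTime.now
    )
    job.close_job

    # poll until all batches are downloaded; :directory_path is assumed to be
    # understood by Batch#get_result
    until (res = job.get_available_results(:directory_path => '/tmp'))[:finished]
      sleep 30
    end
    res[:filenames]                        # CSVs written via Connection#get_to_file
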
@@ -0,0 +1,44 @@
+ require 'forwardable'
+ require 'faraday'
+
+ module SalesforceBulkQuery
+   # Custom logger for Restforce that doesn't log tons of data.
+   class Logger < Faraday::Response::Middleware
+     extend Forwardable
+
+     MAX_LOG_LENGTH = 2000
+
+     def initialize(app, logger, options)
+       super(app)
+       @options = options
+       @logger = logger || begin
+         require 'logger'
+         ::Logger.new(STDOUT)
+       end
+     end
+
+     def_delegators :@logger, :debug, :info, :warn, :error, :fatal
+
+     def call(env)
+       debug('request') do
+         dump :url => env[:url].to_s,
+              :method => env[:method],
+              :headers => env[:request_headers],
+              :body => env[:body] ? env[:body][0..MAX_LOG_LENGTH] : nil
+       end
+       super
+     end
+
+     def on_complete(env)
+       debug('response') do
+         dump :status => env[:status].to_s,
+              :headers => env[:response_headers],
+              :body => env[:body] ? env[:body][0..MAX_LOG_LENGTH] : nil
+       end
+     end
+
+     def dump(hash)
+       "\n" + hash.map { |k, v| " #{k}: #{v.inspect}" }.join("\n")
+     end
+   end
+ end
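
A hedged sketch of plugging this middleware into the client's Faraday stack; it assumes a Restforce client that exposes its middleware builder via #middleware, and the empty options hash is a placeholder.

    # restforce and logger are the objects from the earlier sketches
    restforce.middleware.use SalesforceBulkQuery::Logger, logger, {}
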
@@ -0,0 +1,192 @@
+ require 'salesforce_bulk_query/job'
+ require 'date'
+
+ module SalesforceBulkQuery
+
+   # Abstraction of a single user-given query. It contains multiple jobs and is tied to a specific connection.
+   class Query
+
+     # if no date_to is given, we use the current time with this offset subtracted
+     # (to make sure the freshest changes, which can be inconsistent, aren't included).
+     # It's in minutes.
+     OFFSET_FROM_NOW = 10
+
+     DEFAULT_DATE_FIELD = 'CreatedDate'
+
+     def initialize(sobject, soql, connection, options={})
+       @sobject = sobject
+       @soql = soql
+       @connection = connection
+       @logger = options[:logger]
+       @date_field = options[:date_field] || DEFAULT_DATE_FIELD
+       @date_from = options[:date_from] || options[:created_from]
+       @date_to = options[:date_to] || options[:created_to]
+       @single_batch = options[:single_batch]
+
+       # jobs currently running
+       @jobs_in_progress = []
+
+       # successfully finished jobs with no batches to split
+       @jobs_done = []
+
+       # finished or timed-out jobs with some batches split into other jobs
+       @jobs_restarted = []
+
+       @finished_batch_filenames = []
+       @restarted_subqueries = []
+     end
+
+     attr_reader :jobs_in_progress, :jobs_restarted, :jobs_done
+
+     DEFAULT_MIN_CREATED = "1999-01-01T00:00:00.000Z"
+
+     # Creates the first job, divides the query into subqueries, and adds all the subqueries as batches to the job
+     def start(options={})
+       # WHERE and ORDER BY are not allowed
+       if (!@single_batch) && (@soql =~ / WHERE /i || @soql =~ /ORDER BY/i)
+         raise "You can't have WHERE or ORDER BY in your soql. If you want to download just a specific date range, use date_from / date_to"
+       end
+
+       # create the first job
+       job = SalesforceBulkQuery::Job.new(
+         @sobject,
+         @connection,
+         {:logger => @logger, :date_field => @date_field}.merge(options)
+       )
+       job.create_job
+
+       # get the date when it should start
+       min_date = get_min_date
+
+       # generate intervals
+       start = nil
+       if min_date.instance_of?(Time)
+         start = DateTime.parse(min_date.to_s)
+       else
+         start = DateTime.parse(min_date)
+       end
+
+       stop = nil
+       if @date_to.nil?
+         stop = DateTime.now - Rational(options[:offset_from_now] || OFFSET_FROM_NOW, 1440)
+       else
+         if @date_to.instance_of?(Time)
+           stop = DateTime.parse(@date_to.to_s)
+         else
+           stop = DateTime.parse(@date_to)
+         end
+       end
+       job.generate_batches(@soql, start, stop, @single_batch)
+
+       job.close_job
+
+       @jobs_in_progress.push(job)
+     end
+
+     # Get results for all finished jobs. If there are some unfinished batches, skip them and return them as unfinished.
+     #
+     # @param options[:directory_path]
+     def get_available_results(options={})
+       unfinished_subqueries = []
+       jobs_in_progress = []
+       jobs_restarted = []
+       jobs_done = []
+
+       # check all job statuses and split what should be split
+       @jobs_in_progress.each do |job|
+
+         # download what's available
+         job_results = job.get_available_results(options)
+
+         job_over_limit = job.over_limit?
+         job_done = job_results[:finished] || job_over_limit
+
+         @logger.debug "job_results: #{job_results}" if @logger
+
+         unfinished_batches = job_results[:unfinished_batches]
+         verification_fail_batches = job_results[:verification_fail_batches]
+
+         unfinished_subqueries += unfinished_batches.map {|b| b.soql}
+
+         # split into subqueries what needs to be split
+         to_split = verification_fail_batches
+         to_split += unfinished_batches if job_over_limit
+
+         # delete files associated with batches that failed verification
+         verification_fail_batches.each do |b|
+           @logger.info "Deleting #{b.filename}, verification failed." if @logger
+           File.delete(b.filename)
+         end
+
+         to_split.each do |batch|
+           # for each batch to split, create a new job and add it to the new jobs
+           @logger.info "The following subquery didn't end in time / failed verification: #{batch.soql}. Dividing into multiple batches and running again" if @logger
+           new_job = SalesforceBulkQuery::Job.new(
+             @sobject,
+             @connection,
+             {:logger => @logger, :date_field => @date_field}.merge(options)
+           )
+           new_job.create_job
+           new_job.generate_batches(@soql, batch.start, batch.stop)
+           new_job.close_job
+           jobs_in_progress.push(new_job)
+         end
+
+         # decide what to do with the current job:
+         # finished, possibly with some batches restarted
+         if job_done
+           if to_split.empty?
+             # done, nothing left
+             jobs_done.push(job)
+
+             @logger.info "#{job.job_id} finished. Nothing to split. unfinished_batches: #{unfinished_batches}, verification_fail_batches: #{verification_fail_batches}" if @logger
+           else
+             # done, but some batches needed to be restarted
+             jobs_restarted.push(job)
+           end
+
+           # store the filenames and the restarted subqueries
+           @finished_batch_filenames += job_results[:filenames]
+           @restarted_subqueries += to_split.map {|b| b.soql}
+         else
+           # still in progress
+           jobs_in_progress.push(job)
+         end
+       end
+
+       # remove the finished jobs from the in-progress list and add the new ones
+       @jobs_in_progress = jobs_in_progress
+       @jobs_done += jobs_done
+
+       # we're done if there are no jobs in progress
+       return {
+         :succeeded => @jobs_in_progress.empty?,
+         :filenames => @finished_batch_filenames,
+         :unfinished_subqueries => unfinished_subqueries,
+         :jobs_done => @jobs_done.map { |j| j.job_id }
+       }
+     end
+
+     private
+
+     def get_min_date
+       if @date_from
+         return @date_from
+       end
+
+       # get the date when the first record was created
+       min_created = nil
+       begin
+         min_created_resp = @connection.client.query("SELECT #{@date_field} FROM #{@sobject} ORDER BY #{@date_field} LIMIT 1")
+         min_created_resp.each {|s| min_created = s[@date_field.to_sym]}
+       rescue Faraday::Error::TimeoutError => e
+         @logger.warn "Timeout getting the oldest object for #{@sobject}. Error: #{e}. Using the default value" if @logger
+         min_created = DEFAULT_MIN_CREATED
+       rescue Faraday::Error::ClientError => e
+         fail ArgumentError, "Error when trying to get the oldest record according to #{@date_field}, looks like #{@date_field} is not on #{@sobject}. Original error: #{e}\n #{e.message} \n #{e.backtrace}"
+       end
+       min_created
+     end
+   end
+ end
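
Finally, a hedged end-to-end sketch built on the Query class above. In the gem a Query is normally constructed by a higher-level API class that is not part of this diff, so the constructor call, the polling interval, and :directory_path are assumptions.

    query = SalesforceBulkQuery::Query.new(
      'Account',
      'SELECT Id, Name FROM Account',      # no WHERE / ORDER BY unless :single_batch is used
      connection,
      :logger => logger,
      :date_field => 'CreatedDate'
    )
    query.start

    results = nil
    loop do
      results = query.get_available_results(:directory_path => '/tmp')
      break if results[:succeeded]
      sleep 60                             # poll until all jobs report back
    end

    results[:filenames]                    # downloaded CSVs across all jobs
    query.jobs_done.map(&:job_id)          # ids of the finished Bulk API jobs
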