datahen 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +7 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +29 -0
  8. data/Rakefile +22 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/datahen.gemspec +47 -0
  12. data/examples/fetchtest/libraries/hello.rb +9 -0
  13. data/examples/fetchtest/libraries/hello_fail.rb +10 -0
  14. data/examples/fetchtest/parsers/failed.rb +2 -0
  15. data/examples/fetchtest/parsers/find_outputs.rb +18 -0
  16. data/examples/fetchtest/parsers/home.rb +50 -0
  17. data/examples/fetchtest/parsers/nested_fail.rb +3 -0
  18. data/examples/fetchtest/parsers/simple.rb +14 -0
  19. data/examples/fetchtest/seeders/csv_seeder.rb +12 -0
  20. data/examples/fetchtest/seeders/failed.rb +1 -0
  21. data/examples/fetchtest/seeders/list_of_urls.csv +5 -0
  22. data/examples/fetchtest/seeders/seed.rb +28 -0
  23. data/examples/fetchtest/seeders/test_reset_page.rb +4 -0
  24. data/exe/hen +3 -0
  25. data/lib/datahen.rb +5 -0
  26. data/lib/datahen/cli.rb +45 -0
  27. data/lib/datahen/cli/env_var.rb +48 -0
  28. data/lib/datahen/cli/finisher.rb +40 -0
  29. data/lib/datahen/cli/global_page.rb +39 -0
  30. data/lib/datahen/cli/job.rb +30 -0
  31. data/lib/datahen/cli/job_output.rb +69 -0
  32. data/lib/datahen/cli/parser.rb +64 -0
  33. data/lib/datahen/cli/scraper.rb +185 -0
  34. data/lib/datahen/cli/scraper_deployment.rb +24 -0
  35. data/lib/datahen/cli/scraper_export.rb +51 -0
  36. data/lib/datahen/cli/scraper_exporter.rb +40 -0
  37. data/lib/datahen/cli/scraper_finisher.rb +20 -0
  38. data/lib/datahen/cli/scraper_job.rb +75 -0
  39. data/lib/datahen/cli/scraper_job_var.rb +48 -0
  40. data/lib/datahen/cli/scraper_page.rb +203 -0
  41. data/lib/datahen/cli/scraper_var.rb +48 -0
  42. data/lib/datahen/cli/seeder.rb +40 -0
  43. data/lib/datahen/client.rb +29 -0
  44. data/lib/datahen/client/auth_token.rb +50 -0
  45. data/lib/datahen/client/backblaze_content.rb +45 -0
  46. data/lib/datahen/client/base.rb +69 -0
  47. data/lib/datahen/client/deploy_key.rb +21 -0
  48. data/lib/datahen/client/env_var.rb +28 -0
  49. data/lib/datahen/client/export.rb +10 -0
  50. data/lib/datahen/client/global_page.rb +18 -0
  51. data/lib/datahen/client/job.rb +64 -0
  52. data/lib/datahen/client/job_export.rb +10 -0
  53. data/lib/datahen/client/job_log.rb +26 -0
  54. data/lib/datahen/client/job_output.rb +19 -0
  55. data/lib/datahen/client/job_page.rb +58 -0
  56. data/lib/datahen/client/job_stat.rb +16 -0
  57. data/lib/datahen/client/scraper.rb +57 -0
  58. data/lib/datahen/client/scraper_deployment.rb +18 -0
  59. data/lib/datahen/client/scraper_export.rb +22 -0
  60. data/lib/datahen/client/scraper_exporter.rb +14 -0
  61. data/lib/datahen/client/scraper_finisher.rb +16 -0
  62. data/lib/datahen/client/scraper_job.rb +49 -0
  63. data/lib/datahen/client/scraper_job_output.rb +19 -0
  64. data/lib/datahen/client/scraper_job_page.rb +67 -0
  65. data/lib/datahen/client/scraper_job_var.rb +28 -0
  66. data/lib/datahen/client/scraper_var.rb +28 -0
  67. data/lib/datahen/plugin.rb +6 -0
  68. data/lib/datahen/plugin/context_exposer.rb +55 -0
  69. data/lib/datahen/scraper.rb +18 -0
  70. data/lib/datahen/scraper/executor.rb +373 -0
  71. data/lib/datahen/scraper/finisher.rb +18 -0
  72. data/lib/datahen/scraper/parser.rb +18 -0
  73. data/lib/datahen/scraper/ruby_finisher_executor.rb +116 -0
  74. data/lib/datahen/scraper/ruby_parser_executor.rb +200 -0
  75. data/lib/datahen/scraper/ruby_seeder_executor.rb +120 -0
  76. data/lib/datahen/scraper/seeder.rb +18 -0
  77. data/lib/datahen/version.rb +3 -0
  78. metadata +270 -0
data/lib/datahen/plugin.rb
@@ -0,0 +1,6 @@
+ require 'datahen/plugin/context_exposer'
+
+ module Datahen
+   module Plugin
+   end
+ end
data/lib/datahen/plugin/context_exposer.rb
@@ -0,0 +1,55 @@
+ module Datahen
+   module Plugin
+     module ContextExposer
+       def self.exposed_methods
+         raise NotImplementedError.new('Specify methods exposed to isolated env')
+       end
+
+       def exposed_methods
+         self.class.exposed_methods
+       end
+
+       # Create lambda to retrieve a variable or call instance method
+       def var_or_proc vars, key
+         myself = self # Avoid stack overflow
+         return lambda{vars[key]} if vars.has_key?(key)
+         lambda{|*args| myself.send(key, *args)}
+       end
+
+       def exposed_env vars
+         keys = exposed_methods + vars.keys
+         Hash[keys.uniq.map{|key|[key, var_or_proc(vars, key)]}]
+       end
+
+       def expose_to object, env
+         metaclass = class << object; self; end
+         env.each do |key, block|
+           metaclass.send(:define_method, key, block)
+         end
+         object
+       end
+
+       # Create isolated context object from self
+       def create_context vars = {}
+         create_top_object_script = '(
+           lambda do
+             object = Object.new
+             metaclass = class << object
+               define_method(:context_binding){binding}
+             end
+             object
+           end
+         ).call'
+         object = TOPLEVEL_BINDING.eval(create_top_object_script)
+         env = exposed_env(vars)
+         expose_to object, env
+         object
+       end
+
+       # Create an isolated binding
+       def isolated_binding vars = {}
+         create_context(vars).context_binding
+       end
+     end
+   end
+ end
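
To see how `ContextExposer` is meant to be used, here is a minimal sketch: a host class includes the module, declares which of its methods are exposed, and evaluates script text against the isolated binding. `GreetingRunner`, its `greet` method, and the `page` variable are hypothetical names for illustration only; the pattern itself follows directly from the source above.

```ruby
require 'datahen/plugin/context_exposer'

# Hypothetical host class; only `greet` is exposed to the isolated env.
class GreetingRunner
  include Datahen::Plugin::ContextExposer

  def self.exposed_methods
    [:greet]
  end

  def greet(name)
    "Hello, #{name}!"
  end
end

runner = GreetingRunner.new
# Vars passed in win over methods of the same key (see var_or_proc).
ctx = runner.isolated_binding('page' => { 'url' => 'https://example.com' })
# The evaluated script sees only `greet` and `page`, not the rest of
# GreetingRunner or the calling scope.
puts ctx.eval("greet(page['url'])") # => Hello, https://example.com!
```

This is the same mechanism the executors rely on: `Executor` (in data/lib/datahen/scraper/executor.rb below) includes `ContextExposer` and runs user scripts through `eval_with_context` without leaking its own internals into them.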
data/lib/datahen/scraper.rb
@@ -0,0 +1,18 @@
+ require "datahen/plugin"
+ require "datahen/scraper/parser"
+ require "datahen/scraper/seeder"
+ require "datahen/scraper/finisher"
+ require "datahen/scraper/executor"
+ require "datahen/scraper/ruby_parser_executor"
+ require "datahen/scraper/ruby_seeder_executor"
+ require "datahen/scraper/ruby_finisher_executor"
+ require "datahen/client"
+
+ module Datahen
+   module Scraper
+     # def self.list(opts={})
+     #   scraper = Client::Scraper.new(opts)
+     #   "Listing scrapers #{ENV['DATAHEN_TOKEN']} for #{scraper.all}"
+     # end
+   end
+ end
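
The commented-out `self.list` above hints at how the client classes pulled in by these requires are used. A hedged sketch of that listing call, assuming a valid API token in `DATAHEN_TOKEN` (as the comment implies) and relying only on the `Client::Scraper#all` call it shows:

```ruby
require 'datahen'

# Assumes ENV['DATAHEN_TOKEN'] is set to a valid API token, as in the
# commented-out Datahen::Scraper.list above.
scraper = Datahen::Client::Scraper.new
response = scraper.all
# Responses expose code/parsed_response, as used in executor.rb below.
puts response.parsed_response if response.code == 200
```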
data/lib/datahen/scraper/executor.rb
@@ -0,0 +1,373 @@
+ require 'nokogiri'
+ module Datahen
+   module Scraper
+     # @abstract
+     class Executor
+       # Max allowed page size when querying outputs (see #find_outputs).
+       MAX_FIND_OUTPUTS_PER_PAGE = 500
+
+       attr_accessor :filename, :gid, :job_id
+
+       include Datahen::Plugin::ContextExposer
+
+       def exec_parser(save=false)
+         raise "should be implemented in subclass"
+       end
+
+       def init_page()
+         if job_id
+           puts "getting Job Page"
+           init_job_page
+         else
+           puts "getting Global Page"
+           init_global_page()
+         end
+
+       end
+
+       def init_job_page()
+         client = Client::JobPage.new()
+         job_page = client.find(job_id, gid)
+         unless job_page.code == 200
+           raise "Job #{job_id} or GID #{gid} not found. Aborting execution!"
+         else
+           job_page
+         end
+
+       end
+
+       def parsing_update(options={})
+         client = Client::JobPage.new()
+         job_id = options.fetch(:job_id)
+         gid = options.fetch(:gid)
+
+         client.parsing_update(job_id, gid, options)
+       end
+
+       def seeding_update(options={})
+         client = Client::Job.new()
+         job_id = options.fetch(:job_id)
+
+         client.seeding_update(job_id, options)
+       end
+
+       def finisher_update(options={})
+         client = Client::Job.new()
+         job_id = options.fetch(:job_id)
+
+         client.finisher_update(job_id, options)
+       end
+
+       def init_global_page()
+         client = Client::GlobalPage.new()
+         client.find(gid)
+       end
+
+       def get_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_content(gid)
+
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+
+       def get_failed_content(gid)
+         client = Client::GlobalPage.new()
+         content_json = client.find_failed_content(gid)
+
+         if content_json['available']
+           signed_url = content_json['signed_url']
+           Client::BackblazeContent.new.get_gunzipped_content(signed_url)
+         else
+           nil
+         end
+       end
+
+       # Get current job id from scraper, or default when scraper_name is null.
+       #
+       # @param [String|nil] scraper_name Scraper name.
+       # @param [Integer|nil] default (nil) Default job id when no scraper name.
+       #
+       # @raise [Exception] When scraper name is not null, and the scraper
+       #   doesn't exist or has no current job.
+       def get_job_id scraper_name, default = nil
+         return default if scraper_name.nil?
+         job = Client::ScraperJob.new().find(scraper_name)
+         raise JSON.pretty_generate(job) if job['id'].nil?
+         job['id']
+       end
+
+       # Find outputs by collection and query with pagination.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Integer] page (1) Page number.
+       # @param [Integer] per_page (100) Page size.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       # @raise [ArgumentError] +page+ is not an Integer greater than 0.
+       # @raise [ArgumentError] +per_page+ is not an Integer between 1 and 500.
+       #
+       # @return [Array]
+       #
+       # @example
+       #   find_outputs
+       # @example
+       #   find_outputs 'my_collection'
+       # @example
+       #   find_outputs 'my_collection', {}
+       # @example
+       #   find_outputs 'my_collection', {}, 1
+       # @example
+       #   find_outputs 'my_collection', {}, 1, 100
+       # @example Find from another scraper by name
+       #   find_outputs 'my_collection', {}, 1, 100, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_outputs 'my_collection', {}, 1, 100, job_id: 123
+       #
+       # @note The `:job_id` option is prioritized over `:scraper_name` when
+       #   both exist. If neither is provided, or both are nil, then the
+       #   current job is queried instead; this is the default behavior.
+       def find_outputs(collection='default', query={}, page=1, per_page=100, opts = {})
+         # Validate parameters so nil values fail fast with clear errors.
+         raise ArgumentError.new("collection needs to be a String") unless collection.is_a?(String)
+         raise ArgumentError.new("query needs to be a Hash, instead of: #{query}") unless query.is_a?(Hash)
+         unless page.is_a?(Integer) && page > 0
+           raise ArgumentError.new("page needs to be an Integer greater than 0")
+         end
+         unless per_page.is_a?(Integer) && per_page > 0 && per_page <= MAX_FIND_OUTPUTS_PER_PAGE
+           raise ArgumentError.new("per_page needs to be an Integer between 1 and #{MAX_FIND_OUTPUTS_PER_PAGE}")
+         end
+
+         options = {
+           query: query,
+           page: page,
+           per_page: per_page}
+
+         # Get job_id
+         query_job_id = opts[:job_id] || get_job_id(opts[:scraper_name], self.job_id)
+
+         client = Client::JobOutput.new(options)
+         response = client.all(query_job_id, collection)
+
+         if response.code != 200
+           raise "response_code: #{response.code}|#{response.parsed_response}"
+         end
+         (response.body != 'null') ? response.parsed_response : []
+       end
+
+       # Find one output by collection and query.
+       #
+       # @param [String] collection ('default') Collection name.
+       # @param [Hash] query ({}) Filters to query.
+       # @param [Hash] opts ({}) Configuration options.
+       # @option opts [String|nil] :scraper_name (nil) Scraper name to query
+       #   from.
+       # @option opts [Integer|nil] :job_id (nil) Job's id to query from.
+       #
+       # @raise [ArgumentError] +collection+ is not String.
+       # @raise [ArgumentError] +query+ is not a Hash.
+       #
+       # @return [Hash|nil] `Hash` when found, and `nil` when no output is found.
+       #
+       # @example
+       #   find_output
+       # @example
+       #   find_output 'my_collection'
+       # @example
+       #   find_output 'my_collection', {}
+       # @example Find from another scraper by name
+       #   find_output 'my_collection', {}, scraper_name: 'my_scraper'
+       # @example Find from another scraper by job_id
+       #   find_output 'my_collection', {}, job_id: 123
+       #
+       # @note The `:job_id` option is prioritized over `:scraper_name` when
+       #   both exist. If neither is provided, or both are nil, then the
+       #   current job is queried instead; this is the default behavior.
+       def find_output(collection='default', query={}, opts = {})
+         result = find_outputs(collection, query, 1, 1, opts)
+         result.respond_to?(:first) ? result.first : nil
+       end
+
+       # Remove dups by prioritizing the latest dup.
+       #
+       # @param [Array] list List of hashes to dedup.
+       # @param [Hash] key_defaults Key and default value pair hash to use on
+       #   uniq validation.
+       #
+       # @return [Integer] Removed duplicated items count.
+       def remove_old_dups!(list, key_defaults)
+         raw_count = list.count
+         keys = key_defaults.keys
+         force_uniq = 0
+         list.reverse!.uniq! do |item|
+           # Extract stringified keys as a hash
+           key_hash = Hash[item.map{|k,v|keys.include?(k.to_s) ? [k.to_s,v] : nil}.select{|i|!i.nil?}]
+
+           # Apply defaults for uniq validation
+           key_defaults.each{|k,v| key_hash[k] = v if key_hash[k].nil?}
+
+           # Don't dedup nil key defaults
+           skip_dedup = !keys.find{|k| key_hash[k].nil?}.nil?
+           skip_dedup ? (force_uniq += 1) : key_hash
+         end
+         list.reverse!
+         dup_count = raw_count - list.count
+         dup_count
+       end
+
+       # Remove page dups by prioritizing the latest dup.
+       #
+       # @param [Array] list List of pages to dedup.
+       #
+       # @return [Integer] Removed duplicated items count.
+       #
+       # @note It will not dedup for now, as it is hard to build the gid.
+       #   TODO: Build gid so we can dedup
+       def remove_old_page_dups!(list)
+         key_defaults = {
+           'gid' => nil
+         }
+         remove_old_dups! list, key_defaults
+       end
+
+       # Remove output dups by prioritizing the latest dup.
+       #
+       # @param [Array] list List of outputs to dedup.
+       #
+       # @return [Integer] Removed duplicated items count.
+       def remove_old_output_dups!(list)
+         key_defaults = {
+           '_id' => nil,
+           '_collection' => 'default'
+         }
+         remove_old_dups! list, key_defaults
+       end
+
+       def save_pages_and_outputs(pages = [], outputs = [], status)
+         total_pages = pages.count
+         total_outputs = outputs.count
+         records_per_slice = 100
+         until pages.empty? && outputs.empty?
+           pages_slice = pages.shift(records_per_slice)
+           pages_dup_count = remove_old_page_dups! pages_slice
+           outputs_slice = outputs.shift(records_per_slice)
+           outputs_dup_count = remove_old_output_dups! outputs_slice
+
+           log_msgs = []
+           unless pages_slice.empty?
+             page_dups_ignored = pages_dup_count > 0 ? " (#{pages_dup_count} dups ignored)" : ''
+             log_msgs << "#{pages_slice.count} out of #{total_pages} Pages#{page_dups_ignored}"
+
+             unless save
+               puts '----------------------------------------'
+               puts "Trying to validate #{log_msgs.last}#{page_dups_ignored}"
+               puts JSON.pretty_generate pages_slice
+             end
+           end
+
+           unless outputs_slice.empty?
+             output_dups_ignored = outputs_dup_count > 0 ? " (#{outputs_dup_count} dups ignored)" : ''
+             log_msgs << "#{outputs_slice.count} out of #{total_outputs} Outputs#{output_dups_ignored}"
+
+             unless save
+               puts '----------------------------------------'
+               puts "Trying to validate #{log_msgs.last}#{output_dups_ignored}"
+               puts JSON.pretty_generate outputs_slice
+             end
+           end
+
+           # behave differently if it is a real save
+           if save
+             log_msg = "Saving #{log_msgs.join(' and ')}."
+             puts "#{log_msg}"
+           else
+             status = "#{status}_try"
+           end
+
+           # saving to server
+           response = update_to_server(
+             job_id: job_id,
+             gid: gid,
+             pages: pages_slice,
+             outputs: outputs_slice,
+             status: status)
+
+           if response.code == 200
+             if save
+               log_msg = "Saved."
+               puts "#{log_msg}"
+             else
+               puts "Validation successful"
+             end
+           else
+             if save
+               puts "Error: Unable to save Pages and/or Outputs to server: #{response.body}"
+               raise "Unable to save Pages and/or Outputs to server: #{response.body}"
+             else
+               puts "Error: Invalid Pages and/or Outputs: #{response.body}"
+               raise "Invalid Pages and/or Outputs: #{response.body}"
+             end
+           end
+         end
+       end
+
+       def update_to_server(opts = {})
+         raise "Implemented in Subclass"
+       end
+
+       def clean_backtrace(backtrace)
+         i = backtrace.index{|x| x =~ /gems\/datahen/i}
+         if i.to_i < 1
+           return []
+         else
+           return backtrace[0..(i-1)]
+         end
+       end
+
+       def save_type
+         raise NotImplementedError.new('Need to implement "save_type" method.')
+       end
+
+       # Saves pages from an array and clears it.
+       #
+       # @param [Array] pages ([]) Page array to save. Warning: all elements
+       #   will be removed from the array.
+       #
+       # @note IMPORTANT: +pages+ array's elements will be removed.
+       def save_pages(pages=[])
+         save_pages_and_outputs(pages, [], save_type)
+       end
+
+       # Saves outputs from an array and clears it.
+       #
+       # @param [Array] outputs ([]) Output array to save. Warning: all elements
+       #   will be removed from the array.
+       #
+       # @note IMPORTANT: +outputs+ array's elements will be removed.
+       def save_outputs(outputs=[])
+         save_pages_and_outputs([], outputs, save_type)
+       end
+
+       # Eval a filename with a custom binding
+       #
+       # @param [String] file_path File path to read.
+       # @param [Binding] context Context binding to evaluate with.
+       #
+       # @note Using this method allows scripts to contain `return` to exit
+       #   the script early, along with some improved security.
+       def eval_with_context file_path, context
+         eval(File.read(file_path), context, file_path)
+       end
+     end
+   end
+ end
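
Taken together, `Executor` is an abstract template: the `ruby_*_executor.rb` files listed above subclass it and must supply `save_type` and `update_to_server`, plus a `save` flag, since `save_pages_and_outputs` reads `save` without ever defining it. A minimal sketch of that contract with no real network calls; `NullExecutor`, its canned response, and the `:parsing` status label are hypothetical:

```ruby
require 'datahen'

# Hypothetical subclass illustrating the contract Executor expects;
# a real executor would send the slices to the DataHen API instead.
class NullExecutor < Datahen::Scraper::Executor
  FakeResponse = Struct.new(:code, :body)

  # Executor#save_pages_and_outputs reads `save` but never defines it,
  # so the subclass decides between persisting (true) and validating (false).
  attr_accessor :save

  def save_type
    :parsing # hypothetical status label passed to save_pages_and_outputs
  end

  def update_to_server(opts = {})
    FakeResponse.new(200, 'ok') # pretend the server accepted the slice
  end
end

executor = NullExecutor.new
executor.job_id = 123 # hypothetical job id
executor.save = true
pages = [{ 'url' => 'https://example.com' }]
executor.save_pages(pages) # slices by 100, dedups, then "uploads"
pages.empty? # => true: save_pages_and_outputs consumes the array via shift
```

Note the destructive batching: pages and outputs are drained with `shift` in slices of 100, each slice is deduped (latest duplicate wins, per `remove_old_dups!`), and a non-200 response from `update_to_server` raises and aborts the remaining slices.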