extraloop 0.0.1

@@ -0,0 +1,36 @@
+ require 'pry'
+ class JsonExtractor < ExtractorBase
+
+   def initialize(*args)
+     @path = args[2] && args[2].is_a?(Array) ? args[2] : nil
+     super(*args)
+   end
+
+   def extract_field(node, record=nil)
+     output = node = node.is_a?(String) ? parse(node) : node
+     output = node.get_in(@path) if @path
+     output = node[@attribute.to_s] if @attribute
+     output = @environment.run(output, record, &@callback) if @callback
+
+     # when neither an attribute nor a callback is provided, try fetching by field name
+     if !@attribute && !@callback
+       output = node[@field_name.to_s] if node[@field_name.to_s]
+     end
+     output
+   end
+
+   def extract_list(input)
+     #TODO: implement more clever stuff here after looking
+     # into possible hash traversal techniques
+
+     input = input.is_a?(String) ? parse(input) : input
+     input = input.get_in(@path) if @path
+
+     @callback && Array(@environment.run(input, &@callback)) || input
+   end
+
+   def parse(input)
+     super(input)
+     @environment.document = (Yajl::Parser.new).parse(input).extend(Utils::DeepFetchable)
+   end
+ end
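
For orientation, here is a hedged usage sketch of the extractor above (the field name, environment, and JSON payload are illustrative, not taken from the gem's docs): when the third constructor argument is an Array it is stored as @path and resolved against the parsed document through Utils::DeepFetchable#get_in.

    require "extraloop"

    # Hypothetical example: `scraper` is assumed to be an existing ScraperBase instance.
    env       = ExtractionEnvironment.new(scraper)
    extractor = JsonExtractor.new(:count, env, ["meta", "count"])

    # Parses the string with Yajl, then walks the ["meta", "count"] path.
    extractor.extract_field('{"meta": {"count": 42}}') # => 42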
@@ -0,0 +1,64 @@
+ autoload :Logging, "logging"
+
+ # Decorates a class with an instance of Logging.logger and a convenient
+ # helper method to log messages.
+
+ module Loggable
+   protected
+
+   #
+   # Initializes the incorporated logger object.
+   #
+   # Returns nothing.
+   #
+
+   def init_log!
+     return unless @options[:log]
+
+     @options[:log] ||= {
+       :appenders => [ Logging.appenders.stderr ],
+       :log_level => :info
+     }
+
+     if @options[:log] && @options[:log][:appenders] && @options[:log][:appenders].any?
+       @log = Logging.logger["#{self}"]
+       @log.add_appenders(@options[:log][:appenders])
+       @log.level = @options[:log] && @options[:log][:log_level] || :info
+     end
+   end
+
+   #
+   # Convenience method for logging messages.
+   #
+   # message   - the message content
+   # log_level - the message's log level (one of :info, :debug, :error, :warning; defaults to :info)
+   #
+   # Returns nothing.
+   #
+
+   def log(message, log_level = :info)
+     @log.send(log_level, message) if @log
+   end
+ end
+
+
+ #
+ # Monkey patches ScraperBase.
+ #
+ class ScraperBase
+   include Loggable
+   alias_method :base_initialize, :initialize
+
+   #
+   # Wraps the original ScraperBase#initialize method and initializes the logger.
+   #
+   # args - The arguments to be passed over to the original ScraperBase#initialize method.
+   #
+   # Returns itself.
+   #
+   def initialize(*args)
+     base_initialize(*args)
+     init_log!
+     self
+   end
+ end
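
Based on how init_log! reads @options[:log] above, a minimal sketch of enabling logging when constructing a scraper (the URL and log level are placeholders; this assumes loggable.rb has been loaded, as extraloop.rb ensures, so that ScraperBase#initialize calls init_log!):

    require "extraloop"

    # Passing :log => false instead would skip logger initialization entirely.
    scraper = ScraperBase.new("http://example.com/items", :log => {
      :appenders => [Logging.appenders.stderr],
      :log_level => :debug
    })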
@@ -0,0 +1,166 @@
+ class ScraperBase
+   include Hookable
+   include Utils::Support
+
+   attr_reader :results, :options
+
+   #
+   # Public: Initializes a web scraper.
+   #
+   # urls      - One or several urls.
+   # options   - Hash of scraper options
+   #   async     : Whether the scraper should issue HTTP requests in series or in parallel.
+   #   log       : logging options (set to false to suppress logging completely).
+   #     appenders : specifies where the log messages should be appended to (defaults to standard error).
+   #     log_level : specifies the log level (defaults to :info).
+   # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
+   #
+   #
+   #
+   # Returns itself.
+   #
+
+   def initialize(urls, options = {}, arguments = {})
+     @urls = Array(urls)
+     @loop_extractor_args = nil
+     @extractor_args = []
+     @loop = nil
+
+     @request_arguments = arguments
+
+     @options = {
+       :async => false
+     }.merge(options)
+
+
+     @response_count = 0
+     @queued_count = 0
+
+     @hooks = {}
+     @failed_requests = []
+
+     hydra_options = {:max_concurrency => (@options[:hydra] && @options[:hydra][:max_concurrency]) || 10}
+     @hydra = Typhoeus::Hydra.new hydra_options
+     self
+   end
+
+
+   # Public: Sets the scraper extraction loop.
+   #
+   # Delegates to Extractor; raises an exception unless at least a selector, a block, or an attribute name is provided.
+   #
+   #
+   # selector  - The CSS3 selector identifying the node list over which to iterate (optional).
+   # callback  - A block of code (optional).
+   # attribute - An attribute name (optional).
+   #
+   # Returns itself.
+   #
+
+   def loop_on(*args)
+     @loop_extractor_args = args.insert(0, nil, ExtractionEnvironment.new(self))
+     self
+   end
+
+   # Public: Registers a new extractor to be added to the loop.
+   #
+   # Delegates to Extractor; raises an exception unless at least a selector, a block, or an attribute name is provided.
+   #
+   # selector  - The CSS3 selector identifying the node list over which to iterate (optional).
+   # callback  - A block of code (optional).
+   # attribute - An attribute name (optional).
+   #
+   # Returns itself.
+   #
+   #
+
+   def extract(*args)
+     @extractor_args << args.insert(1, ExtractionEnvironment.new(self))
+     self
+   end
+
+   #
+   # Public: Runs the main scraping loop.
+   #
+   # Returns nothing
+   #
+   def run
+     @urls.each do |url|
+       issue_request(url)
+
+       # if the scraper is asynchronous, start processing the Hydra HTTP queue
+       # only after the last url has been appended to the queue (see #issue_request).
+       #
+       if @options[:async]
+         if url == @urls.last
+           @hydra.run
+         end
+       else
+         @hydra.run
+       end
+     end
+     self
+   end
+
+   protected
+
+   def issue_request(url)
+
+     @request_arguments[:params] = merge_request_parameters(url)
+     url_without_params = url.gsub(/\?.*/,"")
+
+     arguments = {
+       'headers' => [
+         'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
+         'accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+       ].join("\n")
+     }
+
+     arguments.merge!(@request_arguments)
+     request = Typhoeus::Request.new(*[url_without_params, arguments])
+
+     request.on_complete do |response|
+       handle_response(response)
+     end
+
+     log("queueing url: #{url}, params #{arguments[:params]}", :debug)
+     @queued_count += 1
+     @hydra.queue(request)
+   end
+
+   def merge_request_parameters(url)
+     url_params = URI::parse(url).extend(Utils::URIAddition).query_hash
+     return @request_arguments[:params] || {} unless url_params && url_params.respond_to?(:merge)
+
+     params = symbolize_keys(@request_arguments[:params] ||= {})
+     url_params.merge(params)
+   end
+
+   def handle_response(response)
+     @response_count += 1
+     @loop = prepare_loop(response)
+     log("response ##{@response_count} of #{@queued_count}, status code: [#{response.code}], URL fragment: ...#{response.effective_url.split('/').last if response.effective_url}")
+     @loop.run
+
+     @environment = @loop.environment
+     run_hook(:data, [@loop.records, response])
+   end
+
+   def prepare_loop(response)
+     format = @options[:format] || detect_format(response.headers_hash.fetch('Content-Type', nil))
+     extractor_class = format == :json ? JsonExtractor : DomExtractor
+     loop_extractor = extractor_class.new(*@loop_extractor_args)
+     extractors = @extractor_args.map { |args| extractor_class.new(*args) }
+     ExtractionLoop.new(loop_extractor, extractors, response.body, @hooks, self)
+   end
+
+   def detect_format(content_type)
+     #TODO: add support for xml/rdf documents
+     if content_type && content_type =~ /json$/
+       :json
+     else
+       :html
+     end
+   end
+
+ end
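
Putting the public API above together, a hypothetical end-to-end example (the URL, selectors, and field names are made up; the #on hook registration is an assumption, since Hookable's interface is not shown in this section):

    require "extraloop"

    scraper = ScraperBase.new("http://example.com/archive?page=1", :async => false)

    scraper.loop_on("div.entry")                          # node list to iterate over
    scraper.extract(:title, "h2")                         # one extractor per record field
    scraper.extract(:url, "a", :href)
    scraper.on(:data) { |records, response| p records }   # assumes Hookable registers hooks via #on
    scraper.run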
@@ -0,0 +1,75 @@
+ module Utils
+   module ScrapingHelpers
+     #
+     # Generates a proc that iterates over a list of anchors
+     # and collects the value of the specified parameter
+     #
+     def values_for_param(param)
+       lambda { |nodeList|
+         nodeList.collect {|node|
+           query = URI::parse(node.attr(:href)).query
+           query.split("&").collect { |token| token.split("=") }.
+             detect{ |chunks| chunks.first == param.to_s }.last
+         }.uniq
+       }
+     end
+   end
+
+   module URIAddition
+     #
+     # Public
+     #
+     # Generates a hash representation of a uri's query string.
+     #
+     # Returns a hash mapping the URL query parameters to their respective values
+     #
+     # NOTE: this is intended as a decorator method for instances of URI::HTTP.
+     #
+     # examples:
+     #
+     #   URI::parse(url).extend(URIAddition).query_hash
+     #
+
+     def query_hash
+       return unless self.query
+       self.query.split("&").reduce({}) do |memo, item|
+         param, value = *item.split("=")
+         memo.merge(param.to_sym => value)
+       end
+     end
+   end
+
+   module DeepFetchable
+     def get_in(path)
+       keys, node = Array(path), self
+
+       keys.each_with_index do |key, index|
+         node = node[key]
+         next_key = keys[index + 1]
+         break unless node
+       end
+
+       node
+     end
+   end
+
+   module Support
+     def symbolize_keys(hash)
+       hash.reduce({}) { |memo, (k,v)| memo.merge(k.to_sym => v) }
+     end
+     #
+     # Creates instance variables from a hash.
+     #
+     # hash     - A hash of instance variables to be created.
+     # defaults - A hash of the attributes' default values (optional).
+     #
+     protected
+     def set_attributes(hash, defaults={})
+       allowed = defaults.keys
+       hash.each { |key, value| self.instance_variable_set("@#{key}", value)}
+       defaults.each do |key, value|
+         self.instance_variable_set("@#{key}", value) unless self.instance_variable_get("@#{key}")
+       end
+     end
+   end
+ end
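
To illustrate the two decorator modules above, a short sketch (the URL and hash contents are made up):

    require "uri"
    require "extraloop"

    # URIAddition#query_hash: read a URL's query string as a hash.
    uri = URI::parse("http://example.com/search?q=ruby&page=2").extend(Utils::URIAddition)
    uri.query_hash                  # => {:q => "ruby", :page => "2"}

    # DeepFetchable#get_in: walk a nested hash along a path of keys.
    data = { "meta" => { "count" => 42 } }.extend(Utils::DeepFetchable)
    data.get_in(["meta", "count"])  # => 42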
data/lib/extraloop.rb ADDED
@@ -0,0 +1,43 @@
+ base_path = File.expand_path(File.dirname(__FILE__) + "/extraloop")
+
+ # Standard library
+ autoload :OpenStruct, "ostruct"
+
+ # Rubygems
+
+ gem "yajl-ruby"
+ gem "nokogiri"
+ gem "typhoeus"
+ gem "logging"
+
+
+ autoload :Nokogiri, "nokogiri"
+ autoload :Yajl,     "yajl"
+ autoload :Typhoeus, "typhoeus"
+
+
+ # Extraloop components
+
+ autoload :Utils,                 "#{base_path}/utils"
+ autoload :ExtractionEnvironment, "#{base_path}/extraction_environment"
+ autoload :ExtractorBase,         "#{base_path}/extractor_base"
+ autoload :DomExtractor,          "#{base_path}/dom_extractor"
+ autoload :JsonExtractor,         "#{base_path}/json_extractor"
+ autoload :ExtractionLoop,        "#{base_path}/extraction_loop"
+ autoload :ScraperBase,           "#{base_path}/scraper_base"
+ autoload :Loggable,              "#{base_path}/loggable"
+ autoload :Hookable,              "#{base_path}/hookable"
+ autoload :IterativeScraper,      "#{base_path}/iterative_scraper"
+
+
+ # Monkey patch ScraperBase with the Loggable module.
+ #
+ # Referencing the constants below is the equivalent of adding extraloop/ to the load path and requiring both ScraperBase and Loggable.
+ #
+ ScraperBase
+ Loggable
+
+
+ class ExtraLoop
+   VERSION = '0.0.1'
+ end
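
As a side note on the two bare constant references near the end: Kernel#autoload registers a file to be required the first time its constant is used, so simply naming ScraperBase and then Loggable forces both files to load in that order and applies the logging monkey patch. A standalone sketch of the same idea (paths here are illustrative):

    autoload :ScraperBase, "extraloop/scraper_base"
    autoload :Loggable,    "extraloop/loggable"

    ScraperBase # first reference: requires scraper_base.rb
    Loggable    # requires loggable.rb, which reopens ScraperBase and includes Loggable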
@@ -0,0 +1,165 @@
+ require 'helpers/spec_helper'
+
+ describe DomExtractor do
+   before(:each) do
+     stub(scraper = Object.new).options
+     stub(scraper).results
+     @env = ExtractionEnvironment.new(scraper)
+     @html ||= <<-EOF
+       <div class="entry">
+         <p><a href="http://example.com">my dummy link</a></p>
+       </div>
+       <div class="entry exclude" />
+       <div class="entry" />
+     EOF
+
+     @xml ||= <<-EOF
+       <?xml version="1.0"?>
+       <StandardDataObject xmlns="myns">
+         <InteractionElements>
+           <TargetCenter>92f4-MPA</TargetCenter>
+           <Trace>7.19879</Trace>
+         </InteractionElements>
+       </StandardDataObject>
+     EOF
+   end
+
+   describe "#new" do
+     subject { DomExtractor.new(:my_field, @env, "p a", :href) }
+     it { subject.field_name.should eql(:my_field) }
+   end
+
+   context "when no attribute is provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, "p a")
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("my dummy link") }
+     end
+   end
+
+
+   context "when an attribute is provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, "p a", :href)
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("http://example.com") }
+     end
+   end
+
+   context "when a selector and a block are provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, "p a", proc { |node|
+         node.text.gsub("dummy", "fancy")
+       })
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should match(/my fancy/) }
+     end
+   end
+
+   context "when only a block is provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, proc { |document|
+         document.at_css("p a").text.gsub(/dummy/,'fancy')
+       })
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should match(/my fancy/) }
+     end
+   end
+
+   context "when only an attribute is provided" do
+     before do
+       @extractor = DomExtractor.new(:url, @env, :href)
+       @node = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
+     end
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("hello-world") }
+     end
+   end
+
+   context "when nothing but a field name is provided" do
+     before do
+       @extractor = DomExtractor.new(:url, @env)
+       @node = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
+     end
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("Hello") }
+     end
+   end
+
+   describe "extract_list" do
+     context "no block provided" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "div.entry")
+         @node = @extractor.parse(@html)
+       end
+
+       subject { @extractor.extract_list(@node) }
+       it { subject.should have(3).items }
+     end
+
+     context "block provided" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "div.entry", lambda { |nodeList|
+           nodeList.reject {|node| node.attr(:class).split(" ").include?('exclude') }
+         })
+       end
+
+       subject { @extractor.extract_list(@html) }
+       it { subject.should have(2).items }
+     end
+   end
+
+   context "xml input" do
+     describe "#parse" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "entry")
+       end
+
+       subject { @extractor.parse(@xml) }
+       it { should be_an_instance_of(Nokogiri::XML::Document) }
+     end
+   end
+
+
+   context "html input" do
+     describe "#parse" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "entry")
+       end
+
+       subject { @extractor.parse(@html) }
+       it { should be_an_instance_of(Nokogiri::HTML::Document) }
+     end
+   end
+
+   context "non-string input" do
+     describe "#parse" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "entry")
+       end
+
+       it "should raise an exception" do
+         expect { @extractor.parse(Nokogiri::HTML(@html)) }.to raise_exception(ExtractorBase::Exceptions::ExtractorParseError)
+       end
+
+     end
+   end
+ end
@@ -0,0 +1,76 @@
+ require 'helpers/spec_helper'
+
+ describe ExtractionLoop do
+
+   before(:each) do
+     @fake_scraper = stub!.options
+     stub(@fake_scraper).results
+   end
+
+   describe "#new" do
+     before do
+       @mock_loop = Object.new
+       stub(@mock_loop).parse {}
+
+     end
+
+     subject { ExtractionLoop.new(@mock_loop) }
+
+     it "should allow read/write access to public attributes" do
+
+       {:extractors => [:fake, :fake],
+        :document => nil,
+        :hooks => { }
+       }.each do |k, v|
+         subject.send("#{k}=", v)
+         subject.send(k).should eql(v)
+       end
+     end
+   end
+
+   describe "run" do
+     before(:each) do
+
+       @extractors = [:a, :b].map do |field_name|
+         object = Object.new
+         stub(object).extract_field { |node, record| node[field_name] }
+         stub(object).field_name { field_name }
+         object
+       end
+
+       @loop_extractor = Object.new
+
+       stub(@loop_extractor).parse { |input| Nokogiri::HTML("<html><body>Hello test!</body></html>") }
+
+       stub(@loop_extractor).extract_list { |document|
+         # list of fake dom elements
+         (0..9).to_a.map { |n| {:a => n, :b => n*n } }
+       }
+
+
+       before, before_extract, after_extract, after = *(1..4).to_a.map { proc {} }
+       hooks = {before: [before], before_extract: [before_extract], after_extract: [after_extract], after: [after]}
+
+       any_instance_of(ExtractionEnvironment) do |env|
+         mock(env).run.with_any_args.times(20 + 2)
+       end
+
+       @extraction_loop = ExtractionLoop.new(@loop_extractor, @extractors, "fake document", hooks, @fake_scraper).run
+     end
+
+     subject { @extraction_loop.run }
+
+     it "should produce 10 records" do
+       @extraction_loop.records.size.should eql(10)
+     end
+
+     it "should run extractors" do
+       @extraction_loop.records.all? { |record| record.a && record.b && record.b == record.a ** 2 }
+     end
+
+     it "should convert extracted records into OpenStruct instances" do
+       @extraction_loop.records.all? { |record| record.is_a?(OpenStruct) }
+     end
+
+   end
+ end