scrubber-scrubyt 0.4.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. data/CHANGELOG +343 -0
  2. data/COPYING +340 -0
  3. data/README +99 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
  6. data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
  7. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  8. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  9. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  10. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  11. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  12. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  13. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  14. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  15. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  16. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  17. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  18. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  19. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  20. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  21. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  22. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  23. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  24. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  25. data/lib/scrubyt/core/shared/extractor.rb +167 -0
  26. data/lib/scrubyt/logging.rb +154 -0
  27. data/lib/scrubyt/output/post_processor.rb +139 -0
  28. data/lib/scrubyt/output/result.rb +44 -0
  29. data/lib/scrubyt/output/result_dumper.rb +154 -0
  30. data/lib/scrubyt/output/result_node.rb +140 -0
  31. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  32. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  33. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  34. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  35. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  36. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  37. data/lib/scrubyt.rb +43 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +115 -0
@@ -0,0 +1,112 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Filter out relevant pieces from the parent pattern</tt>
4
+ #
5
+ #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
6
+ #it reaches the bottom. The biggest difference is that instead of water, a HTML
7
+ #document travels through the space.
8
+ #
9
+ #Of course Scrubyt would not make much sense if the same document would arrive at
10
+ #the bottom that was poured in at the top - since in this case we might use an
11
+ #indentity transformation (i.e. do nothing with the input) as well.
12
+ #
13
+ #This is where filters came in: as they name says, they filter the stuff that is
14
+ #pouring from above, to leave the interesting parts and discard the rest.
15
+ #The working of a filter will be explained most easily by the help of an example.
16
+ #Let's consider that we would like to extract information from a webshop; Concretely
17
+ #we are interested in the name of the items and the URL pointing to the image of the
18
+ #item.
19
+ #
20
+ #To accomplish this, first we select the items with the pattern item (a pattern is
21
+ #a logical grouping of fillters; see Pattern documentation) Then our new
22
+ #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
23
+ #extract the name and the image of the item; and finally, extract the href attribute
24
+ #of the image. Let's see an illustration:
25
+ #
26
+ # root --> This pattern is called a 'root pattern', It is invisible to you
27
+ # | and basically it represents the document; it has no filters
28
+ # +-- item --> Filter what's coming from above (the whole document) to get
29
+ # | relevant pieces of data (in this case webshop items)
30
+ # +-- name --> Again, filter what's coming from above (a webshop item) and
31
+ # | leave only item names after this operation
32
+ # +-- image --> This time filter the image of the item
33
+ # |
34
+ # +-- href --> And finally, from the image elements, get the attribute 'href'
35
+ class BaseFilter
36
+ #Type of the example this filter is extracted with
37
+
38
+ #XPath example, like html/body/tr/td[1] etc.
39
+ EXAMPLE_TYPE_XPATH = 0
40
+ #String from the document, for example 'Canon EOS 300 D'.
41
+ EXAMPLE_TYPE_STRING = 1
42
+ #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
43
+ EXAMPLE_TYPE_IMAGE = 2
44
+ #No example - the actual XPath is determined from the children XPaths (their LCA)
45
+ EXAMPLE_TYPE_CHILDREN = 3
46
+
47
+ #Regexp example, like /\d+@*\d+[a-z]/
48
+ EXAMPLE_TYPE_REGEXP = 4
49
+ #Compound example, like :contains => 'goodies'
50
+ EXAMPLE_TYPE_COMPOUND = 5
51
+
52
+ attr_accessor(:example_type, :parent_pattern, :temp_sink,
53
+ :constraints, :xpath, :regexp, :example, :final_result)
54
+
55
+ def self.create(parent_pattern, example=nil)
56
+ filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
57
+ if filter_name == 'RootFilter'
58
+ BaseFilter.new(parent_pattern, example)
59
+ else
60
+ instance_eval("#{filter_name}.new(parent_pattern, example)")
61
+ end
62
+ end
63
+
64
+ #Dispatcher method to add constraints; of course, as with any method_missing, this method
65
+ #should not be called directly
66
+
67
+ #TODO still used?
68
+ alias_method :throw_method_missing, :method_missing
69
+ def method_missing(method_name, *args, &block)
70
+ case method_name.to_s
71
+ when /^ensure.+/
72
+ constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
73
+ else
74
+ throw_method_missing(method_name, *args, &block)
75
+ end
76
+ end
77
+
78
+ private
79
+ #We don't want this to be accessible from outside
80
+ def initialize(parent_pattern, example)
81
+ case parent_pattern.example_type
82
+ when :xpath
83
+ @example_type = EXAMPLE_TYPE_XPATH
84
+ else
85
+ @example_type = BaseFilter.determine_example_type(example)
86
+ end
87
+ @parent_pattern = parent_pattern
88
+ @example = example
89
+ @xpath = nil #The xpath to evaluate this filter
90
+ @constraints = [] #list of constraints
91
+ end
92
+
93
+ def self.determine_example_type(example)
94
+ if example.instance_of? Regexp
95
+ EXAMPLE_TYPE_REGEXP
96
+ elsif example.instance_of? Hash
97
+ EXAMPLE_TYPE_COMPOUND
98
+ else
99
+ case example
100
+ when nil
101
+ EXAMPLE_TYPE_CHILDREN
102
+ when /\.(jpg|png|gif|jpeg)(\[\d+\])?$/
103
+ EXAMPLE_TYPE_IMAGE
104
+ when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*(\[@.+=.+\])?(\/@.+)?$/
105
+ (example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
106
+ else
107
+ EXAMPLE_TYPE_STRING
108
+ end
109
+ end
110
+ end #end of method
111
+ end #End of class
112
+ end #End of module
@@ -0,0 +1,9 @@
1
module Scrubyt
  #Filter that ignores whatever arrives from the parent pattern and always
  #yields the constant example value it was configured with.
  class ConstantFilter < BaseFilter

    #Returns @example unconditionally; +source+ is accepted for interface
    #compatibility with the other filters but is never inspected.
    def evaluate(source)
      @example
    end

  end #End of class ConstantFilter
end #End of module Scrubyt
@@ -0,0 +1,37 @@
1
module Scrubyt
  #Filter that follows a link to a detail page and evaluates a separate
  #extractor (@parent_pattern.referenced_extractor) against that page.
  class DetailPageFilter < BaseFilter

    #Fetch the detail page referenced by +source+ and return the detail
    #extractor's root results.
    #
    #+source+ is either the URL itself (a String) or a document node, in which
    #case the nearest node carrying an 'href' attribute supplies the URL.
    def evaluate(source)
      if source.is_a?(String)
        url = source
      else
        url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
      end
      #Save the current page and host name so the extractor can be put back
      #into its original state after the detour to the detail page
      @parent_pattern.extractor.store_page
      original_host_name = @parent_pattern.extractor.get_host_name
      @parent_pattern.extractor.restore_host_name

      begin
        FetchAction.fetch url, :resolve => @parent_pattern.resolve
      rescue
        #NOTE(review): bare rescue swallows every StandardError (parse errors,
        #typos in FetchAction, ...), not only HTTP failures - consider logging
        #the actual exception
        Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
      end


      #Build the detail extractor lazily on the first call; on subsequent
      #calls only re-evaluate it against the freshly fetched page
      if @detail_extractor.nil?
        @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
        root_results = @detail_extractor.result
      else
        root_results = @detail_extractor.evaluate_extractor
      end



      #Return to the page we came from before yielding the detail results
      @parent_pattern.extractor.restore_page
      @parent_pattern.extractor.store_host_name original_host_name

      root_results
    end

  end
end
@@ -0,0 +1,64 @@
1
require 'net/http'
require 'fileutils'

module Scrubyt
  #Filter that downloads the file referenced by its input to the directory
  #given as the filter's example (@example), returning the saved file name.
  class DownloadFilter < BaseFilter

    #Download the file referenced by +source+ (an absolute or relative URL
    #string); returns the basename the file was stored under, '' for inputs
    #too short to be a URL, or nil when the file is on the except list.
    def evaluate(source)
      download_file(source)
    end #end of method

    private
    def download_file(source)
      return '' if source.size < 4
      #Absolute URLs carry their own host; relative ones borrow the
      #extractor's current host
      host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
      outfile = nil
      host_name += "/" if host_name[-1..-1] != "/"
      base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
      file_name = source.scan(/.+\/(.*)/)[0][0]
      return nil if @parent_pattern.except.include? file_name
      Net::HTTP.start(base_url) { |http|
        Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
        begin
          ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
          #[0][0] added: scan returns [["..."]] and Net::HTTP#get needs a
          #String path, not an Array (the original passed the raw scan result)
          #NOTE(review): for relative sources this path is derived from the
          #host name, not from +source+ - verify it reaches the right file
          path = host_name.scan(/http:\/\/#{base_url}(.+)\//)[0][0]
          resp = http.get(path, {'User-Agent'=> ua})
          outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
          FileUtils.mkdir_p @example
          open(outfile, 'wb') {|f| f.write(resp.body) }
        rescue Timeout::Error
          #Mark the download as failed but keep going; the marker ends up in
          #the extraction result
          outfile = "[FAILED]#{file_name}"
        end
      }
      outfile.scan(/.+\/(.*)/)[0][0]
    end

    #Return a file name that does not exist yet: first collision appends _1
    #(before the extension if there is one), later collisions bump the counter.
    def self.find_nonexisting_file_name(file_name)
      already_found = false
      loop do
        #File.exist? - the exists? alias was removed in Ruby 3.2
        if File.exist? file_name
          if already_found
            if file_name.include?('.')
              last_no = file_name.scan(/_(\d+)\./)[0][0]
              file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
            else
              last_no = file_name.scan(/_(\d+)$/)[0][0]
              file_name.sub!(/_#{last_no}$/) {"_#{(last_no.to_i+1).to_s}"}
            end
          else
            if file_name.include?('.')
              file_name.sub!(/\./) {"_1\."}
              already_found = true
            else
              file_name << '_1'
              already_found = true
            end
          end
        else
          break
        end
      end
      file_name
    end #end of method
  end #End of class DownloadFilter
end #End of module Scrubyt
@@ -0,0 +1,9 @@
1
module Scrubyt
  #Filter that yields the raw markup inside the matched element instead of
  #its extracted text.
  class HtmlSubtreeFilter < BaseFilter

    #Serialize the children of +source+ (an Hpricot element) to an HTML string.
    def evaluate(source)
      source.inner_html
    end

  end #End of class HtmlSubtreeFilter
end #End of module Scrubyt
@@ -0,0 +1,13 @@
1
module Scrubyt
  #Filter that applies the example Regexp (@example) to its input and yields
  #the flattened list of captures/matches.
  class RegexpFilter < BaseFilter

    #Scan +source+ with @example. A non-String source (an Hpricot node) is
    #first reduced to its tag-stripped inner HTML.
    def evaluate(source)
      text = source.is_a?(String) ? source : source.inner_html.gsub(/<.*?>/, '')
      text.scan(@example).flatten
    end

  end #End of class RegexpFilter
end #End of module Scrubyt
@@ -0,0 +1,11 @@
1
module Scrubyt
  #Filter that delegates extraction to a user-supplied callable (@example),
  #passing it the textual content of the input.
  class ScriptFilter < BaseFilter

    #Call @example with +source+ itself when it is a String, otherwise with
    #the tag-stripped inner HTML of the node.
    def evaluate(source)
      if source.is_a? String
        @example.call source
      else
        @example.call source.inner_html.gsub(/<.*?>/, "")
      end
    end

  end #End of class ScriptFilter
end #End of module Scrubyt
@@ -0,0 +1,34 @@
1
module Scrubyt
  #Filter driven by a textual example of the form "element[text]:index",
  #e.g. "td[Canon]:0", "td[Canon]:all", or "find(foo|bar)".
  class TextFilter < BaseFilter

    #Evaluate the text example against +source+ and return the matching
    #element(s) (walked up to the element named in the example), [] / "" when
    #nothing matches.
    def evaluate(source)
      return find_string(source) if @example =~ /^find\(/
      final_element_name = @example.scan(/^(.+?)\[/)[0][0]
      text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])

      #Optional trailing index: "]:3" picks one match, "]:all" keeps them all,
      #absent means index 0 (nil.to_i == 0).
      #Rewritten with an explicit flag: the original kept index as either an
      #Integer or ["all"] and tested it with `index.is_a? Fixnum` - Fixnum was
      #removed in Ruby 3.2, and `index[0] == "all"` on an Integer silently
      #performed bit access instead of the intended comparison.
      index_spec = @example.scan(/\]:(.+)/).flatten[0]
      all_matches = (index_spec == "all")
      matches = SharedUtils.traverse_for_match(source, /#{text}/)
      result = all_matches ? matches : matches[index_spec.to_i]
      return "" unless result

      if all_matches
        result.inject([]) {|a,r| a << XPathUtils.traverse_up_until_name(r,final_element_name); a}
      else
        [XPathUtils.traverse_up_until_name(result,final_element_name)]
      end
    end

    #Handle "find(a|b|...)" examples: return the first alternative that occurs
    #(case-insensitively) anywhere in +source+, wrapped in an array, or [].
    def find_string(source)
      str = @example.scan(/find\((.+)\)/).flatten[0]
      strings_to_find = str.include?('|') ? str.split('|') : [str]
      strings_to_find.each do |s|
        result = SharedUtils.traverse_for_match(source,/#{s}/i)
        return [s] unless result.empty?
      end
      return []
    end

  end #End of class TextFilter
end #End of module Scrubyt
34
+
@@ -0,0 +1,138 @@
1
module Scrubyt
  #The workhorse filter: evaluates an XPath (generated from the example)
  #against the document tree, optionally narrowing the result with a
  #generated regexp or an attribute selector.
  class TreeFilter < BaseFilter

    #Evaluate this filter against +source+ (an Hpricot document/element).
    #Returns the matched elements, their attribute values when the XPath ends
    #in /@attr, or regexp capture strings when @regexp is set.
    def evaluate(source)
      #A string example may already have produced a final result at XPath
      #generation time; just replay it
      return [@final_result] if @final_result
      #Crude hack! Drop it after it will be supported in Hpricot
      if @xpath =~ /.+\/@.+$/
        #Hpricot can't select attributes, so strip the /@attr tail off the
        #XPath and remember it in @example for the attribute step below
        @example = @xpath
        @xpath = @xpath.scan(/^(.+?)\/@/)[0][0]
      end
      result = source/@xpath

      Scrubyt.log :ACTION, "Evaluating #{@parent_pattern.name} with #{@xpath}"

      #Normalize: a single node becomes a one-element list
      xpath_results = Hpricot::Elements === result ? result : [result]

      if @example =~ /.+\/@.+$/
        #Attribute selection deferred from the hack above
        result_attribute = @example.scan(/.+\/@(.+?)$/)[0][0]
        xpath_results.map! {|r| r.attributes[result_attribute] }
      end
      if @regexp == nil
        xpath_results
      else
        regexp_results = []
        xpath_results.each do |entry|
          #NOTE(review): this uses result.inner_html, not entry - every
          #iteration scans the same (whole) result; looks like it was meant
          #to be entry.inner_html - confirm before changing
          text = SharedUtils.prepare_text_for_comparison(result.inner_html)
          if text =~ @regexp
            regexp_results << $1
          end
        end
        regexp_results
      end
    end

    #For string examples matched inside a larger text node, build a regexp
    #that re-selects the example's position: changing parts of the text become
    #.*? and the selected part becomes the (.*?) capture. No-op unless the
    #example is a string, a node sink exists, and the match is a proper
    #substring of the node text.
    def generate_regexp_for_example
      return if @example_type != EXAMPLE_TYPE_STRING
      return if @temp_sink.nil?
      return if @temp_sink.is_a? String
      return if @example =~ /.+\[.+\]$/

      text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
      match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
      return if match_range == (0..text.length)

      @regexp = text
      #Replace ranges back-to-front so earlier offsets stay valid
      @temp_sink.changing_ranges.sort.reverse.each do |range|
        @regexp[range] = if range == match_range then '<<<regexp_selection>>>' else '<<<regexp_changing>>>' end
      end
      #Escape first, then swap the placeholders for regexp syntax
      @regexp = Regexp.escape(@regexp)
      @regexp = @regexp.gsub('<<<regexp_changing>>>', '.*?')
      @regexp = @regexp.gsub('<<<regexp_selection>>>', '(.*?)')
      @regexp = '^' + @regexp + '$'
      @regexp = /#{@regexp}/
    end


    #For all the tree patterns, generate an XPath based on the given example
    #Also this method should not be called directly; It is automatically called for every tree
    #pattern directly after wrapper definition
    def generate_XPath_for_example(next_page_example=false)
      #puts "generating example for: #{@parent_pattern.name}"
      #puts @example_type
      case @example_type
      when EXAMPLE_TYPE_XPATH
        #Trivial case: the example already is the XPath
        @xpath = @example
      when EXAMPLE_TYPE_STRING
        @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
                                                             @example,
                                                             next_page_example)
        return if @temp_sink == nil
        #A String sink means the lookup already produced the final value
        if @temp_sink.is_a? String
          @final_result = @temp_sink
          return
        end

        #Attach the matched character range to the sink node so that
        #generate_regexp_for_example can later tell selected from changing text
        mark_changing_ranges = lambda { |element, range|
          element.instance_eval do
            @changing_ranges ||= [] << range
            def changing_ranges
              @changing_ranges
            end
          end
        }
        mark_changing_ranges.call(@temp_sink, @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0))
        #Next-page links must stay exact; otherwise indices depend on whether
        #the pattern generalizes
        write_indices = next_page_example ? true : !@parent_pattern.generalize
        @xpath = XPathUtils.generate_XPath(@temp_sink, nil, write_indices)
      when EXAMPLE_TYPE_CHILDREN
        #No own example: derive this pattern's XPath as the lowest common
        #ancestor of the children's sinks, one filter per example index
        current_example_index = 0
        loop do
          all_child_temp_sinks = []
          @parent_pattern.children.each do |child_pattern|
            all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink if child_pattern.filters[current_example_index].temp_sink
          end
          result = all_child_temp_sinks.pop
          if all_child_temp_sinks.empty?
            #Single child: its parent is the common ancestor
            result = result.parent
          else
            all_child_temp_sinks.each do |child_sink|
              result = XPathUtils.lowest_common_ancestor(result, child_sink)
            end
          end
          xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
                                               XPathUtils.generate_XPath(result, nil, true)
          #Grow the filter list on demand for additional example indices
          if @parent_pattern.filters.size < current_example_index + 1
            @parent_pattern.filters << Scrubyt::BaseFilter.create(@parent_pattern)
          end
          @parent_pattern.filters[current_example_index].xpath = xpath
          @parent_pattern.filters[current_example_index].temp_sink = result
          #Rewrite each child's XPath relative to the freshly found ancestor
          @parent_pattern.children.each do |child_pattern|
            next if child_pattern.type == :detail_page
            child_pattern.filters[current_example_index].xpath =
              child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
                                         XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)
          end
          break if @parent_pattern.children[0].filters.size == current_example_index + 1
          current_example_index += 1
        end
      when EXAMPLE_TYPE_IMAGE
        @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
        @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
      when EXAMPLE_TYPE_COMPOUND
        @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
                                                                          @example,
                                                                          next_page_example)
        @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
                                              XPathUtils.generate_XPath(@temp_sink, nil, true)
      end
    end

    #Rewrite @xpath to be relative to +parent_xpath+ (expanding attribute
    #predicates in the parent first), but only when @xpath is still absolute.
    def generate_relative_XPath(parent_xpath)
      parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
                                              parent_xpath,
                                              @parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
      @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
    end

  end #End of class TreeFilter
end #End of module Scrubyt