RubyGems - sutch-scrubyt - Versions diffs - 0.4.20 - Mend

sutch-scrubyt 0.4.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

data/CHANGELOG +350 -0
data/COPYING +340 -0
data/README +121 -0
data/Rakefile +101 -0
data/lib/scrubyt.rb +45 -0
data/lib/scrubyt/core/navigation/agents/firewatir.rb +253 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/constraint.rb +169 -0
data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
data/lib/scrubyt/core/scraping/pattern.rb +359 -0
data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
data/lib/scrubyt/core/shared/extractor.rb +168 -0
data/lib/scrubyt/logging.rb +154 -0
data/lib/scrubyt/output/post_processor.rb +139 -0
data/lib/scrubyt/output/result.rb +44 -0
data/lib/scrubyt/output/result_dumper.rb +154 -0
data/lib/scrubyt/output/result_node.rb +140 -0
data/lib/scrubyt/output/scrubyt_result.rb +42 -0
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
data/lib/scrubyt/utils/shared_utils.rb +58 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
data/lib/scrubyt/utils/xpathutils.rb +202 -0
data/test/blackbox_test.rb +60 -0
data/test/blackbox_tests/basic/multi_root.rb +6 -0
data/test/blackbox_tests/basic/simple.rb +5 -0
data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
metadata +117 -0

data/lib/scrubyt/core/scraping/constraint.rb ADDED Viewed

@@ -0,0 +1,169 @@
+module Scrubyt
+  ##
+  #=<tt>Rejecting result instances based on further rules</tt>
+  #
+  #The two  most trivial problems with a set of rules is that they match either less
+  #or more instances than we would like them to. Constraints are a way to remedy the second problem:
+  #they serve as a tool to filter out some result instances based on rules. A typical
+  #example:
+  #
+  #* *ensure_presence_of_ancestor_pattern* consider this model:
+  #    <book>
+  #      <author>...</author>
+  #      <title>...</title>
+  #    </book>
+  #
+  #If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
+  #'author' and 'title', only those books will be matched which have an author and a
+  #title (i.e.the child patterns author and title must extract something). This is a way
+  #to say 'a book MUST have an author and a title'.
+  class Constraint
+    #There are more possible ways of applying/checking constraints in the case of
+    #ones that can not be checked in the context node (e.g. ensure_presence_of -
+    #since it may require the evaluation of child patterns of the context pattern to
+    #arbitray level)
+    #
+    #In such cases, the possibilities are:
+    #
+    #1) make a depth-first evaluation from the context pattern until the needed ancestor
+    #   pattern is evaluated. This can mess things up, since if any ancestor node uses
+    #   the sinks of predecessor(s) other than the context node, those need to be evaluated
+    #   too, and we may run into a cyclyc dependency or at least a complicated recursion
+    #
+    #2) Post processing - evaluate normally and throw out results which do not pass the
+    #   constraint
+    #
+    #2b) Do it on the XML level - most probably this solution will be implemented
+    # Different constraint types
+    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
+    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
+    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
+    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
+    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4
+    attr_reader :type, :target
+    #Add 'ensure presence of ancestor pattern' constraint
+    #If this type of constraint is added to a pattern, it must have an ancestor pattern
+    #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
+    #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
+    #(just by looking at the wrapper model, the ancestor pattern is always present)
+    #Note that from this type of constraint there is no 'ensure_absence' version, since
+    #I could not think about an use case for that
+    def self.add_ensure_presence_of_pattern(ancestor)
+      Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
+    end
+    #Add 'ensure absence of attribute' constraint
+    #If this type of constraint is added to a pattern, the HTML node it targets
+    #must NOT have an attribute named "attribute_name" with the value "attribute_value"
+    def self.add_ensure_absence_of_attribute(attribute_hash)
+      Constraint.new(attribute_hash,
+                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
+    end
+    #Add 'ensure presence of attribute' constraint
+    #If this type of constraint is added to a pattern, the HTML node it targets
+    #must have an attribute named "attribute_name" with the value "attribute_value"
+    def self.add_ensure_presence_of_attribute(attribute_hash)
+      Constraint.new(attribute_hash,
+                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
+    end
+    #Add 'ensure absence of ancestor node' constraint
+    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
+    #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
+    #
+    #"attributes" is an array of hashes, for example
+    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
+    #in the case that more values have to be checked with the same key (e.g. 'class' => 'small' and '
+    #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
+    #
+    #"attributes" can be empty - in this case just the 'node_name' is checked
+    def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
+      Constraint.new([node_name, attributes],
+                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
+    end
+    #Add 'ensure presence of ancestor node' constraint
+    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
+    #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
+    #
+    #"attributes" is an array of hashes, for example
+    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
+    #in the case that more values have to be checked with the same key (e.g. 'class' => 'small' and '
+    #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
+    #
+    #"attributes" can be empty - in this case just the 'node_name' is checked
+    def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
+      Constraint.new([node_name, attributes],
+                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
+    end
+    #Evaluate the constraint; if this function returns true,
+    #it means that the constraint passed, i.e. its filter will be added to the exctracted
+    #content of the pattern
+    def check(result)
+      case @type
+        #checked after evaluation, so here always return true
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
+          return true
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE
+          attribute_present(result)
+        when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
+          !attribute_present(result)
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
+          ancestor_node_present(result)
+        when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
+          !ancestor_node_present(result)
+      end
+    end
+  private
+    #We would not like these to be called from outside
+    def initialize(target, type)
+      @target = target
+      @type = type
+    end
+    #Implementation of the ancestor node presence test
+    #Check the documentation of the add_ensure_presence_of_ancestor_node method
+    #for further information on the result parameter
+    def ancestor_node_present(result)
+      found = false
+      node_name = @target[0]
+      node_attributes = @target[1]
+      node_attributes.each do |pair|
+        return true if !result.search("//#{node_name}[@#{pair[0]}='#{pair[1]}']").empty?
+      end
+      if node_attributes.empty?
+        return true if !result.search("//#{node_name}").empty?
+      end
+      false
+    end
+    def attribute_present(result)
+      return unless result.is_a? Hpricot::Elem
+      match = true
+      #If v = nil, the value of the attribute can be arbitrary;
+      #Therefore, in this case we just have to make sure that the attribute is
+      #present (i.e. != nil), we don't care about the value
+      @target.each do |k,v|
+        if v == nil
+            match &&= (result.attributes[k.to_s] != nil)
+          else
+            match &&= (result.attributes[k.to_s] == v.to_s)
+        end
+      end
+      match
+    end
+  end #end of class
+end #end of module

data/lib/scrubyt/core/scraping/constraint_adder.rb ADDED Viewed

@@ -0,0 +1,49 @@
+module Scrubyt
+  ##
+  #=<tt>Utility class for adding constraints</tt>
+  #
+  #Originally methods of Pattern - but since Pattern was already too heavy (and after
+  #all, adding a constraint (logically) does not belong to Pattern anyway) it was moved
+  #to this utility class. In pattern everything that begins with ensure_
+  #is automatically dispatched here.
+  #
+  #I will not document the functions since these are just forwarders; See the 'real'
+  #functions with their documentation in Scrubyt::Constraint.rb
+  class ConstraintAdder
+    def self.ensure_presence_of_pattern(ancestor_node_name)
+      Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
+    end
+    def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
+      Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
+                                                          prepare_attributes(attributes))
+    end
+    def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
+      Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
+                                                         prepare_attributes(attributes))
+    end
+    def self.ensure_presence_of_attribute(attribute_hash)
+      Constraint.add_ensure_presence_of_attribute(attribute_hash)
+    end
+    def self.ensure_absence_of_attribute(attribute_hash)
+      Constraint.add_ensure_absence_of_attribute(attribute_hash)
+    end
+    private
+    def self.prepare_attributes(attributes)
+      attribute_pairs = []
+      attributes.each do |key, value|
+        if (value.instance_of? Array)
+          value.each {|val| attribute_pairs << [key,val]}
+        else
+          attribute_pairs << [key, value]
+        end
+      end
+      return attribute_pairs
+    end #end of method prepare_attributes
+  end #end of class ConstraintAddere
+end #end of module Scrubyt

data/lib/scrubyt/core/scraping/filters/attribute_filter.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module Scrubyt
+  class AttributeFilter < BaseFilter
+    def evaluate(source)
+      elem = XPathUtils.find_nearest_node_with_attribute(source, @example)
+      if elem.is_a? Hpricot::Elem
+        return [elem.attributes[@example]]
+      else
+        return nil
+      end
+    end
+  end #End of class AttributeFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/base_filter.rb ADDED Viewed

@@ -0,0 +1,112 @@
+module Scrubyt
+  ##
+  #=<tt>Filter out relevant pieces from the parent pattern</tt>
+  #
+  #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
+  #it reaches the bottom. The biggest difference is that instead of water, a HTML
+  #document travels through the space.
+  #
+  #Of course Scrubyt would not make much sense if the same document would arrive at
+  #the bottom that was poured in at the top - since in this case we might use an
+  #indentity transformation (i.e. do nothing with the input) as well.
+  #
+  #This is where filters came in: as they name says, they filter the stuff that is
+  #pouring from above, to leave the interesting parts and discard the rest.
+  #The working of a filter will be explained most easily by the help of an example.
+  #Let's consider that we would like to extract information from a webshop; Concretely
+  #we are interested in the name of the items and the URL pointing to the image of the
+  #item.
+  #
+  #To accomplish this, first we select the items with the pattern item (a pattern is
+  #a logical grouping of fillters; see Pattern documentation) Then our new
+  #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
+  #extract the name and the image of the item; and finally, extract the href attribute
+  #of the image. Let's see an illustration:
+  #
+  #   root             --> This pattern is called a 'root pattern', It is invisible to you
+  #   |                    and basically it represents the document; it has no filters
+  #   +-- item         --> Filter what's coming from above (the whole document) to get
+  #       |                relevant pieces of data (in this case webshop items)
+  #       +-- name     --> Again, filter what's coming from above (a webshop item) and
+  #       |                leave only item names after this operation
+  #       +-- image    --> This time filter the image of the item
+  #           |
+  #           +-- href --> And finally, from the image elements, get the attribute 'href'
+  class BaseFilter
+    #Type of the example this filter is extracted with
+    #XPath example, like html/body/tr/td[1] etc.
+    EXAMPLE_TYPE_XPATH = 0
+    #String from the document, for example 'Canon EOS 300 D'.
+    EXAMPLE_TYPE_STRING = 1
+    #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
+    EXAMPLE_TYPE_IMAGE = 2
+    #No example - the actual XPath is determined from the children XPaths (their LCA)
+    EXAMPLE_TYPE_CHILDREN = 3
+    #Regexp example, like /\d+@*\d+[a-z]/
+    EXAMPLE_TYPE_REGEXP = 4
+    #Compound example, like :contains => 'goodies'
+    EXAMPLE_TYPE_COMPOUND = 5
+    attr_accessor(:example_type, :parent_pattern, :temp_sink,
+                  :constraints, :xpath, :regexp, :example, :final_result)
+    def self.create(parent_pattern, example=nil)
+      filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
+      if filter_name == 'RootFilter'
+        BaseFilter.new(parent_pattern, example)
+      else
+        instance_eval("#{filter_name}.new(parent_pattern, example)")
+      end
+    end
+    #Dispatcher method to add constraints; of course, as with any method_missing, this method
+    #should not be called directly
+    #TODO still used?
+    alias_method :throw_method_missing, :method_missing
+    def method_missing(method_name, *args, &block)
+      case method_name.to_s
+      when /^ensure.+/
+        constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
+      else
+        throw_method_missing(method_name, *args, &block)
+      end
+    end
+    private
+    #We don't want this to be accessible from outside
+    def initialize(parent_pattern, example)
+      case parent_pattern.example_type
+      when :xpath
+        @example_type = EXAMPLE_TYPE_XPATH
+      else
+        @example_type = BaseFilter.determine_example_type(example)
+      end
+      @parent_pattern = parent_pattern
+      @example = example
+      @xpath = nil                #The xpath to evaluate this filter
+      @constraints = [] #list of constraints
+    end
+    def self.determine_example_type(example)
+      if example.instance_of? Regexp
+        EXAMPLE_TYPE_REGEXP
+      elsif example.instance_of? Hash
+        EXAMPLE_TYPE_COMPOUND
+      else
+        case example
+        when nil
+          EXAMPLE_TYPE_CHILDREN
+        when /\.(jpg|png|gif|jpeg)(\[\d+\])?$/
+          EXAMPLE_TYPE_IMAGE
+        when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*(\[@.+=.+\])?(\/@.+)?$/
+         (example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
+        else
+          EXAMPLE_TYPE_STRING
+        end
+      end
+    end #end of method
+  end #End of class
+end #End of module

data/lib/scrubyt/core/scraping/filters/constant_filter.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module Scrubyt
+  class ConstantFilter < BaseFilter
+    def evaluate(source)
+      return @example
+    end
+  end #End of class ConstantFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb ADDED Viewed

@@ -0,0 +1,37 @@
+module Scrubyt
+  class DetailPageFilter < BaseFilter
+    def evaluate(source)
+      if source.is_a?(String)
+        url = source
+      else
+        url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
+      end
+      @parent_pattern.extractor.store_page
+      original_host_name = @parent_pattern.extractor.get_host_name
+      @parent_pattern.extractor.restore_host_name
+      begin
+        FetchAction.fetch url, :resolve => @parent_pattern.resolve
+      rescue
+        Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
+      end
+      if @detail_extractor.nil?
+        @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
+        root_results = @detail_extractor.result
+      else
+        root_results = @detail_extractor.evaluate_extractor
+      end
+      @parent_pattern.extractor.restore_page
+      @parent_pattern.extractor.store_host_name original_host_name
+      root_results
+    end
+  end
+end

data/lib/scrubyt/core/scraping/filters/download_filter.rb ADDED Viewed

@@ -0,0 +1,64 @@
+require 'net/http'
+require 'fileutils'
+module Scrubyt
+  #Filter that downloads the file +source+ points to into the directory named
+  #by the filter's example.
+  class DownloadFilter < BaseFilter
+    #User-Agent header sent with every download request
+    USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
+    #Download +source+; see download_file for the return value.
+    def evaluate(source)
+      download_file(source)
+    end #end of method
+private
+    #Fetch +source+ over HTTP and store the response body below the @example directory.
+    #
+    #Returns:
+    #* '' if +source+ is shorter than 4 characters
+    #* nil if the file name is listed in the parent pattern's +except+
+    #* "[FAILED]<file name>" if the request timed out
+    #* otherwise the name (without directory) under which the file was stored;
+    #  it differs from the requested name when that one was already taken
+    def download_file(source)
+      return '' if source.size < 4
+      host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
+      outfile = nil
+      host_name += "/" if host_name[-1..-1] != "/"
+      base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
+      file_name = source.scan(/.+\/(.*)/)[0][0]
+      return nil if @parent_pattern.except.include? file_name
+      Net::HTTP.start(base_url) { |http|
+        Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
+        begin
+          #The request path has to be a String; take the capture directly instead of
+          #the nested array that scan returns, and fall back to the root path when
+          #nothing follows the host
+          path = host_name[/http:\/\/#{Regexp.escape(base_url)}(.+)\//, 1] || '/'
+          resp = http.get(path, {'User-Agent'=> USER_AGENT})
+          outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
+          FileUtils.mkdir_p @example
+          File.open(outfile, 'wb') {|f| f.write(resp.body) }
+        rescue Timeout::Error
+          outfile = "[FAILED]#{file_name}"
+        end
+      }
+      #The failure marker contains no directory part, so strip the directory in a
+      #way that also works without a '/'
+      File.basename(outfile)
+    end
+    #Find a file name that is not taken yet.
+    #
+    #Returns +file_name+ itself if no such file exists. Otherwise a counter is
+    #inserted in front of the first '.' of the base name (or appended if the base
+    #name has no '.'): 'dir/a.jpg' becomes 'dir/a_1.jpg', then 'dir/a_2.jpg', ...
+    #Only the part after the last '/' is touched, so dots in directory names are
+    #left alone. The argument is not modified; a new string is returned.
+    def self.find_nonexisting_file_name(file_name)
+      return file_name unless File.exist?(file_name)
+      last_slash = file_name.rindex('/')
+      directory = last_slash ? file_name[0..last_slash] : ''
+      base_name = last_slash ? file_name[(last_slash + 1)..-1] : file_name
+      first_dot = base_name.index('.')
+      stem = first_dot ? base_name[0...first_dot] : base_name
+      suffix = first_dot ? base_name[first_dot..-1] : ''
+      counter = 1
+      counter += 1 while File.exist?("#{directory}#{stem}_#{counter}#{suffix}")
+      "#{directory}#{stem}_#{counter}#{suffix}"
+    end #end of method
+  end #End of class DownloadFilter
+end #End of module Scrubyt