RubyGems - andyverprauskus-scrubyt - Versions diffs - 0.5.1 - Mend

andyverprauskus-scrubyt 0.5.1

Files changed (45) hide show

data/CHANGELOG +355 -0
data/COPYING +340 -0
data/README.rdoc +121 -0
data/Rakefile +101 -0
data/lib/scrubyt.rb +53 -0
data/lib/scrubyt/core/navigation/agents/firewatir.rb +318 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +312 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +63 -0
data/lib/scrubyt/core/navigation/navigation_actions.rb +107 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/constraint.rb +169 -0
data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
data/lib/scrubyt/core/scraping/pattern.rb +359 -0
data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
data/lib/scrubyt/core/shared/extractor.rb +183 -0
data/lib/scrubyt/logging.rb +154 -0
data/lib/scrubyt/output/post_processor.rb +139 -0
data/lib/scrubyt/output/result.rb +44 -0
data/lib/scrubyt/output/result_dumper.rb +154 -0
data/lib/scrubyt/output/result_node.rb +145 -0
data/lib/scrubyt/output/scrubyt_result.rb +42 -0
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
data/lib/scrubyt/utils/shared_utils.rb +58 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
data/lib/scrubyt/utils/xpathutils.rb +202 -0
data/test/blackbox_test.rb +60 -0
data/test/blackbox_tests/basic/multi_root.rb +6 -0
data/test/blackbox_tests/basic/simple.rb +5 -0
data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
metadata +120 -0

data/lib/scrubyt/core/scraping/compound_example.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module Scrubyt
+  ##
+  #=<tt>Represents a compund example</tt>
+  #
+  #There are two types of string examples in scRUBYt! right now:
+  #the simple example and the compound example. The simple example
+  #is specified by a string, and a compound example is specified with
+  #:contains, :begins_with and :ends_with descriptors - which can be
+  #both regexps or strings
+  class CompoundExample
+    DESCRIPTORS = [:contains, :begins_with, :ends_with]
+    attr_accessor :descriptor_hash
+    def initialize(descriptor_hash)
+      @descriptor_hash = descriptor_hash
+    end
+    ##
+    #Is the hash passed to this function a compound example descriptor hash?
+    #Need to decide this when parsing pattern parameters
+    def self.compound_example?(hash)
+      hash.each do |k,v|
+        return false if !DESCRIPTORS.include? k
+      end
+      true
+    end# end of method
+  end# #end of class CompoundExample
+end# end of module Scrubyt

data/lib/scrubyt/core/scraping/constraint.rb ADDED Viewed

@@ -0,0 +1,169 @@
+module Scrubyt
+  ##
+  #=<tt>Rejecting result instances based on further rules</tt>
+  #
+  #The two  most trivial problems with a set of rules is that they match either less
+  #or more instances than we would like them to. Constraints are a way to remedy the second problem:
+  #they serve as a tool to filter out some result instances based on rules. A typical
+  #example:
+  #
+  #* *ensure_presence_of_ancestor_pattern* consider this model:
+  #    <book>
+  #      <author>...</author>
+  #      <title>...</title>
+  #    </book>
+  #
+  #If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
+  #'author' and 'title', only those books will be matched which have an author and a
+  #title (i.e.the child patterns author and title must extract something). This is a way
+  #to say 'a book MUST have an author and a title'.
+  class Constraint
+    #There are more possible ways of applying/checking constraints in the case of
+    #ones that can not be checked in the context node (e.g. ensure_presence_of -
+    #since it may require the evaluation of child patterns of the context pattern to
+    #arbitray level)
+    #
+    #In such cases, the possibilities are:
+    #
+    #1) make a depth-first evaluation from the context pattern until the needed ancestor
+    #   pattern is evaluated. This can mess things up, since if any ancestor node uses
+    #   the sinks of predecessor(s) other than the context node, those need to be evaluated
+    #   too, and we may run into a cyclyc dependency or at least a complicated recursion
+    #
+    #2) Post processing - evaluate normally and throw out results which do not pass the
+    #   constraint
+    #
+    #2b) Do it on the XML level - most probably this solution will be implemented
+    # Different constraint types
+    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
+    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
+    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
+    CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
+    CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4
+    attr_reader :type, :target
+    #Add 'ensure presence of ancestor pattern' constraint
+    #If this type of constraint is added to a pattern, it must have an ancestor pattern
+    #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
+    #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
+    #(just by looking at the wrapper model, the ancestor pattern is always present)
+    #Note that from this type of constraint there is no 'ensure_absence' version, since
+    #I could not think about an use case for that
+    def self.add_ensure_presence_of_pattern(ancestor)
+      Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
+    end
+    #Add 'ensure absence of attribute' constraint
+    #If this type of constraint is added to a pattern, the HTML node it targets
+    #must NOT have an attribute named "attribute_name" with the value "attribute_value"
+    def self.add_ensure_absence_of_attribute(attribute_hash)
+      Constraint.new(attribute_hash,
+                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
+    end
+    #Add 'ensure presence of attribute' constraint
+    #If this type of constraint is added to a pattern, the HTML node it targets
+    #must have an attribute named "attribute_name" with the value "attribute_value"
+    def self.add_ensure_presence_of_attribute(attribute_hash)
+      Constraint.new(attribute_hash,
+                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
+    end
+    #Add 'ensure absence of ancestor node' constraint
+    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
+    #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
+    #
+    #"attributes" is an array of hashes, for example
+    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
+    #in the case that more values have to be checked with the same key (e.g. 'class' => 'small' and '
+    #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
+    #
+    #"attributes" can be empty - in this case just the 'node_name' is checked
+    def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
+      Constraint.new([node_name, attributes],
+                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
+    end
+    #Add 'ensure presence of ancestor node' constraint
+    #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
+    #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
+    #
+    #"attributes" is an array of hashes, for example
+    #[{'font' => 'red'}, {'href' => 'http://www.google.com'}]
+    #in the case that more values have to be checked with the same key (e.g. 'class' => 'small' and '
+    #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
+    #
+    #"attributes" can be empty - in this case just the 'node_name' is checked
+    def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
+      Constraint.new([node_name, attributes],
+                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
+    end
+    #Evaluate the constraint; if this function returns true,
+    #it means that the constraint passed, i.e. its filter will be added to the exctracted
+    #content of the pattern
+    def check(result)
+      case @type
+        #checked after evaluation, so here always return true
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
+          return true
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE
+          attribute_present(result)
+        when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
+          !attribute_present(result)
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
+          ancestor_node_present(result)
+        when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
+          !ancestor_node_present(result)
+      end
+    end
+  private
+    #We would not like these to be called from outside
+    def initialize(target, type)
+      @target = target
+      @type = type
+    end
+    #Implementation of the ancestor node presence test
+    #Check the documentation of the add_ensure_presence_of_ancestor_node method
+    #for further information on the result parameter
+    def ancestor_node_present(result)
+      found = false
+      node_name = @target[0]
+      node_attributes = @target[1]
+      node_attributes.each do |pair|
+        return true if !result.search("//#{node_name}[@#{pair[0]}='#{pair[1]}']").empty?
+      end
+      if node_attributes.empty?
+        return true if !result.search("//#{node_name}").empty?
+      end
+      false
+    end
+    def attribute_present(result)
+      return unless result.is_a? Hpricot::Elem
+      match = true
+      #If v = nil, the value of the attribute can be arbitrary;
+      #Therefore, in this case we just have to make sure that the attribute is
+      #present (i.e. != nil), we don't care about the value
+      @target.each do |k,v|
+        if v == nil
+            match &&= (result.attributes[k.to_s] != nil)
+          else
+            match &&= (result.attributes[k.to_s] == v.to_s)
+        end
+      end
+      match
+    end
+  end #end of class
+end #end of module

data/lib/scrubyt/core/scraping/constraint_adder.rb ADDED Viewed

@@ -0,0 +1,49 @@
+module Scrubyt
+  ##
+  #=<tt>Utility class for adding constraints</tt>
+  #
+  #Originally methods of Pattern - but since Pattern was already too heavy (and after
+  #all, adding a constraint (logically) does not belong to Pattern anyway) it was moved
+  #to this utility class. In pattern everything that begins with ensure_
+  #is automatically dispatched here.
+  #
+  #I will not document the functions since these are just forwarders; See the 'real'
+  #functions with their documentation in Scrubyt::Constraint.rb
+  class ConstraintAdder
+    def self.ensure_presence_of_pattern(ancestor_node_name)
+      Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
+    end
+    def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
+      Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
+                                                          prepare_attributes(attributes))
+    end
+    def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
+      Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
+                                                         prepare_attributes(attributes))
+    end
+    def self.ensure_presence_of_attribute(attribute_hash)
+      Constraint.add_ensure_presence_of_attribute(attribute_hash)
+    end
+    def self.ensure_absence_of_attribute(attribute_hash)
+      Constraint.add_ensure_absence_of_attribute(attribute_hash)
+    end
+    private
+    def self.prepare_attributes(attributes)
+      attribute_pairs = []
+      attributes.each do |key, value|
+        if (value.instance_of? Array)
+          value.each {|val| attribute_pairs << [key,val]}
+        else
+          attribute_pairs << [key, value]
+        end
+      end
+      return attribute_pairs
+    end #end of method prepare_attributes
+  end #end of class ConstraintAddere
+end #end of module Scrubyt

data/lib/scrubyt/core/scraping/filters/attribute_filter.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module Scrubyt
+  class AttributeFilter < BaseFilter
+    def evaluate(source)
+      elem = XPathUtils.find_nearest_node_with_attribute(source, @example)
+      if elem.is_a? Hpricot::Elem
+        return [elem.attributes[@example]]
+      else
+        return nil
+      end
+    end
+  end #End of class AttributeFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/base_filter.rb ADDED Viewed

@@ -0,0 +1,112 @@
+module Scrubyt
+  ##
+  #=<tt>Filter out relevant pieces from the parent pattern</tt>
+  #
+  #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
+  #it reaches the bottom. The biggest difference is that instead of water, a HTML
+  #document travels through the space.
+  #
+  #Of course Scrubyt would not make much sense if the same document would arrive at
+  #the bottom that was poured in at the top - since in this case we might use an
+  #indentity transformation (i.e. do nothing with the input) as well.
+  #
+  #This is where filters came in: as they name says, they filter the stuff that is
+  #pouring from above, to leave the interesting parts and discard the rest.
+  #The working of a filter will be explained most easily by the help of an example.
+  #Let's consider that we would like to extract information from a webshop; Concretely
+  #we are interested in the name of the items and the URL pointing to the image of the
+  #item.
+  #
+  #To accomplish this, first we select the items with the pattern item (a pattern is
+  #a logical grouping of fillters; see Pattern documentation) Then our new
+  #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
+  #extract the name and the image of the item; and finally, extract the href attribute
+  #of the image. Let's see an illustration:
+  #
+  #   root             --> This pattern is called a 'root pattern', It is invisible to you
+  #   |                    and basically it represents the document; it has no filters
+  #   +-- item         --> Filter what's coming from above (the whole document) to get
+  #       |                relevant pieces of data (in this case webshop items)
+  #       +-- name     --> Again, filter what's coming from above (a webshop item) and
+  #       |                leave only item names after this operation
+  #       +-- image    --> This time filter the image of the item
+  #           |
+  #           +-- href --> And finally, from the image elements, get the attribute 'href'
+  class BaseFilter
+    #Type of the example this filter is extracted with
+    #XPath example, like html/body/tr/td[1] etc.
+    EXAMPLE_TYPE_XPATH = 0
+    #String from the document, for example 'Canon EOS 300 D'.
+    EXAMPLE_TYPE_STRING = 1
+    #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
+    EXAMPLE_TYPE_IMAGE = 2
+    #No example - the actual XPath is determined from the children XPaths (their LCA)
+    EXAMPLE_TYPE_CHILDREN = 3
+    #Regexp example, like /\d+@*\d+[a-z]/
+    EXAMPLE_TYPE_REGEXP = 4
+    #Compound example, like :contains => 'goodies'
+    EXAMPLE_TYPE_COMPOUND = 5
+    attr_accessor(:example_type, :parent_pattern, :temp_sink,
+                  :constraints, :xpath, :regexp, :example, :final_result)
+    def self.create(parent_pattern, example=nil)
+      filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
+      if filter_name == 'RootFilter'
+        BaseFilter.new(parent_pattern, example)
+      else
+        instance_eval("#{filter_name}.new(parent_pattern, example)")
+      end
+    end
+    #Dispatcher method to add constraints; of course, as with any method_missing, this method
+    #should not be called directly
+    #TODO still used?
+    alias_method :throw_method_missing, :method_missing
+    def method_missing(method_name, *args, &block)
+      case method_name.to_s
+      when /^ensure.+/
+        constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
+      else
+        throw_method_missing(method_name, *args, &block)
+      end
+    end
+    private
+    #We don't want this to be accessible from outside
+    def initialize(parent_pattern, example)
+      case parent_pattern.example_type
+      when :xpath
+        @example_type = EXAMPLE_TYPE_XPATH
+      else
+        @example_type = BaseFilter.determine_example_type(example)
+      end
+      @parent_pattern = parent_pattern
+      @example = example
+      @xpath = nil                #The xpath to evaluate this filter
+      @constraints = [] #list of constraints
+    end
+    def self.determine_example_type(example)
+      if example.instance_of? Regexp
+        EXAMPLE_TYPE_REGEXP
+      elsif example.instance_of? Hash
+        EXAMPLE_TYPE_COMPOUND
+      else
+        case example
+        when nil
+          EXAMPLE_TYPE_CHILDREN
+        when /\.(jpg|png|gif|jpeg)(\[\d+\])?$/
+          EXAMPLE_TYPE_IMAGE
+        when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*(\[@.+=.+\])?(\/@.+)?$/
+         (example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
+        else
+          EXAMPLE_TYPE_STRING
+        end
+      end
+    end #end of method
+  end #End of class
+end #End of module

data/lib/scrubyt/core/scraping/filters/constant_filter.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module Scrubyt
+  class ConstantFilter < BaseFilter
+    def evaluate(source)
+      return @example
+    end
+  end #End of class ConstantFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb ADDED Viewed

@@ -0,0 +1,37 @@
+module Scrubyt
+  ##
+  #Filter which fetches the page a result links to and runs the extractor
+  #referenced by the parent pattern on it
+  class DetailPageFilter < BaseFilter
+    #source is either the URL of the detail page (a String), or something
+    #XPathUtils can find a nearest node with a href attribute for.
+    #
+    #The page and the host name of the parent extractor are stored before the
+    #detail page is fetched and put back afterwards. A failing fetch is only
+    #logged; the detail extractor is evaluated regardless.
+    #
+    #Returns the results of the detail extractor
+    def evaluate(source)
+      url = if source.is_a?(String)
+              source
+            else
+              XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
+            end
+      @parent_pattern.extractor.store_page
+      saved_host_name = @parent_pattern.extractor.get_host_name
+      @parent_pattern.extractor.restore_host_name
+      begin
+        FetchAction.fetch url, :resolve => @parent_pattern.resolve
+      rescue
+        Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
+      end
+      #The detail extractor is created on the first call and reused afterwards
+      detail_results = if @detail_extractor.nil?
+                         @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
+                         @detail_extractor.result
+                       else
+                         @detail_extractor.evaluate_extractor
+                       end
+      @parent_pattern.extractor.restore_page
+      @parent_pattern.extractor.store_host_name saved_host_name
+      detail_results
+    end
+  end
+end

data/lib/scrubyt/core/scraping/filters/download_filter.rb ADDED Viewed

@@ -0,0 +1,64 @@
+require 'net/http'
+require 'fileutils'
+module Scrubyt
+  ##
+  #Filter which downloads the file a result points to into the directory
+  #named by the example
+  class DownloadFilter < BaseFilter
+    #Download the file source points to; see download_file for the return value
+    def evaluate(source)
+      download_file(source)
+    end #end of method
+private
+    #Fetch source over HTTP and write the response body into the directory @example,
+    #under a name which is not taken yet (see find_nonexisting_file_name).
+    #
+    #Returns the bare name of the written file, "[FAILED]<file name>" if the
+    #request timed out, '' if source is shorter than 4 characters, and nil if the
+    #file name is listed in the 'except' of the parent pattern
+    def download_file(source)
+      return '' if source.size < 4
+      host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
+      outfile = nil
+      host_name += "/" if host_name[-1..-1] != "/"
+      base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
+      file_name = source.scan(/.+\/(.*)/)[0][0]
+      return nil if @parent_pattern.except.include? file_name
+      Net::HTTP.start(base_url) { |http|
+        Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
+        begin
+          ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
+          #scan returns the captures as nested arrays, while http.get needs a String,
+          #hence the join; the host is escaped since it is matched literally
+          path = host_name.scan(/http:\/\/#{Regexp.escape(base_url)}(.+)\//).join
+          resp = http.get(path, {'User-Agent'=> ua})
+          outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
+          FileUtils.mkdir_p @example
+          File.open(outfile, 'wb') {|f| f.write(resp.body) }
+        rescue Timeout::Error
+          outfile = "[FAILED]#{file_name}"
+        end
+       }
+       #File.basename copes with the "[FAILED]..." name as well, which has no
+       #directory part to strip
+       File.basename(outfile)
+    end
+   #Find a file name which does not exist yet.
+   #
+   #Returns file_name itself if there is no such file. Otherwise _1, _2, ... is
+   #inserted before the first dot of the base name (or appended, if the base name
+   #has no dot) until a name is found which is not taken, e.g. dir/a.jpg gives
+   #dir/a_1.jpg, then dir/a_2.jpg. The directory part is never changed and the
+   #string passed in is not modified.
+   #
+   #NOTE(review): a bare 'private' does not apply to methods defined with 'def self.';
+   #this method has to stay callable as DownloadFilter.find_nonexisting_file_name
+   def self.find_nonexisting_file_name(file_name)
+      return file_name unless File.exist? file_name
+      directory, slash, base_name = file_name.rpartition('/')
+      stem, dot, rest = base_name.partition('.')
+      counter = 1
+      loop do
+        candidate = "#{directory}#{slash}#{stem}_#{counter}#{dot}#{rest}"
+        return candidate unless File.exist? candidate
+        counter += 1
+      end
+   end #end of method
+  end #End of class DownloadFilter
+end #End of module Scrubyt