RubyGems - scrubyt - Versions diffs - 0.2.6 → 0.2.8 - Mend

scrubyt 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

data/CHANGELOG +59 -12
data/Rakefile +2 -2
data/lib/scrubyt.rb +24 -6
data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
data/lib/scrubyt/core/scraping/constraint.rb +53 -57
data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
data/lib/scrubyt/core/scraping/pattern.rb +292 -157
data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
data/lib/scrubyt/core/shared/extractor.rb +122 -163
data/lib/scrubyt/output/export.rb +59 -174
data/lib/scrubyt/output/post_processor.rb +4 -3
data/lib/scrubyt/output/result.rb +8 -9
data/lib/scrubyt/output/result_dumper.rb +81 -42
data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
data/lib/scrubyt/utils/shared_utils.rb +39 -26
data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
data/lib/scrubyt/utils/xpathutils.rb +31 -30
data/test/unittests/constraint_test.rb +11 -7
data/test/unittests/extractor_test.rb +6 -6
data/test/unittests/filter_test.rb +66 -66
metadata +22 -15
data/lib/scrubyt/core/scraping/filter.rb +0 -201

data/lib/scrubyt/core/scraping/constraint.rb CHANGED

@@ -1,27 +1,27 @@
 module Scrubyt
   ##
   #=<tt>Rejecting result instances based on further rules</tt>
-  #
+  #
   #The two  most trivial problems with a set of rules is that they match either less
   #or more instances than we would like them to. Constraints are a way to remedy the second problem:
   #they serve as a tool to filter out some result instances based on rules. A typical
   #example:
-  #
+  #
   #* *ensure_presence_of_ancestor_pattern* consider this model:
   #    <book>
   #      <author>...</author>
   #      <title>...</title>
   #    </book>
-  #
+  #
   #If I attach the *ensure_presence_of_ancestor_pattern* to the pattern 'book' with values
-  #'author' and 'title', only those books will be matched which have an author and a
+  #'author' and 'title', only those books will be matched which have an author and a
   #title (i.e.the child patterns author and title must extract something). This is a way
-  #to say 'a book MUST have an author and a title'.
+  #to say 'a book MUST have an author and a title'.
   class Constraint
     #There are more possible ways of applying/checking constraints in the case of
-    #ones that can not be checked in the context node (e.g. ensure_presence_of -
+    #ones that can not be checked in the context node (e.g. ensure_presence_of -
     #since it may require the evaluation of child patterns of the context pattern to
-    #arbitray level)
+    #arbitray level)
     #
     #In such cases, the possibilities are:
     #
@@ -29,56 +29,54 @@ module Scrubyt
     #   pattern is evaluated. This can mess things up, since if any ancestor node uses
     #   the sinks of predecessor(s) other than the context node, those need to be evaluated
     #   too, and we may run into a cyclyc dependency or at least a complicated recursion
-    #
-    #2) Post processing - evaluate normally and throw out results which do not pass the
+    #
+    #2) Post processing - evaluate normally and throw out results which do not pass the
     #   constraint
     #
     #2b) Do it on the XML level - most probably this solution will be implemented
     # Different constraint types
     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4
-    attr_reader :type, :target, :parent_filter
+    attr_reader :type, :target
     #Add 'ensure presence of ancestor pattern' constraint
     #If this type of constraint is added to a pattern, it must have an ancestor pattern
     #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
     #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
     #(just by looking at the wrapper model, the ancestor pattern is always present)
     #Note that from this type of constraint there is no 'ensure_absence' version, since
-    #I could not think about an use case for that
-    def self.add_ensure_presence_of_pattern(parent_filter, ancestor)
-      Constraint.new(parent_filter, ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
+    #I could not think about an use case for that
+    def self.add_ensure_presence_of_pattern(ancestor)
+      Constraint.new(ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
     end
     #Add 'ensure absence of attribute' constraint
     #If this type of constraint is added to a pattern, the HTML node it targets
     #must NOT have an attribute named "attribute_name" with the value "attribute_value"
-    def self.add_ensure_absence_of_attribute(parent_filter, attribute_hash)
-      Constraint.new(parent_filter,
-                     attribute_hash,
+    def self.add_ensure_absence_of_attribute(attribute_hash)
+      Constraint.new(attribute_hash,
                      CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE)
     end
     #Add 'ensure presence of attribute' constraint
     #If this type of constraint is added to a pattern, the HTML node it targets
     #must have an attribute named "attribute_name" with the value "attribute_value"
-    def self.add_ensure_presence_of_attribute(parent_filter, attribute_hash)
-      Constraint.new(parent_filter,
-                     attribute_hash,
+    def self.add_ensure_presence_of_attribute(attribute_hash)
+      Constraint.new(attribute_hash,
                      CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE)
     end
-    #Add 'ensure absence of ancestor node' constraint
+    #Add 'ensure absence of ancestor node' constraint
     #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
     #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
     #
@@ -88,14 +86,13 @@ module Scrubyt
     #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
     #
     #"attributes" can be empty - in this case just the 'node_name' is checked
-    def self.add_ensure_absence_of_ancestor_node(parent_filter, node_name, attributes)
-      Constraint.new(parent_filter,
-                     [node_name, attributes],
-                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
+    def self.add_ensure_absence_of_ancestor_node(node_name, attributes)
+      Constraint.new([node_name, attributes],
+                     CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE)
     end
-    #Add 'ensure presence of ancestor node' constraint
+    #Add 'ensure presence of ancestor node' constraint
     #If this type of constraint is added to a pattern, the HTML node extracted by the pattern
     #must NOT contain a HTML ancestor node called 'node_name' with the attribute set 'attributes'.
     #
@@ -105,12 +102,11 @@ module Scrubyt
     #class' => 'wide' it has to be written as [{'class' => ['small','wide']}]
     #
     #"attributes" can be empty - in this case just the 'node_name' is checked
-    def self.add_ensure_presence_of_ancestor_node(parent_filter, node_name, attributes)
-      Constraint.new(parent_filter,
-                     [node_name, attributes],
-                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
+    def self.add_ensure_presence_of_ancestor_node(node_name, attributes)
+      Constraint.new([node_name, attributes],
+                     CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE)
     end
     #Evaluate the constraint; if this function returns true,
     #it means that the constraint passed, i.e. its filter will be added to the exctracted
     #content of the pattern
@@ -123,21 +119,20 @@ module Scrubyt
           attribute_present(result)
         when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
           !attribute_present(result)
-        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
-          ancestor_node_present(result)
-        when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
+        when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE
+          ancestor_node_present(result)
+        when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE
           !ancestor_node_present(result)
       end
     end
   private
     #We would not like these to be called from outside
-    def initialize(parent_filter, target, type)
-      @type = type
-      @parent_filter = parent_filter
+    def initialize(target, type)
       @target = target
-    end
+      @type = type
+    end
     #Implementation of the ancestor node presence test
     #Check the documentation of the add_ensure_presence_of_ancestor_node method
     #for further information on the result parameter
@@ -153,21 +148,22 @@ module Scrubyt
       end
       false
     end
     def attribute_present(result)
+      return unless result.is_a? Hpricot::Elem
       match = true
       #If v = nil, the value of the attribute can be arbitrary;
       #Therefore, in this case we just have to make sure that the attribute is
       #present (i.e. != nil), we don't care about the value
       @target.each do |k,v|
         if v == nil
-            match &&= (result.attributes[k.to_s] != nil)
+            match &&= (result.attributes[k.to_s] != nil)
           else
-            match &&= (result.attributes[k.to_s] == v.to_s)
-        end
+            match &&= (result.attributes[k.to_s] == v.to_s)
+        end
       end
       match
     end
   end #end of class
 end #end of module

data/lib/scrubyt/core/scraping/constraint_adder.rb CHANGED

@@ -10,58 +10,35 @@ module Scrubyt
   #I will not document the functions since these are just forwarders; See the 'real'
   #functions with their documentation in Scrubyt::Constraint.rb
   class ConstraintAdder
-    def self.ensure_presence_of_pattern(pattern, ancestor_node_name)
-      pattern.filters[0].ensure_presence_of_pattern(ancestor_node_name)
-      pattern #To make chaining possible
+    def self.ensure_presence_of_pattern(ancestor_node_name)
+      Constraint.add_ensure_presence_of_pattern(ancestor_node_name)
     end
-    def self.ensure_presence_of_ancestor_node(pattern, ancestor_node_name, attributes=[])
-       pattern.filters[0].ensure_presence_of_ancestor_node(ancestor_node_name,
-                                                           prepare_attributes(attributes))
-       pattern #To make chaining possible
+    def self.ensure_presence_of_ancestor_node(ancestor_node_name, attributes=[])
+      Constraint.add_ensure_presence_of_ancestor_node(ancestor_node_name,
+                                                          prepare_attributes(attributes))
     end
-    def self.ensure_absence_of_ancestor_node(pattern, ancestor_node_name, attributes=[])
-      pattern.filters[0].ensure_absence_of_ancestor_node(ancestor_node_name,
+    def self.ensure_absence_of_ancestor_node(ancestor_node_name, attributes=[])
+      Constraint.add_ensure_absence_of_ancestor_node(ancestor_node_name,
                                                          prepare_attributes(attributes))
-      pattern #To make chaining possible
     end
-    def self.ensure_presence_of_attribute(pattern, attribute_hash)
-       pattern.filters[0].ensure_presence_of_attribute(attribute_hash)
-       pattern #To make chaining possible
+    def self.ensure_presence_of_attribute(attribute_hash)
+      Constraint.add_ensure_presence_of_attribute(attribute_hash)
     end
-    def self.ensure_absence_of_attribute(pattern, attribute_hash)
-       pattern.filters[0].ensure_absence_of_attribute(attribute_hash)
-       pattern #To make chaining possible
+    def self.ensure_absence_of_attribute(attribute_hash)
+      Constraint.add_ensure_absence_of_attribute(attribute_hash)
     end
-private
-    def self.find_by_name(root_pattern, name)
-      @found_pattern = nil
-      find_by_name_recursive(root_pattern, name)
-      if (@found_pattern == nil)
-        #$Logger.error("Fatal: No pattern named #{name} exists!")
-	puts "Fatal: No pattern named #{name} exists!"
-      end
-      @found_pattern
-    end
-    def self.find_by_name_recursive(pattern, name)
-      if pattern.name == name
-        @found_pattern = pattern
-      else
-        pattern.children.each {|child| find_by_name_recursive(child, name)}
-      end
-    end
+    private
     def self.prepare_attributes(attributes)
       attribute_pairs = []
       attributes.each do |key, value|
         if (value.instance_of? Array)
-          value.each {|val| attribute_pairs << [key,val]}
+          value.each {|val| attribute_pairs << [key,val]}
         else
           attribute_pairs << [key, value]
         end

data/lib/scrubyt/core/scraping/filters/attribute_filter.rb ADDED

@@ -0,0 +1,17 @@
+module Scrubyt
+  class AttributeFilter < BaseFilter
+    def evaluate(source)
+      elem = XPathUtils.find_nearest_node_with_attribute(source, @example)
+      if elem.is_a? Hpricot::Elem
+        return [elem.attributes[@example]]
+      else
+        return nil
+      end
+    end
+    def to_sexp
+      [:str, @example]
+    end #end of method to_sexp
+  end #End of class AttributeFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/base_filter.rb ADDED

@@ -0,0 +1,111 @@
+module Scrubyt
+  ##
+  #=<tt>Filter out relevant pieces from the parent pattern</tt>
+  #
+  #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
+  #it reaches the bottom. The biggest difference is that instead of water, a HTML
+  #document travels through the space.
+  #
+  #Of course Scrubyt would not make much sense if the same document would arrive at
+  #the bottom that was poured in at the top - since in this case we might use an
+  #indentity transformation (i.e. do nothing with the input) as well.
+  #
+  #This is where filters came in: as they name says, they filter the stuff that is
+  #pouring from above, to leave the interesting parts and discard the rest.
+  #The working of a filter will be explained most easily by the help of an example.
+  #Let's consider that we would like to extract information from a webshop; Concretely
+  #we are interested in the name of the items and the URL pointing to the image of the
+  #item.
+  #
+  #To accomplish this, first we select the items with the pattern item (a pattern is
+  #a logical grouping of fillters; see Pattern documentation) Then our new
+  #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
+  #extract the name and the image of the item; and finally, extract the href attribute
+  #of the image. Let's see an illustration:
+  #
+  #   root             --> This pattern is called a 'root pattern', It is invisible to you
+  #   |                    and basically it represents the document; it has no filters
+  #   +-- item         --> Filter what's coming from above (the whole document) to get
+  #       |                relevant pieces of data (in this case webshop items)
+  #       +-- name     --> Again, filter what's coming from above (a webshop item) and
+  #       |                leave only item names after this operation
+  #       +-- image    --> This time filter the image of the item
+  #           |
+  #           +-- href --> And finally, from the image elements, get the attribute 'href'
+  class BaseFilter
+    #Type of the example this filter is extracted with
+    #XPath example, like html/body/tr/td[1] etc.
+    EXAMPLE_TYPE_XPATH = 0
+    #String from the document, for example 'Canon EOS 300 D'.
+    EXAMPLE_TYPE_STRING = 1
+    #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
+    EXAMPLE_TYPE_IMAGE = 2
+    #No example - the actual XPath is determined from the children XPaths (their LCA)
+    EXAMPLE_TYPE_CHILDREN = 3
+    #Regexp example, like /\d+@*\d+[a-z]/
+    EXAMPLE_TYPE_REGEXP = 4
+    #Compound example, like :contains => 'goodies'
+    EXAMPLE_TYPE_COMPOUND = 5
+    attr_accessor(:example_type, :parent_pattern, :temp_sink,
+                  :constraints, :xpath, :regexp, :example, :source, :sink)
+    def self.create(parent_pattern, example=nil)
+      filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
+      if filter_name == 'RootFilter'
+        BaseFilter.new(parent_pattern, example)
+      else
+        instance_eval("#{filter_name}.new(parent_pattern, example)")
+      end
+    end
+    #Dispatcher method to add constraints; of course, as with any method_missing, this method
+    #should not be called directly
+    #TODO still used?
+    def method_missing(method_name, *args, &block)
+      case method_name.to_s
+      when /^ensure.+/
+        constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
+      else
+        raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
+      end
+    end
+    def to_sexp
+      nil
+    end
+    private
+    #We don't want this to be accessible from outside
+    def initialize(parent_pattern, example)
+      @example_type = BaseFilter.determine_example_type(example)
+      @parent_pattern = parent_pattern
+      @sink = []                  #output of a filter
+      @source = []                #input of a filter
+      @example = example
+      @xpath = nil                #The xpath to evaluate this filter
+      @constraints = [] #list of constraints
+    end
+    def self.determine_example_type(example)
+      if example.instance_of? Regexp
+        EXAMPLE_TYPE_REGEXP
+      elsif example.instance_of? Hash
+        EXAMPLE_TYPE_COMPOUND
+      else
+        case example
+        when nil
+          EXAMPLE_TYPE_CHILDREN
+        when /\.(jpg|png|gif|jpeg)$/
+          EXAMPLE_TYPE_IMAGE
+        when /^\/{1,2}[a-z]+[0-9]?(\[[0-9]+\])?(\/{1,2}[a-z()]+[0-9]?(\[[0-9]+\])?)*$/
+         (example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
+        else
+          EXAMPLE_TYPE_STRING
+        end
+      end
+    end #end of method
+  end #End of class
+end #End of module

data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb ADDED

@@ -0,0 +1,14 @@
+module Scrubyt
+  class DetailPageFilter < BaseFilter
+    def evaluate(source)
+      if source.is_a? String
+        result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(source, @parent_pattern, @parent_pattern.resolve)
+      else
+        result = @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
+          XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
+          @parent_pattern, @parent_pattern.resolve)
+      end
+    end #end of method
+  end #End of class DetailPageFilter
+end #End of module Scrubyt

data/lib/scrubyt/core/scraping/filters/download_filter.rb ADDED

@@ -0,0 +1,49 @@
+require 'net/http'
+require 'fileutils'
+module Scrubyt
+  class DownloadFilter < BaseFilter
+    def evaluate(source)
+      download_file(source)
+    end #end of method
+    def to_sexp
+      [:str, @example]
+    end #end of method to_sexp
+private
+    def download_file(source)
+      host_name = @parent_pattern.evaluation_context.extractor.get_host_name
+      outfile = nil
+      base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
+      return '' if source.size < 4
+      file_name = source.scan(/.+\/(.*)/)[0][0]
+      Net::HTTP.start(base_url) { |http|
+        resp = http.get(source)
+        outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
+        FileUtils.mkdir_p @example
+        open(outfile, 'wb') {|f| f.write(resp.body) }
+       }
+       outfile.scan(/.+\/(.*)/)[0][0]
+    end
+   def self.find_nonexisting_file_name(file_name)
+      already_found = false
+      loop do
+        if File.exists? file_name
+          if already_found
+            last_no = file_name.scan(/_(\d+)\./)[0][0]
+            file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
+          else
+            file_name.sub!(/\./) {"_1\."}
+            already_found = true
+          end
+        else
+          break
+        end
+      end
+      file_name
+   end #end of method
+  end #End of class DownloadFilter
+end #End of module Scrubyt