RubyGems - scrubyt - Versions diffs - 0.1.0 - Mend

scrubyt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

data/README +41 -0
data/Rakefile +55 -0
data/lib/scrubyt.rb +9 -0
data/lib/scrubyt/constraint.rb +185 -0
data/lib/scrubyt/constraint_adder.rb +86 -0
data/lib/scrubyt/export.rb +187 -0
data/lib/scrubyt/extractor.rb +187 -0
data/lib/scrubyt/filter.rb +144 -0
data/lib/scrubyt/pattern.rb +263 -0
data/lib/scrubyt/result.rb +43 -0
data/lib/scrubyt/result_dumper.rb +84 -0
data/lib/scrubyt/xpathutils.rb +196 -0
data/test/unittests/constraint_test.rb +106 -0
data/test/unittests/extractor_test.rb +93 -0
data/test/unittests/filter_test.rb +71 -0
data/test/unittests/input/constraint_test.html +55 -0
data/test/unittests/input/test.html +39 -0
data/test/unittests/xpathutils_test.rb +165 -0
metadata +63 -0

data/lib/scrubyt/extractor.rb ADDED

@@ -0,0 +1,187 @@
+require 'logger'
+require 'open-uri'
+require 'rubygems'
+require 'mechanize'
+require 'hpricot'
+require 'pp'
+module Scrubyt
+##
+#=<tt>Driving the whole extraction process</tt>
+#Extractor is a performer class - it gets an extractor definition and carries
+#out the actions and evaluates the wrappers sequentially.
+#
+#It also defines the actions as class methods - check out the section
+#commented with ############# Actions.
+  class Extractor
+    #The definition of the extractor is passed through this method
+    def self.define(&extractor_definition)
+      @@current_doc_url = nil
+      @@current_form = nil
+      @@current_doc_protocol = nil
+      @@base_dir = nil
+      @@host_name = nil
+      @@agent = WWW::Mechanize.new
+      #Hack up an artificial root pattern (i.e. do not return the pattern which
+      #is the root one in the user's definition, but rather the real (invisible)
+      #root pattern
+      root_pattern = (class_eval(&extractor_definition)).parent
+      #A little hack here: upon wrapper construction we are counting the number
+      #of blocks, so we know the count of the 'end's/'}'s which end the extractor
+      #definition
+      #Recursively match data based on examples
+      root_pattern.setup_examples
+      #Once all is set up, evaluate the wrapper from the root pattern!
+      if root_pattern.next_page
+        current_page_count = 1
+        loop do
+          evaluate_wrapper(root_pattern)
+          break if (root_pattern.limit == current_page_count || root_pattern.crawl_to_new_page == nil)
+          current_page_count += 1 if root_pattern.limit != nil
+        end
+      else
+        evaluate_wrapper(root_pattern)
+      end
+      #Return the root pattern
+      root_pattern
+    end
+  #build the current wrapper
+  def self.method_missing(method_name, *args, &block)
+    pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
+    if @parent == nil
+      if method_name.to_s == 'next_page'
+        @@root_pattern.next_page = args[0]
+        @@root_pattern.limit = args[1][:limit] if args.size > 1
+        return @@last_pattern
+      else
+        #Create a root pattern
+        root_pattern = Scrubyt::Pattern.new('root', :type => :root)
+        @@root_pattern = root_pattern
+        @@root_pattern.root_pattern = root_pattern
+        @@root_pattern.root_pattern.extractor = self
+        #add the currently active document to the root pattern
+        @@root_pattern.attach_current_document
+        @@root_pattern.add_child_pattern(pattern)
+        @@root_pattern.block_count = 0
+        @@root_pattern.extractor = self
+      end
+    else
+      @parent.add_child_pattern(pattern) if @parent != nil
+    end
+    if block_given?
+      @@root_pattern.block_count = @@root_pattern.block_count + 1
+      @stack ||=[]
+      @parent = pattern
+      @stack.push @parent
+      class_eval(&block)
+      @stack.pop
+      @parent = @stack.last
+    end
+    @@last_pattern = pattern
+  end
+  #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
+  #(You should not be :)
+  #
+  #Returns the block counter of the root pattern, i.e. the number that method_missing
+  #increments for every pattern that was defined with a block
+  def self.get_block_count
+    @@root_pattern.block_count
+  end
+############# Actions
+#
+  ##
+  # At any given point, the current document can be queried with this method; Typically used
+  # when the navigation is over and the result document is passed to the wrapper
+  #
+  # Returns the URL stored by the most recent fetch / submit / click_link action
+  # (nil right after define has reset the navigation state)
+  def self.get_current_doc_url
+    @@current_doc_url
+  end
+  ##
+  # Returns the Hpricot document parsed by the most recent fetch. The variable is assigned
+  # only in fetch, so fetch (or submit / click_link, which call it) has to run first
+  def self.get_hpricot_doc
+    @@hpricot_doc
+  end
+  ##
+  #Action to fetch a document (either a file or a http address)
+  #
+  #*parameters*
+  #
+  #_doc_url_ - the url or file name to fetch
+  def self.fetch(doc_url, mechanize_doc=nil)
+    puts "fetching: #{doc_url}"
+    if (mechanize_doc == nil)
+      @@current_doc_url = doc_url
+      @@current_doc_protocol = ((doc_url =~ /^http/ || doc_url =~ /^www/) ? :http : :file)
+      if @@base_dir == nil
+        @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == :file
+      else
+        @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
+      end
+      if @@host_name == nil
+        if @@current_doc_protocol == :http
+          @@host_name = doc_url.scan(/http:\/\/.+?\//)[0]
+          @@host_name = doc_url if @@host_name == nil
+        end
+      else
+        @@current_doc_url = (@@host_name + doc_url) if doc_url !~ /#{@@host_name}/
+      end
+      @@mechanize_doc = @@agent.get(@@current_doc_url) if @@current_doc_protocol == :http
+    else
+      @@current_doc_url = doc_url
+      @@mechanize_doc = mechanize_doc
+    end
+    @@hpricot_doc = mechanize_doc != nil ? Hpricot(@@mechanize_doc.body) : Hpricot(open(@@current_doc_url))
+    out = open('kamaty.html', 'w')
+    out.write @@hpricot_doc.to_s
+    out.close
+  end
+  ##
+  #Action to fill a textfield with a query string
+  #
+  ##*parameters*
+  #
+  #_textfield_name_ - the name of the textfield (e.g. the name of the google search
+  #textfield is 'q'
+  #
+  #_query_string_ - the string that should be entered into the textfield
+  def self.fill_textfield(textfield_name, query_string)
+    puts 'fill textfield'
+    textfield = (@@hpricot_doc/"input[@name=#{textfield_name}]").map()[0]
+    formname = Scrubyt::XPathUtils.traverse_up_until_name(textfield, 'form').attributes['name']
+    @@current_form = @@mechanize_doc.forms.with.name(formname).first
+    eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
+  end
+  #Submit the last form;
+  def self.submit
+    puts 'submit'
+    result_page = @@agent.submit(@@current_form)#, @@current_form.buttons.first)
+    @@current_doc_url = result_page.uri.to_s
+    fetch(@@current_doc_url, result_page)
+  end
+  def self.click_link(link_text)
+    puts 'click link'
+    puts /^#{Regexp.escape(link_text)}$/
+    p /^#{Regexp.escape(link_text)}$/
+    link = @@mechanize_doc.links.text(/^#{Regexp.escape(link_text)}$/)
+    result_page = @@agent.click(link)
+    @@current_doc_url = result_page.uri.to_s
+    fetch(@@current_doc_url, result_page)
+  end
+#
+#############
+private
+    def self.evaluate_wrapper(pattern)
+      pattern.evaluate
+      pattern.children.each { |child| evaluate_wrapper child }
+    end
+  end #end of class Extractor
+end #end of module Scrubyt

data/lib/scrubyt/filter.rb ADDED

@@ -0,0 +1,144 @@
+module Scrubyt
+  ##
+  #=<tt>Filter out relevant pieces from the parent pattern</tt>
+  #
+  #A Scrubyt wrapper is almost like a waterfall: water is pouring from the top until
+  #it reaches the bottom. The biggest difference is that instead of water, a HTML
+  #document travels through the space.
+  #
+  #Of course Scrubyt would not make much sense if the same document arrived at
+  #the bottom that was poured in at the top - since in this case we might as well use an
+  #identity transformation (i.e. do nothing with the input).
+  #
+  #This is where filters come in: as their name says, they filter the stuff that is
+  #pouring from above, to leave the interesting parts and discard the rest.
+  #The working of a filter will be explained most easily by the help of an example.
+  #Let's consider that we would like to extract information from a webshop; Concretely
+  #we are interested in the name of the items and the URL pointing to the image of the
+  #item
+  #
+  #To accomplish this, first we select the items with the pattern item (a pattern is
+  #a logical grouping of filters; see Pattern documentation). Then our new
+  #context is the result extracted by the item pattern; for every item, we further
+  #extract the name and the image of the item; and finally, extract the href attribute
+  #of the image. Let's see an illustration:
+  #
+  #   root             --> This pattern is called a 'root pattern', It is invisible to you
+  #   |                    and basically it represents the document; it has no filters
+  #   +-- item         --> Filter what's coming from above (the whole document) to get
+  #       |                relevant pieces of data (in this case webshop items)
+  #       +-- name     --> Again, filter what's coming from above (a webshop item) and
+  #       |                leave only item names after this operation
+  #       +-- image    --> This time filter the image of the item
+  #           |
+  #           +-- href --> And finally, from the image elements, get the attribute 'href'
+  class Filter
+    #Type of the example this filter is extracted with
+    #XPath example, like html/body/tr/td[1] etc.
+    EXAMPLE_TYPE_XPATH = 0
+    #String from the document, for example 'Canon EOS 300 D'.
+    EXAMPLE_TYPE_STRING = 1
+    #Image example, like 'http://www.rubyrailways.com/scrubyt.jpg'
+    EXAMPLE_TYPE_IMAGE = 2
+    #No example - the actual XPath is determined from the children XPaths (their LCA)
+    EXAMPLE_TYPE_CHILDREN = 3
+    #Regexp example, like /\d+@*\d+[a-z]/
+    EXAMPLE_TYPE_REGEXP = 4
+    attr_accessor :example_type, :parent_pattern, :temp_sink, :constraints, :xpath, :regexp
+    def initialize(parent_pattern, *args)
+      @parent_pattern = parent_pattern
+      #If the example type is not explicitly defined in the pattern definition,
+      #try to determine it automatically from the example
+      @example_type = (args[0] == nil ? Filter.determine_example_type(parent_pattern.example) :
+                                        args[0][:example_type])
+      @regexp = parent_pattern.example if @example_type == EXAMPLE_TYPE_REGEXP
+      @xpath = nil #The xpath to evaluate this filter
+      #temp sinks are used for the initial run when determining the XPaths for examples;
+      @temp_sink = nil
+      @constraints = [] #list of constraints
+    end
+    #Evaluate this filter. This method shoulf not be called directly - as the pattern hierarchy
+    #is evaluated, every pattern evaluates its filters and then they are calling this method
+    def evaluate(source)
+      case @parent_pattern.type
+        when Scrubyt::Pattern::PATTERN_TYPE_TREE
+          result = source/@xpath
+          result.class == Hpricot::Elements ? result.map : [result]
+        when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
+          [source.attributes[@parent_pattern.example]]
+        when Scrubyt::Pattern::PATTERN_TYPE_REGEXP
+          source.inner_text.scan(@regexp).flatten
+      end
+    end
+    #For all the tree patterns, generate an XPath based on the given example
+    #Also this method should not be called directly; It is automatically called for every tree
+    #pattern directly after wrapper definition
+    def generate_XPath_for_example
+      case @example_type
+        when EXAMPLE_TYPE_XPATH
+          @xpath = @parent_pattern.example
+        when EXAMPLE_TYPE_STRING
+          @temp_sink = XPathUtils.find_node_from_text( @parent_pattern.root_pattern.source[0], @parent_pattern.example )
+          @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
+                                                 XPathUtils.generate_XPath(@temp_sink, nil, true)
+        when EXAMPLE_TYPE_CHILDREN
+          all_child_temp_sinks = []
+          @parent_pattern.children.each do |child_pattern|
+            child_pattern.filters.each do |filter|
+              all_child_temp_sinks << filter.temp_sink
+            end
+          end
+          result = all_child_temp_sinks.pop
+          if all_child_temp_sinks.empty?
+            result = result.parent
+          else
+            all_child_temp_sinks.each do |child_sink|
+              result = XPathUtils.lowest_common_ancestor(result, child_sink)
+            end
+          end
+          @temp_sink = result
+          @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
+                                                 XPathUtils.generate_XPath(@temp_sink, nil, true)
+          @parent_pattern.children.each do |child_pattern|
+            child_pattern.filters.each do |filter|
+                filter.xpath =
+                  child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(filter.temp_sink, result) :
+                                             XPathUtils.generate_relative_XPath(filter.temp_sink, result)
+            end
+          end
+        when EXAMPLE_TYPE_IMAGE
+          @temp_sink = XPathUtils.find_image(@parent_pattern.root_pattern.source[0], @parent_pattern.example)
+          @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
+      end
+    end
+    #Dispatcher method to add constraints; of course, as with any method_missing, this method
+    #should not be called directly
+    def method_missing(method_name, *args, &block)
+      constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
+    end
+private
+    def self.determine_example_type(example)
+      if example.instance_of? Regexp
+        EXAMPLE_TYPE_REGEXP
+      else
+        case example
+          when nil
+            EXAMPLE_TYPE_CHILDREN
+          when /\.(jpg|png|gif|jpeg)$/
+            EXAMPLE_TYPE_IMAGE
+          when /^\/{1,2}[a-z]+(\[\d+\])?(\/{1,2}[a-z]+(\[\d+\])?)*$/
+            (example.include? '/' || example.include?('[')) ? EXAMPLE_TYPE_XPATH : EXAMPLE_TYPE_STRING
+          else
+            EXAMPLE_TYPE_STRING
+        end
+      end
+    end #End of method determine_example_type
+  end #End of class
+end #End of module

data/lib/scrubyt/pattern.rb ADDED

@@ -0,0 +1,263 @@
+require 'rubygems'
+require 'hpricot'
+require 'open-uri'
+module Scrubyt
+  ##
+  #=<tt>Group more filters into one</tt>
+  #
+  #Serves as an umbrella for filters which are conceptually extracting
+  #the same thing - for example a price or a title or ...
+  #
+  #Sometimes the same piece of information can not be extracted with one filter
+  #across several result instances (for example a price has an XPath in record n,
+  #but since record n+1 has a discount price as well, the real price is pushed
+  #to a different XPath etc) - in this case the filters which extract the same
+  #thing are held in the same pattern.
+  class Pattern
+    #Type of the pattern;
+    # a root pattern represents a (surprise!) root pattern
+    PATTERN_TYPE_ROOT = 0
+    # a tree pattern represents a HTML region
+    PATTERN_TYPE_TREE = 1
+    # represents an attribute of the node extracted by the parent pattern
+    PATTERN_TYPE_ATTRIBUTE = 2
+    # represents a pattern which filters its output with a regexp
+    PATTERN_TYPE_REGEXP = 3
+    #The pattern can be either a model pattern (in this case it is
+    #written to the output) or a temp pattern (in this case it is skipped)
+    #Will be implemented in a higher version (i.e. not 0.1.0) - for now, everything
+    #is considered to be a model pattern
+    #Model pattern are shown in the output
+    OUTPUT_TYPE_MODEL = 0
+    #Temp patterns are skipped in the output (their descendants are appended to the parent
+    #of the pattern which was skipped)
+    OUTPUT_TYPE_TEMP = 1
+    #These fields can be set upon wrapper creation - i.e. a field which is public but not contained here can be accessed
+    #from outside, but not set as a result of wrapper construction
+    SETTABLE_FIELDS = ['generalize', 'type', 'output_type', 'example']
+    attr_accessor :name, :output_type, :generalize, :children, :filters, :parent,
+                  :last_result, :result, :root_pattern, :example,  :block_count,
+                  :next_page, :limit, :extractor, :extracted_docs, :source, :sink
+    attr_reader :type, :generalize_set, :next_page_url
+    def initialize (name, *args)
+      @name = name                #name of the pattern
+      parse_args(args)            #parse the rest of the arguments
+      @root_pattern = nil         #root pattern of the wrapper
+      @children = []              #child patterns
+      @filters = []               #filters of the wrapper
+      @sink = []                  #output of a pattern
+      @source = []                #input of a pattern
+      @result = Result.new        #hierarchical results of the pattern
+      @@instance_count = Hash.new(0)
+      @next_page = nil
+      filters << Scrubyt::Filter.new(self) #create a filter
+    end
+    #Parse the args passed as *args; There is only one compulsory parameter to pattern: it's name
+    #All the other parameters can (but do not have to) be specified;
+    #
+    #If an example is specified, it *MUST* be the first parameter; the order of the other
+    #parameters is irrelevant
+    def parse_args(args)
+      #If an example id defined, not only get it but also remove it so it
+      #does not interfere with the other possible string parameters
+      @example = args.delete_at(0) if args[0].instance_of? String
+      @example = args.delete_at(0) if args[0].instance_of? Regexp
+      args.each do |arg|
+        arg.each do |k,v|
+          #Set only the setable fields
+          if SETTABLE_FIELDS.include? k.to_s
+            #If the user is specifying a pattern type, turn it into the corresponding constant
+            v = "PATTERN_TYPE_#{v.to_s.upcase!}" if k.to_s == 'type'
+            v = "OUTPUT_TYPE_#{v.to_s.upcase!}" if k.to_s == 'output_type'
+            #Otherwise, if nothing special is happening, isntance_eval the hash pair
+            instance_eval("@#{k.to_s} = #{v}")
+          end
+          #This flags says that the user explicitly wants to set generalization on a pattern
+          #In this case, of course, our heuristics do not apply - the users setting overrides
+          #it
+          @generalize_set = true if (k.to_s == 'generalize')
+        end
+      end
+      #default settings - the user can override them, but if she did not do so,
+      #we will setup some meaningful defaults
+      @type ||= PATTERN_TYPE_TREE
+      @type = PATTERN_TYPE_REGEXP if @example.instance_of? Regexp
+      @output_type ||= OUTPUT_TYPE_MODEL
+      #don't generalize by default
+      @generalize ||= false
+      #This flag indicates that the user set 'generalize' to some value;
+      #This way we can ensure that the explicit setting will not be overridden
+      @generalize_set ||= false
+    end
+    #Dispatcher function; The class was already too big so I have decided to factor
+    #out some methods based on their functionality (like output, adding constraints)
+    #to utility classes.
+    #
+    #The second function besides dispatching is to lookup the results in an evaluated
+    #wrapper, for example
+    #
+    # camera_data.item[1].item_name[0]
+    def method_missing(method_name, *args, &block)
+      case method_name.to_s
+      when /^to_/
+        Scrubyt::ResultDumper.send(method_name.to_s, self)
+      when /^ensure_/
+        Scrubyt::ConstraintAdder.send(method_name, self, *args)
+      else
+        @children.each { |child| return child if child.name == method_name.to_s }
+        nil
+      end
+    end
+    #Companion function to the previous one (Pattern::method_missing). It makes
+    #inspecting results, like
+    #
+    #    camera_data.item[1].item_name[0]
+    #
+    #possible. The method Pattern::method_missing handles the 'item', 'item_name' etc.
+    #parts, while the indexing ([1], [0]) is handled by this function
+    #
+    #The results are looked up relative to the last result of the parent pattern; the
+    #one at _index_ is remembered as the last result of this pattern. Returns self, so
+    #the calls can be chained, or nil if nothing is recorded for the parent's last result
+    #NOTE(review): this reads @parent.last_result without a nil check - presumably it is
+    #never called on a pattern without a parent; confirm
+    def [](index)
+      return nil if (@result.lookup(@parent.last_result)) == nil
+      @last_result = @result.lookup(@parent.last_result)[index]
+      self
+    end
+    ##
+    #If export is called on the root pattern, it exports the whole extractor where it is
+    #defined; See export.rb for further details on the parameters
+    #
+    #All the work is delegated to Scrubyt::Export.export, which receives this pattern
+    #along with the three parameters
+    def export(file, output_file_name=nil, extractor_result_file_name=nil)
+      Scrubyt::Export.export(file, self, output_file_name, extractor_result_file_name)
+    end
+    ##
+    #Add a filter to this pattern
+    def add_filter(filter)
+      @filters << filter
+      return self
+    end
+    ##
+    #Add a child pattern to this pattern
+    def add_child_pattern(child)
+      child.parent = self
+      #by default, generalize direct children of the root pattern, but only in the case if
+      #@generalize was not set up explicitly
+      child.generalize = true if (!child.generalize_set && child.parent != nil && child.parent.parent == nil)
+      @children << child
+    end
+    ##
+    #Crawl to a new page. This function should not be called from the outside - it is automatically called
+    #if the next_page is defined
+    def crawl_to_new_page
+      temp_document = generate_next_page_link(@next_page)
+      return nil if temp_document == nil
+      clear_sources_and_sinks(@root_pattern)
+      @root_pattern.extractor.fetch(temp_document, nil)
+      attach_current_document
+    end
+    ##
+    #Attach document to the root pattern; This is happening automatically as the root pattern is defined or
+    #crawling to a new page
+    def attach_current_document
+      doc = @root_pattern.extractor.get_hpricot_doc
+      @source << doc
+      @sink << doc
+      @last_result ||= []
+      @last_result << doc
+      @result.add_result(@source, @sink)
+    end
+    ##
+    #Based on the given examples, calculate the XPaths for the tree patterns
+    #
+    #Three steps: make sure the root pattern is known, store it in every pattern
+    #of the wrapper, then generate the XPaths starting from the root
+    def setup_examples
+      get_root_pattern(self)
+      set_root_pattern_whole_wrapper(@root_pattern, @root_pattern)
+      generate_examples(@root_pattern)
+    end
+    ##
+    #Evaluate the pattern. This means evaluating all the filters and adding
+    #their extracted instances to the array of results of this pattern
+    def evaluate
+      #No need to evaluate if there is no parent pattern
+      return if @parent == nil
+      @source = @parent.sink
+      @source.each do |source|
+        @filters.each do |filter|
+          r = filter.evaluate(source)
+          if filter.constraints.size > 0
+            #in the beginning, keys of result_hash are made up of all the results of the filter
+            #with value = true; Later on, only those results will have 'true' value which are
+            #accepted with all filters
+            result_hash = {}
+            r.each { |res| result_hash[res] = true }
+            result_hash.keys.each do |res|
+              filter.constraints.each { |constraint| result_hash[res] &&= constraint.check(res) }
+            end
+            result = result_hash.reject {|k,v| k if !v}
+            sorted_result = r.reject {|e| !result.keys.include? e}
+            add_result(source, sorted_result)
+          else
+            add_result(source, r)
+          end
+        end
+      end
+    end
+    #Returns the class level hash (pattern name => count) which add_result increments
+    #for every extracted result
+    def get_instance_count
+      @@instance_count
+    end
+private
+    def add_result(source, results)
+      results.each do |res|
+          @sink << res
+        @result.add_result(source, res)
+        @@instance_count[@name] += 1
+      end
+    end
+    def get_root_pattern(pattern)
+      if @root_pattern == nil
+        while (pattern.parent != nil)
+          get_root_pattern(pattern.parent)
+        end
+        @root_pattern = pattern
+      end
+    end
+    def set_root_pattern_whole_wrapper(pattern, root_pattern)
+      pattern.children.each {|child| set_root_pattern_whole_wrapper(child, root_pattern)}
+      pattern.root_pattern = root_pattern
+    end
+    def generate_examples(pattern)
+      pattern.children.each {|child_pattern| generate_examples(child_pattern) }
+      pattern.filters.each { |filter| filter.generate_XPath_for_example } if pattern.type == PATTERN_TYPE_TREE
+    end
+    def clear_sources_and_sinks(pattern)
+      pattern.source = []
+      pattern.sink = []
+      pattern.children.each {|child| clear_sources_and_sinks child}
+    end
+    def generate_next_page_link(example)
+      node = XPathUtils.find_node_from_text(@root_pattern.source[0], example)
+      return nil if node == nil
+      node.attributes['href']
+    end # end of method generate_next_page_link
+  end #end of class Pattern
+end #end of module Scrubyt