RubyGems - scrubyt - Versions diffs - 0.2.3 → 0.2.6 - Mend

scrubyt 0.2.3 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

data/CHANGELOG +30 -0
data/Rakefile +2 -2
data/lib/scrubyt.rb +5 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +13 -2
data/lib/scrubyt/core/navigation/navigation_actions.rb +4 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/filter.rb +35 -11
data/lib/scrubyt/core/scraping/pattern.rb +29 -22
data/lib/scrubyt/core/scraping/result_indexer.rb +2 -0
data/lib/scrubyt/core/shared/evaluation_context.rb +44 -22
data/lib/scrubyt/core/shared/extractor.rb +111 -15
data/lib/scrubyt/core/shared/u_r_i_builder.rb +67 -0
data/lib/scrubyt/output/export.rb +69 -22
data/lib/scrubyt/output/result.rb +1 -0
data/lib/scrubyt/output/result_dumper.rb +26 -7
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/shared_utils.rb +45 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +23 -0
data/lib/scrubyt/utils/xpathutils.rb +43 -92
data/test/unittests/simple_example_lookup_test.rb +68 -0
data/test/unittests/xpathutils_test.rb +0 -13
metadata +9 -3

data/lib/scrubyt/core/shared/extractor.rb CHANGED

@@ -1,4 +1,3 @@
-require 'logger'
 require 'open-uri'
 require 'rubygems'
 require 'mechanize'
@@ -17,6 +16,11 @@ module Scrubyt
     #The definition of the extractor is passed through this method
     def self.define(mode=nil, &extractor_definition)
       @@mode = mode
+      #We are keeping the relations between the detail patterns and their root patterns
+      @@detail_extractor_to_pattern_name = {}
+      @@detail_pattern_relations = {}
+      #root pattern -> URIBuilder mapping
+      @@next_patterns = {}
       mode_name = (mode == :production ? 'Production' : 'Learning')
       puts "[MODE] #{mode_name}"
       NavigationActions.new
@@ -41,6 +45,47 @@ module Scrubyt
       root_pattern
     end
+  #Evaluate a subextractor (i.e. an extractor on a detail page).
+  #The url passed to this function is automatically loaded.
+  #The definition of the subextractor is passed as a block
+  #
+  #!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
+  def self.evaluate_subextractor(url, parent_pattern)
+    if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
+      detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
+      detail_root.result = Result.new
+      detail_root.last_result = nil
+      @@original_evaluation_context.push @@evaluation_context
+      @@evaluation_context = EvaluationContext.new
+      @@evaluation_context.clear_sources_and_sinks detail_root
+      FetchAction.restore_host_name
+      fetch url
+      @@evaluation_context.extractor = self
+      @@evaluation_context.root_pattern = detail_root
+      @@evaluation_context.attach_current_document
+      evaluate_extractor detail_root
+      @@evaluation_context = @@original_evaluation_context.pop
+      detail_root.to_xml
+    else
+      @@original_evaluation_context ||= []
+      FetchAction.restore_host_name
+      @@original_evaluation_context.push @@evaluation_context
+      @@evaluation_context = EvaluationContext.new
+      fetch url
+      evaluated_extractor = (class_eval(&parent_pattern.referenced_extractor))
+      root_pattern = evaluated_extractor.parent
+      @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern.children[0]
+      @@evaluation_context.setup_examples
+      evaluate_extractor(root_pattern)
+      #Apply all postprocess steps
+      PostProcessor.apply_post_processing(root_pattern)
+      #Return the root pattern
+      #puts "Extracted detail page"
+      @@evaluation_context = @@original_evaluation_context.pop
+      root_pattern.to_xml
+    end
+  end
   #build the current wrapper
   def self.method_missing(method_name, *args, &block)
     if NavigationActions::KEYWORDS.include? method_name.to_s
@@ -48,22 +93,25 @@ module Scrubyt
       return
     end
     pattern = Scrubyt::Pattern.new(method_name.to_s, *args)
+    check_if_shortcut_pattern(pattern)
+    check_if_detail_page(pattern, args)
     pattern.evaluation_context = @@evaluation_context
     if @parent == nil
       if method_name.to_s == 'next_page'
-        @@evaluation_context.next_page = args[0]
-        @@evaluation_context.limit =
-          args[1][:limit] if args.size > 1
+        @@evaluation_context.setup_uri_builder(pattern, args)
+        @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
+        p @@last_root_pattern.children[0].name
         return @@last_pattern
       else
         #Create a root pattern
         root_pattern = Scrubyt::Pattern.new('root', :type => :root)
+        @@last_root_pattern = root_pattern
         root_pattern.evaluation_context = @@evaluation_context
         @@evaluation_context.root_pattern = root_pattern
         @@evaluation_context.extractor = self
         #add the currently active document to the root pattern
         @@evaluation_context.attach_current_document
-        @@evaluation_context.root_pattern.add_child_pattern(pattern)
+        @@evaluation_context.root_pattern.add_child_pattern(pattern)
         @@evaluation_context.block_count = 0
       end
     else
@@ -80,28 +128,76 @@ module Scrubyt
     end
     @@last_pattern = pattern
   end
-  #Used in lord of the hacks vol 1. Check out export.rb if you are still interested
-  #(You should not be :)
-  def self.get_block_count
-    @@root_pattern.block_count
+  #Shortcut patterns, as their name says, are a shortcut for creating patterns
+  #from predefined rules; for example:
+  #
+  #  detail_url
+  #
+  #  is equivalent to
+  #
+  #  detail_url 'href', :type => :attribute
+  #
+  #i.e. the system figures out on its own that because of the postfix, the
+  #example should be looked up (but it should never override the user input!)
+  #another example (will be available later):
+  #
+  # every_img
+  #
+  # is equivalent to
+  #
+  # every_img '//img'
+  #
+  def self.check_if_shortcut_pattern(pattern)
+    case pattern.name
+      when /.+_url/
+        #make sure that we are not overriding the user's settings
+        if !pattern.examples
+          pattern.filters[0].example = 'href'
+          pattern.type = Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE
+        end
+    end
   end
+  #Check whether the currently created pattern is a detail pattern (i.e. it references
+  #a subextractor). Also check if the currently created pattern is
+  #an ancestor of a detail pattern, and store this in a hash if yes (to be able to
+  #traverse the pattern structure on detail pages as well).
+  def self.check_if_detail_page(pattern, args)
+    return if args.size == 0
+    return if !args[0].is_a? Hash
+    return if !args[0][:references]
+    referenced_extractor = args[0][:references]
+    pattern.type = Scrubyt::Pattern::PATTERN_TYPE_DETAIL
+    pattern.referenced_extractor = referenced_extractor
+    @@detail_extractor_to_pattern_name[referenced_extractor] ||= []
+    @@detail_extractor_to_pattern_name[referenced_extractor] = @@detail_extractor_to_pattern_name[referenced_extractor] << pattern
+  end
   def self.get_hpricot_doc
     NavigationActions.get_hpricot_doc
   end
+  def self.get_current_doc_url
+    NavigationActions.get_current_doc_url
+  end
+  def self.get_detail_pattern_relations
+    @@detail_pattern_relations
+  end
   def self.get_mode
     @@mode
-  end
+  end
 private
-    def self.evaluate_extractor(root_pattern)
-      if @@evaluation_context.next_page
+    def self.evaluate_extractor(root_pattern)
+      if @@next_patterns[root_pattern]
         current_page_count = 1
         loop do
           really_evaluate_extractor(root_pattern)
-          break if (@@evaluation_context.limit == current_page_count || @@evaluation_context.crawl_to_new_page == nil)
-          current_page_count += 1 if @@evaluation_context.limit != nil
+          break if (@@next_patterns[root_pattern].limit == current_page_count || @@evaluation_context.crawl_to_new_page(root_pattern, @@next_patterns[root_pattern]) == nil)
+          current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
         end
       else
         really_evaluate_extractor(root_pattern)

data/lib/scrubyt/core/shared/u_r_i_builder.rb ADDED

@@ -0,0 +1,67 @@
+module Scrubyt
+  ##
+  #=<tt>Build URIs from different parameters</tt>
+  #
+  #When crawling to further pages which are machine-generated
+  #(most typically "next" pages) we need to detect the pattern
+  #and generate the next URI based on the detected rule. This
+  #class provides methods to build URIs based on different criteria.
+  #
+  #The other possibility is to use constant objects ('Next' links,
+  #or image links (like right arrow) pointing to the next page).
+  #URIBuilder supports both possibilities.
+  class URIBuilder
+    attr_reader :next_page_example, :next_page_pattern, :limit, :next_param, :next_increment, :increment, :current_uri
+    def initialize(pattern,args)
+      if args[0] =~ /^http.+/
+        #Figure out how the URLs are generated based on the next URL
+        get_next_param(string_diff(args[0], args[1]))
+        @increment = 0
+        @current_uri = args[1]
+        @limit = args[2][:limit] if args.size > 2
+      else
+        #Otherwise, do this in the 'classic' way (by clicking on the "next" link)
+        @next_page_pattern = pattern
+        @next_page_example = args[0]
+        @limit = args[1][:limit] if args.size > 1
+      end
+    end
+    #Used when generating the next URI (as opposed to 'clicking' the next link)
+    def generate_next_uri
+      @increment += @next_increment
+      return @current_uri if @increment == @next_increment
+      @next_increment = 1 if @next_increment == 2
+      if @current_uri !~ /#{@next_param}/
+        @current_uri += (@next_param + '=' + @next_increment.to_s)
+      else
+        @current_uri = @current_uri.sub(/#{@next_param}=#{@increment-@next_increment}/) do
+          "#{@next_param}=#{@increment}"
+        end
+      end
+    end
+private
+    def get_next_param(pair)
+      param_and_value = pair.split('=')
+      @next_param = param_and_value[0]
+      @next_increment = param_and_value[1].to_i
+    end
+    def find_difference_index(s1,s2)
+      cmp = s2.scan(/./).zip(s1.scan(/./))
+      i = 0
+      loop do
+        return i if cmp[i][0] != cmp[i][1]
+        i+=1
+      end
+    end
+    def string_diff(s1,s2)
+      s2[find_difference_index(s1, s2)..s2.size-find_difference_index(s1.reverse, s2.reverse)-1]
+    end #end of method string_diff
+  end #end of class URIBuilder
+end #end of module Scrubyt

data/lib/scrubyt/output/export.rb CHANGED

@@ -70,6 +70,7 @@ module Scrubyt
       output_file = output_file_name == nil ? open("#{wrapper_name}_extractor_export.rb", 'w') :
                                               open(output_file_name, 'w')
       export_header(output_file)
+      export_subextractors(contents, pattern, output_file)
       export_extractor(contents, pattern, output_file)
       export_footer(output_file, wrapper_name, extractor_result_file_name)
       cleanup_result
@@ -85,8 +86,27 @@ private
     def self.cleanup_result
       @result.gsub!('P.') {}
+      CompoundExample::DESCRIPTORS.each {|d|
+        @result.gsub!(/,\s*:#{d.to_s}.+?'.+?'/) {}
+      }
     end
+    def self.export_subextractors(contents, pattern, output_file)
+      all_subextractor_code = contents.scan(/.+=\s+lambda.+Extractor\.define/m)
+      return if all_subextractor_code.empty?
+      all_subextractor_code = all_subextractor_code[0].split("\n")
+      pure_subextractor_code = []
+      meaningful_code = false
+      all_subextractor_code.each do |sec|
+        meaningful_code = true if sec =~ /lambda/
+        meaningful_code = false if sec =~ /Extractor.define/
+        pure_subextractor_code << sec if meaningful_code
+      end
+      add_P pure_subextractor_code
+      substitute_examples_with_XPaths(pattern,pure_subextractor_code)
+    end
     #OK, I have to admit: this function is powered by woodo magic. A lots of woodoo magic.
     #Piles of tons of heaps of woodoo magic :-)
     #
@@ -111,26 +131,16 @@ private
       #end (to close the block of the extractor definition)
       count = pattern.evaluation_context.block_count + 1
       #Construct the extractor definition matching regexp based on the number of ends
-      definition = contents.scan(/Extractor\.define(?:.*?(?:\}|end)){#{count.to_s}}/m)
+      definition = contents.scan(/Extractor\.define(?:.*?(?:\}|\s+end)){#{count.to_s}}/m)
       #Since the regexp matching the extractor definition was multiline, get the first
       #line separately and patch it in!
       rows = definition[0].split("\n")
-      #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
-      #patterns could be matched very easily from the extractor definition (because they begun
-      #with 'P.'). Now that P has been removed, mimick it!
-      rows.each do |row|
-        #Do not prepend P. to comments and empty lines
-        next if (row.strip =~ /^#/ || row.strip == '')
-        #Do not prepend P. to any of the reserved keywords
-        jump_to_next = false
-        NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
-        next if jump_to_next
-        #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
-        row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
-        #Don't forget also the stuff in parentheses!
-        row.gsub!(/\{\s+/) {"{P."}
-      end
+      add_P(rows)
       rows[0] = first_line
+      substitute_examples_with_XPaths(pattern,rows)
+    end
+    def self.substitute_examples_with_XPaths(pattern,rows)
       #@full_definition holds the original definition (at this point, later on it will be
       #gsub!bed and all)
       @full_definition = rows.join("\n")
@@ -146,9 +156,10 @@ private
         replace_example_with_xpath(name, xpaths, %q{'})
       end
       #Finally, add XPaths to pattern which had no example at the beginning (the XPath was
-      #generated from the child patterns
+      #generated from the child patterns)
       @name_to_xpath_map.each do |name, xpaths|
         xpaths.reverse.each do |xpath|
+          next if !@full_definition.include? "P.#{name}"
           comma = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0].sub('do'){}.strip == '' ? '' : ','
           if (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{')
             @full_definition.sub!("P.#{name}") {"P.#{name}('#{xpath}')"}
@@ -157,8 +168,8 @@ private
           end
         end
       end
-      @result += @full_definition
-    end
+      @result += @full_definition
+    end
     def self.export_footer(output_file, wrapper_name, extractor_result_file_name)
       if extractor_result_file_name
@@ -167,20 +178,56 @@ private
         @result += "\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
       end
     end
+    def self.add_P(rows)
+      #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
+      #patterns could be matched very easily from the extractor definition (because they begun
+      #with 'P.'). Now that P has been removed, mimick it!
+      rows.each do |row|
+        #Do not prepend P. to comments and empty lines
+        next if (row.strip =~ /^#/ || row.strip == '')
+        #Do not prepend P. to any of the reserved keywords
+        jump_to_next = false
+        NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
+        jump_to_next = true if row =~ /lambda/
+        next if jump_to_next
+        #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
+        row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
+        #Don't forget also the stuff in parentheses!
+        row.gsub!(/\{\s+/) {"{P."}
+      end
+    end
     def self.create_name_to_xpath_map(pattern)
+      puts " Cereating mapping for: #{pattern.name}"
       @name_to_xpath_map[pattern.name] = []
       pattern.filters.each do |filter|
         @name_to_xpath_map[pattern.name] << filter.xpath if pattern.filters[0].xpath != nil
+      end
+      pattern.children.each {|child| create_name_to_xpath_map child}
+      if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
+        puts pattern.name
+        puts "-------"
+        puts pattern.evaluation_context.extractor.get_detail_pattern_relations.each  {|k,v|
+          if k.include? pattern
+            v.parent.children.each do |child|
+              create_name_to_xpath_map child
+            end
+          end
+        }
+        #pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
+        #  create_name_to_xpath_map child
+        #end
       end
-      pattern.children.each {|child| create_name_to_xpath_map child}
-    end
+    end
     def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
       return if name=='root'
+      return if !@full_definition.include? "P.#{name}"
       parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
-      if parens.empty?
+      if parens.empty?
         full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
       else
         full_line = parens[0][0]

data/lib/scrubyt/output/result.rb CHANGED

@@ -11,6 +11,7 @@ module Scrubyt
     def add_result(source, result)
       @childmap.each do |hash|
         if hash.keys[0] == source
+          return if hash[source] == nil
           hash[source] << result if !hash[source].include? result
           return
         end

data/lib/scrubyt/output/result_dumper.rb CHANGED

@@ -50,7 +50,7 @@ module Scrubyt
 private
     def self.to_xml_recursive(pattern, element)
-      pattern.children.each do |child|
+      pattern.children.each do |child|
         childresults = child.result.lookup(child.parent.last_result)
         #Output text for leaf nodes only; Maybe add possibility to customize this later
         if (childresults == nil)
@@ -72,19 +72,38 @@ private
           if child.last_result.instance_of? String
             res = child.last_result
           else
-            child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
+            if child.last_result.respond_to? 'traverse_text'
+              child.last_result.traverse_text { |t| res += t.to_s } if child.last_result != nil
+            else
+              child.last_result.children.each { |c| element.add_element c }
+            end
           end
           child_node = REXML::Element.new(child.name)
-          child_node.text = (res.gsub('&nbsp;'){' '}).strip if (child.children.size == 0)
-          element.add_element(child_node)
+          child_node.text = (res.gsub('&nbsp;'){' '}).strip if write_text_criteria_met(child)
+          element.add_element(child_node) if child.type != Scrubyt::Pattern::PATTERN_TYPE_DETAIL
           to_xml_recursive(child, child_node)
         end
     end
+    def self.write_text_criteria_met(pattern)
+      if (pattern.write_text == nil)
+        return pattern.children.size == 0
+      else
+        pattern.write_text
+      end
+    end
     def self.print_statistics_recursive(pattern, depth)
-      if pattern.name != 'root'
-        count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
-        puts((' ' * "#{depth}".to_i) +  "#{pattern.name} extracted #{count} instances.")
+      if pattern.name != 'root'
+        if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
+          pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
+            print_statistics_recursive(child, depth)
+        end
+        else
+          count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
+          puts((' ' * "#{depth}".to_i) +  "#{pattern.name} extracted #{count} instances.")
+        end
       end
       pattern.children.each do |child|