RubyGems - scrubyt - Versions diffs - 0.2.8 → 0.3.0 - Mend

scrubyt 0.2.8 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

data/CHANGELOG +32 -2
data/Rakefile +25 -20
data/lib/scrubyt.rb +24 -5
data/lib/scrubyt/core/navigation/fetch_action.rb +76 -42
data/lib/scrubyt/core/navigation/navigation_actions.rb +24 -6
data/lib/scrubyt/core/scraping/filters/base_filter.rb +5 -5
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +2 -2
data/lib/scrubyt/core/scraping/filters/download_filter.rb +2 -1
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -2
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +37 -12
data/lib/scrubyt/core/scraping/pattern.rb +82 -90
data/lib/scrubyt/core/scraping/pre_filter_document.rb +2 -1
data/lib/scrubyt/core/shared/evaluation_context.rb +14 -37
data/lib/scrubyt/core/shared/extractor.rb +55 -54
data/lib/scrubyt/logging.rb +16 -0
data/lib/scrubyt/output/export.rb +1 -1
data/lib/scrubyt/output/post_processor.rb +6 -5
data/lib/scrubyt/output/result.rb +1 -0
data/lib/scrubyt/output/result_dumper.rb +4 -3
data/lib/scrubyt/output/result_node.rb +73 -0
data/lib/scrubyt/output/scrubyt_result.rb +28 -0
data/lib/scrubyt/utils/ruby_extensions.rb +8 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +14 -1
data/lib/scrubyt/utils/xpathutils.rb +11 -0
metadata +7 -12
data/test/unittests/constraint_test.rb +0 -107
data/test/unittests/extractor_test.rb +0 -91
data/test/unittests/filter_test.rb +0 -79
data/test/unittests/input/constraint_test.html +0 -55
data/test/unittests/input/test.html +0 -39
data/test/unittests/pattern_test.rb +0 -27
data/test/unittests/simple_example_lookup_test.rb +0 -68
data/test/unittests/xpathutils_test.rb +0 -152

data/lib/scrubyt/core/scraping/pre_filter_document.rb CHANGED

@@ -2,12 +2,13 @@ module Scrubyt
   ##
   #=<tt>Apply different functions on the input document</tt>
   #Before the document is passed to Hpricot for parsing, we may need
-  #to do different stuff with it which are clumsy/not appropriate/impossible
+  #to do different stuff with it which are clumsy/not appropriate/impossible
   #to do once the document is loaded.
   class PreFilterDocument
      #Replace <br/> tags with newlines
      def self.br_to_newline(doc)
        doc.gsub(/<br[ \/]*>/i, "\r\n")
+       doc = doc.tr("\240"," ")
      end #end of function  br_to_newline
   end #end of class PreFilterDocument
 end #end of module Scrubyt

data/lib/scrubyt/core/shared/evaluation_context.rb CHANGED

@@ -5,7 +5,7 @@ module Scrubyt
   #Every kind of data that is shared among patterns during the extraction process
   #is held in this class, so it can be looked up anytime.
   #
-  #This class provides also some high-level basic functionality in navigation, like
+  #This class provides also some high-level basic functionality in navigation, like
   #crawling to new pages, attaching document to the root pattern once arrived at the
   #desired page etc.
   #
@@ -14,7 +14,7 @@ module Scrubyt
   #and this is accomplished through EvaluationContext.
   class EvaluationContext
     attr_accessor :root_pattern, :document_index, :extractor, :uri_builder, :evaluating_extractor_definition
     def initialize
       @root_pattern = nil
       @next_page = nil
@@ -22,54 +22,31 @@ module Scrubyt
       @extractor = nil
       @evaluating_extractor_definition = false
     end
     ##
     #Crawl to a new page. This function should not be called from the outside - it is automatically called
     #if the next_page pattern is defined
-    def crawl_to_new_page(root_pattern, uri_builder)
-      temp_document = uri_builder.next_page_example ?
-                        generate_next_page_link(uri_builder) :
+    def crawl_to_new_page(uri_builder)
+      #puts "Crawling to new page!"
+      #puts "example #{uri_builder.next_page_example}"
+      temp_document = uri_builder.next_page_example ?
+                        generate_next_page_link(uri_builder) :
                         uri_builder.generate_next_uri
-      return nil if temp_document == nil
-      clear_sources_and_sinks(@root_pattern)
+      return false if temp_document == nil
       FetchAction.restore_host_name
       @extractor.fetch(temp_document)
-      attach_current_document
+      return true
     end
-    ##
-    #Attach document to the root pattern; This is happening automatically as the root pattern is defined or
-    #crawling to a new page
-    def attach_current_document
-      doc = @extractor.get_hpricot_doc
-      @root_pattern.filters[0].source << doc
-      @root_pattern.filters[0].sink << doc
-      @root_pattern.last_result ||= []
-      @root_pattern.last_result << doc
-      @root_pattern.result.add_result(@root_pattern.filters[0].source,
-                                      @root_pattern.filters[0].sink)
-    end
-    ##
-    #After crawling to the new page, the sources and sinks need to be cleaned
-    #since they are no more valid
-    def clear_sources_and_sinks(pattern)
-      pattern.filters.each do |filter|
-        filter.source = []
-        filter.sink = []
-      end
-      pattern.children.each {|child| clear_sources_and_sinks child}
-    end
     def generate_next_page_link(uri_builder)
-      uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
+      return nil unless uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
       xpath = uri_builder.next_page_pattern.filters[0].xpath
       node = (@extractor.get_hpricot_doc/xpath).map.last
       node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
-      return nil if node == nil || node.attributes['href'] == nil
+      return nil if node == nil || node.attributes['href'] == nil
       node.attributes['href'].gsub('&amp;') {'&'}
-    end
+    end
     def setup_uri_builder(pattern,args)
       if args[0] =~ /^http.+/
         args.insert(0, @extractor.get_current_doc_url) if args[1] !~ /^http.+/

data/lib/scrubyt/core/shared/extractor.rb CHANGED

@@ -3,49 +3,56 @@ module Scrubyt
   #=<tt>Driving the whole extraction process</tt>
   #
   #Extractor is a performer class - it gets an extractor definition and carries
-  #out the actions and evaluates the wrappers sequentially.
+  #out the actions and evaluates the wrappers sequentially.
   #
   #Originally also the navigation actions were here, but since the class got too
   #big, they were factored out to an own class, NavigationAction.
-  class Extractor
+  class Extractor
     #The definition of the extractor is passed through this method
     def self.define(mode=nil, &extractor_definition)
       backtrace = SharedUtils.get_backtrace
       parts = backtrace[1].split(':')
       source_file = parts[0]
       @@mode = mode
       #We are keeping the relations between the detail patterns and their root patterns
       @@detail_extractor_to_pattern_name = {}
-      @@detail_pattern_relations = {}
+      @@detail_pattern_relations = {}
       #root pattern -> URIBuilder mapping
       @@next_patterns = {}
       mode_name = (mode == :production ? 'Production' : 'Learning')
-      puts "[MODE] #{mode_name}"
-      NavigationActions.new
+      Scrubyt.log :MODE, mode_name
       @@evaluation_context = EvaluationContext.new
-      #Hack up an artificial root pattern (i.e. do not return the pattern which
+      #Hack up an artificial root pattern (i.e. do not return the pattern which
       #is the root one in the user's definition, but rather the real (invisible)
       #root pattern
       @@evaluation_context.evaluating_extractor_definition = true
       class_eval(&extractor_definition)
       @@evaluation_context.evaluating_extractor_definition = false
       root_pattern = @@evaluation_context.root_pattern
       if root_pattern.nil?
-        puts "No extractor defined, exiting..."
+        # TODO: this should be an exception
+        Scrubyt.log :ERROR, 'No extractor defined, exiting...'
         exit
       end
       root_pattern.source_file = source_file
       root_pattern.source_proc = extractor_definition
       #Once all is set up, evaluate the extractor from the root pattern!
-      evaluate_extractor(root_pattern)
-      #Apply all postprocess steps
-      PostProcessor.apply_post_processing(root_pattern)
+      root_results = evaluate_extractor(root_pattern)
+      scrubyt_result = ScrubytResult.new('root')
+      scrubyt_result.push(*root_results)
+      scrubyt_result.root_pattern = root_pattern
       #Return the root pattern
-      puts "Extraction finished succesfully!"
-      root_pattern
+      Scrubyt.log :INFO, 'Extraction finished succesfully!'
+      scrubyt_result
     end
     #Evaluate a subextractor (i.e. an extractor on a detail page).
     #The url passed to this function is automatically loaded.
     #The definition of the subextractor is passed as a block
@@ -53,119 +60,113 @@ module Scrubyt
     #!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
     def self.evaluate_subextractor(url, parent_pattern, resolve)
       if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
-        detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
-        detail_root.result = Result.new
+        detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
         detail_root.last_result = nil
         FetchAction.store_page
         @@original_evaluation_context.push @@evaluation_context
         @@host_stack.push FetchAction.get_host_name
         @@evaluation_context = EvaluationContext.new
-        @@evaluation_context.clear_sources_and_sinks detail_root
         FetchAction.restore_host_name
         fetch url, :resolve => resolve
         @@evaluation_context.extractor = self
-        @@evaluation_context.root_pattern = detail_root
-        @@evaluation_context.attach_current_document
-        evaluate_extractor detail_root
+        @@evaluation_context.root_pattern = detail_root
+        root_results = evaluate_extractor detail_root
         @@evaluation_context = @@original_evaluation_context.pop
         FetchAction.restore_page
         FetchAction.store_host_name(@@host_stack.pop)
-        detail_root.to_xml
-      else
+        root_results
+      else
         @@original_evaluation_context ||= []
         @@host_stack ||= []
         FetchAction.store_page
         @@original_evaluation_context.push @@evaluation_context
         @@host_stack.push FetchAction.get_host_name
         @@evaluation_context = EvaluationContext.new
-        FetchAction.restore_host_name
+        FetchAction.restore_host_name
         fetch url, :resolve => resolve
-        evaluated_extractor = (class_eval(&parent_pattern.referenced_extractor))
-        root_pattern = evaluated_extractor.parent
-        @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern.children[0]
-        evaluate_extractor(root_pattern)
-        #Apply all postprocess steps
-        PostProcessor.apply_post_processing(root_pattern)
+        class_eval(&parent_pattern.referenced_extractor)
+        root_pattern = @@evaluation_context.root_pattern
+        @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern
+        root_results = evaluate_extractor(root_pattern)
         @@evaluation_context = @@original_evaluation_context.pop
         FetchAction.restore_page
         FetchAction.store_host_name(@@host_stack.pop)
-        root_pattern.to_xml
+        root_results
       end
     end
-    #build the current wrapper
+    #build the current wrapper
     def self.method_missing(method_name, *args, &block)
       if NavigationActions::KEYWORDS.include? method_name.to_s
         NavigationActions.send(method_name, *args)
         return
       end
       if method_name.to_s == 'next_page'
         pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context)
         pattern.evaluation_context = @@evaluation_context
         @@evaluation_context.setup_uri_builder(pattern, args)
         @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
       else
         raise "Only one root pattern allowed" if !@@evaluation_context.root_pattern.nil?
         #Create a root pattern
-        root_pattern = Scrubyt::Pattern.new('root', [:type => :root], @@evaluation_context)
+        @@evaluation_context.extractor = self
+        root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
         @@last_root_pattern = root_pattern
         @@evaluation_context.root_pattern = root_pattern
-        @@evaluation_context.extractor = self
-        #add the currently active document to the root pattern
-        @@evaluation_context.attach_current_document
-        pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
-        root_pattern.children << pattern
-        pattern
+        root_pattern
       end
     end
     def self.add_detail_extractor_to_pattern_name(referenced_extractor, pattern)
       @@detail_extractor_to_pattern_name[referenced_extractor] ||= [] << pattern
     end
     def self.get_detail_extractor(parent_pattern)
-      @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]].parent
+      @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
     end
     def self.get_hpricot_doc
       NavigationActions.get_hpricot_doc
     end
     def self.get_current_doc_url
       NavigationActions.get_current_doc_url
     end
     def self.get_detail_pattern_relations
       @@detail_pattern_relations
     end
     def self.get_host_name
       NavigationActions.get_host_name
     end
     def self.get_mode
       @@mode
     end
     def self.get_original_host_name
       @@original_host_name
     end
     private
     def self.evaluate_extractor(root_pattern)
+      root_results = []
       if @@next_patterns[root_pattern]
         current_page_count = 1
         loop do
-          root_pattern.evaluate(nil)
-          break if (@@next_patterns[root_pattern].limit == current_page_count || !@@evaluation_context.crawl_to_new_page(root_pattern, @@next_patterns[root_pattern]))
+          root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
+          break if (@@next_patterns[root_pattern].limit == current_page_count || !@@evaluation_context.crawl_to_new_page(@@next_patterns[root_pattern]))
           current_page_count += 1 if @@next_patterns[root_pattern].limit != nil
         end
       else
-        root_pattern.evaluate(nil)
+        root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
       end
+      root_results
     end
   end #end of class Extractor
-end #end of module Scrubyt
+end #end of module Scrubyt

data/lib/scrubyt/logging.rb ADDED

@@ -0,0 +1,16 @@
+module Scrubyt
+  def self.log(message_type, message)
+    pre = "[#{message_type}] "
+    if message.is_a? Array
+      puts pre + message.first
+      message[1..-1].each do |line|
+        puts ' ' * pre.length + line
+      end
+    else
+      puts pre + message.to_s
+    end
+  end
+end

data/lib/scrubyt/output/export.rb CHANGED

@@ -135,7 +135,7 @@ private
     end
     def self.export_pattern(root_pattern)
-      root_pattern.children[0].to_sexp
+      root_pattern.to_sexp
     end
   end
 end

data/lib/scrubyt/output/post_processor.rb CHANGED

@@ -1,5 +1,6 @@
 module Scrubyt
+########################################## NOT USED ANY MORE ##########################################
 require 'set'
 ##
 #=<tt>Post processing results after the extraction</tt>
@@ -46,11 +47,11 @@ require 'set'
     def self.report_if_no_results(root_pattern)
       results_found = false
       root_pattern.children.each {|child| return if (child.result.childmap.size > 0)}
-      puts
-      puts "!!!!!! WARNING: The extractor did not find any result instances"
-      puts "Most probably this is wrong. Check your extractor and if you are"
-      puts "sure it should work, report a bug!"
-      puts
+      Scrubyt.log :WARNING, [
+        "The extractor did not find any result instances. Most probably this is wrong.",
+        "Check your extractor and if you are sure it should work, report a bug!"
+      ]
     end
 private

data/lib/scrubyt/output/result.rb CHANGED

@@ -1,3 +1,4 @@
+########################################## NOT USED ANY MORE ##########################################
 module Scrubyt
   ##
   #=<tt>Represents the results of a pattern</tt>

data/lib/scrubyt/output/result_dumper.rb CHANGED

@@ -1,6 +1,7 @@
 require 'rexml/document'
 require 'rexml/xpath'
+########################################## NOT USED ANY MORE ##########################################
 module Scrubyt
   ##
   #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
@@ -45,7 +46,7 @@ module Scrubyt
       flat_csv_inner = lambda {|e, parts|
         content = e.text || ''
         parts << content if ((e.is_a? REXML::Element) && content != '')
-        e.children.each {|c| flat_hash_inner.call(c, parts) if c.is_a? REXML::Element }
+        e.children.each {|c| flat_csv_inner.call(c, parts) if c.is_a? REXML::Element }
         parts
       }
       to_xml(pattern).root.elements['/root'].each {|e| result << flat_csv_inner.call(e, []) }
@@ -55,7 +56,7 @@ module Scrubyt
     def self.to_hash(pattern)
       result = []
       flat_hash_inner = lambda {|e, parts|
-        content = e.text || ''
+        content = e.text ? REXML::Text.unnormalize(e.text) : ''
         if ((e.is_a? REXML::Element) && content != '')
           if parts[e.local_name]
             parts[e.local_name] = parts[e.local_name] + "," + content
@@ -141,7 +142,7 @@ private
           end
         else
           count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
-          puts((' ' * "#{depth}".to_i) +  "#{pattern.name} extracted #{count} instances.")
+          Scrubyt.log :INFO, (' ' * depth.to_i) + "#{pattern.name} extracted #{count} instances."
         end
       end

data/lib/scrubyt/output/result_node.rb ADDED

@@ -0,0 +1,73 @@
+module Scrubyt
+  class ResultNode < Array
+    OUTPUT_OPTIONS = [:write_text]
+    attr_accessor :name, :result, :options, :generated_by_leaf
+    def initialize(name, result=nil, options={})
+      @name = name
+      @result = result
+      @options = options
+    end
+    def write_text
+      @options[:write_text].nil? ? @generated_by_leaf : @options[:write_text]
+    end
+    def has_content?
+      return true if result.is_a? String
+      write_text || (inject(false) { |one_child_has_content, child| one_child_has_content || child.has_content? })
+    end
+    def to_s
+      text = (@result.is_a? String) ? @result : @result.inner_text
+      text = SharedUtils.unescape_entities(text)
+      text.strip!
+      text
+    end
+    def to_libxml
+      libxml_node = XML::Node.new(name)
+      self.each { |child| libxml_node << child.to_libxml if child.has_content? }
+      libxml_node << to_s if write_text
+      libxml_node
+    end
+    #note: see ruby_extensions.rb for String#write
+    def to_xml
+      to_xml_lines.join("\n")
+    end
+    def to_hash
+      result = []
+      flat_hash_inner = lambda {|e, hash|
+        hash[e.name.to_sym] = hash[e.name.to_sym] ? hash[e.name.to_sym] + "," + e.to_s : e.to_s  if e.write_text && !e.to_s.empty?
+        e.each {|c| flat_hash_inner.call(c, hash)  }
+        hash
+      }
+      self.each {|e| result << flat_hash_inner.call(e, {}) }
+      result
+    end
+    def to_xml_lines
+      lines = []
+      children = self.select{ |child| child.has_content? }
+      if children.empty?
+        if result.is_a? String
+          lines << "<#{name}>#{result}</#{name}>"
+        elsif write_text && !to_s.empty?
+          lines << "<#{name}>#{ERB::Util.html_escape(to_s)}</#{name}>"
+        else
+          lines << "<#{name}/>"
+        end
+      else
+        lines << "<#{name}>"
+        lines << "  #{ERB::Util.html_escape(to_s)}" if write_text && !to_s.empty?
+        children.each do |child|
+          lines.push(*child.to_xml_lines.map{ |line| "  #{line}" })
+        end
+        lines << "</#{name}>"
+      end
+    end
+  end
+end