RubyGems - sutch-scrubyt - Versions diffs - 0.4.20 - Mend

sutch-scrubyt 0.4.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

data/CHANGELOG +350 -0
data/COPYING +340 -0
data/README +121 -0
data/Rakefile +101 -0
data/lib/scrubyt.rb +45 -0
data/lib/scrubyt/core/navigation/agents/firewatir.rb +253 -0
data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
data/lib/scrubyt/core/scraping/constraint.rb +169 -0
data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
data/lib/scrubyt/core/scraping/pattern.rb +359 -0
data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
data/lib/scrubyt/core/shared/extractor.rb +168 -0
data/lib/scrubyt/logging.rb +154 -0
data/lib/scrubyt/output/post_processor.rb +139 -0
data/lib/scrubyt/output/result.rb +44 -0
data/lib/scrubyt/output/result_dumper.rb +154 -0
data/lib/scrubyt/output/result_node.rb +140 -0
data/lib/scrubyt/output/scrubyt_result.rb +42 -0
data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
data/lib/scrubyt/utils/shared_utils.rb +58 -0
data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
data/lib/scrubyt/utils/xpathutils.rb +202 -0
data/test/blackbox_test.rb +60 -0
data/test/blackbox_tests/basic/multi_root.rb +6 -0
data/test/blackbox_tests/basic/simple.rb +5 -0
data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
metadata +117 -0

data/lib/scrubyt/output/result.rb ADDED Viewed

@@ -0,0 +1,44 @@
+########################################## NOT USED ANY MORE ##########################################
module Scrubyt
  ##
  #=<tt>Represents the results of a pattern</tt>
  #
  #The results are kept in +childmap+: an array of single-key hashes, each one
  #mapping a source (the parent result an extraction started from) to the list
  #of results extracted from that source.
  #
  #It roughly works like this:
  #
  # root
  #   source:         nil
  #   childmap:       [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
  # table
  #   source:         doc1
  #   childmap:       [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
  # row
  #   source:         table1s1, table2s1, table3s1
  #   childmap:       [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row4s1, row5s1]},
  #                     {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row4s2, row5s2]}]
  class Result
    #+instances+ is never assigned anywhere in this class, so the reader
    #returns nil; it is kept only so existing callers do not break.
    attr_reader :childmap, :instances

    def initialize
      @childmap = []
    end

    ##
    #Record +result+ as extracted from +source+.
    #
    #A result already stored for that source is not stored twice. Sources are
    #matched with == against the single key of every childmap entry.
    def add_result(source, result)
      entry = @childmap.find { |candidate| candidate.keys[0] == source }
      if entry
        entry[source] << result unless entry[source].include?(result)
        return
      end
      @childmap << {source => [result]}
    end

    ##
    #Return the list of results stored for the source +last_result+,
    #or nil if nothing was recorded for it.
    def lookup(last_result)
      @childmap.each do |entry|
        entry.each { |source, results| return results if source == last_result }
      end
      nil
    end
  end #end of class Result
end #end of module Scrubyt

data/lib/scrubyt/output/result_dumper.rb ADDED Viewed

@@ -0,0 +1,154 @@
+require 'rexml/document'
+require 'rexml/xpath'
+########################################## NOT USED ANY MORE ##########################################
module Scrubyt
  ##
  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
  #
  #Every method is a class method that takes the pattern whose results should
  #be dumped. The document built by the most recent to_xml call is remembered
  #in @@last_doc; print_statistics reads it, so to_xml (or to_csv / to_hash,
  #which call to_xml) has to run first.
  #
  #The original file put a bare +private+ in front of the helper methods, but
  #that keyword does not affect methods defined with <tt>def self.</tt>, so the
  #helpers have always been publicly callable. The misleading keyword is
  #dropped and their visibility is left untouched for backward compatibility.
  class ResultDumper
    ##
    #Output the results as XML
    #
    #Builds a REXML document with a single <root> element, runs every entry of
    #pattern.last_result through to_xml_recursive, prunes empty leaves and
    #returns the document. pattern.last_result is reassigned while iterating.
    def self.to_xml(pattern)
      doc = REXML::Document.new
      root = REXML::Element.new('root')
      doc.add_element(root)
      [pattern.last_result].flatten.each do |extracted_doc|
        pattern.last_result = extracted_doc
        to_xml_recursive(pattern, root)
      end
      remove_empty_leaves(doc)
      @@last_doc = doc
    end

    ##
    #Remove +node+ if it has neither child elements nor text, then apply the
    #same check to its child elements.
    def self.remove_empty_leaves(node)
      node.remove if node.elements.empty? && node.text.nil?
      node.elements.each { |child| remove_empty_leaves(child) }
    end

    ##
    #Output the text of the pattern; If this pattern is a tree, collect the text from its
    #result instance node; otherwise rely on the last_result
    #TODO: throw this away!!!
    def self.to_text(pattern)
      last_result = pattern.last_result
      pattern.type == :tree ? collect_text(last_result) : last_result
    end

    ##
    #Output the results as comma separated lines, one line per child of <root>.
    #Values are joined verbatim: nothing is quoted or escaped.
    def self.to_csv(pattern)
      flat_csv_inner = lambda { |e, parts|
        content = e.text || ''
        parts << content if (e.is_a?(REXML::Element) && content != '')
        e.children.each { |c| flat_csv_inner.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      rows = []
      to_xml(pattern).root.elements['/root'].each { |e| rows << flat_csv_inner.call(e, []) }
      rows.map { |row| row.join(',') }.join("\n")
    end

    ##
    #Output the results as an array of hashes, one hash per child of <root>,
    #keyed by element name. Several values for the same name are joined by ",".
    def self.to_hash(pattern)
      flat_hash_inner = lambda { |e, parts|
        content = e.text ? REXML::Text.unnormalize(e.text) : ''
        if (e.is_a?(REXML::Element) && content != '')
          if parts[e.local_name]
            parts[e.local_name] = parts[e.local_name] + "," + content
          else
            parts[e.local_name] = content
          end
        end
        e.children.each { |c| flat_hash_inner.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      result = []
      to_xml(pattern).root.elements['/root'].each { |e| result << flat_hash_inner.call(e, {}) }
      result
    end

    ##
    #Print some simple statistics on the extracted results, like the count of extracted
    #instances by each pattern
    def self.print_statistics(pattern)
      puts "\n" * 2
      print_statistics_recursive(pattern, 0)
      puts
    end

    ##
    #Add the results of every child of +pattern+ below +element+.
    def self.to_xml_recursive(pattern, element)
      pattern.children.each do |child|
        childresults = child.result.lookup(child.parent.last_result)
        #Output text for leaf nodes only; Maybe add possibility to customize this later
        if childresults.nil?
          #TODO: is this needed for anything? I guess not! Drop it!!!!!!
          #Update: it seems the blackbox tests are not passing because of this (?)
          #so temporarily adding it back
          parent_result = child.parent.last_result
          res = parent_result.is_a?(String) ? parent_result : collect_text(parent_result)
          #TODO: respond_to should not be used here, it's just a quick workaround
          if (child.parent.respond_to?(:size) && child.parent.size == 0)
            element.text = SharedUtils.unescape_entities(res).strip unless element.parent.is_a? REXML::Document
          end
          next
        end
        generate_children(child, childresults, element)
      end
    end

    ##
    #Create one element named after +child+ per entry of +childresults+ (or a
    #single element holding child.default when +childresults+ is nil), attach
    #it to +element+ and recurse into the children of +child+.
    def self.generate_children(child, childresults, element)
      if childresults.nil?
        child_node = REXML::Element.new(child.name)
        child_node.text = child.default
        element.add_element(child_node)
      else
        childresults.size.times do |num|
          child.last_result = childresults[num]
          current = child.last_result
          res = ""
          if current.instance_of? String
            res = current
          elsif current.respond_to? 'traverse_text'
            res = collect_text(current)
          else
            #No text to collect: hand the raw children over to the parent element
            current.children.each { |c| element.add_element c }
          end
          child_node = REXML::Element.new(child.name)
          child_node.text = SharedUtils.unescape_entities(res).strip if child.write_text
          element.add_element(child_node) if (child.type != :detail_page && child_node.text != '')
          to_xml_recursive(child, child_node)
        end
      end
    end

    ##
    #Log how many elements named after +pattern+ exist in the last generated
    #document, then recurse into the pattern's children with a deeper indent.
    #Detail page patterns are replaced by the children of their related pattern.
    def self.print_statistics_recursive(pattern, depth)
      if pattern.name != 'root'
        if pattern.type == :detail_page
          pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
            print_statistics_recursive(child, depth)
          end
        else
          count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
          Scrubyt.log :INFO, (' ' * depth.to_i) + "#{pattern.name} extracted #{count} instances."
        end
      end
      pattern.children.each do |child|
        print_statistics_recursive(child, depth + 4)
      end
    end #end of method print_statistics_recursive

    #Concatenate every text chunk that node.traverse_text yields.
    #Appends to one buffer instead of allocating a new string per chunk.
    def self.collect_text(node)
      buffer = ''.dup
      node.traverse_text { |t| buffer << t.to_s }
      buffer
    end
    private_class_method :collect_text
  end #end of class ResultDumper
end #end of module Scrubyt

data/lib/scrubyt/output/result_node.rb ADDED Viewed

@@ -0,0 +1,140 @@
module Scrubyt
  ##
  #A node of the result tree. The node is an Array holding its child result
  #nodes; +name+ becomes the tag / hash key, +result+ is either a String or an
  #object answering inner_html, and +options+ may carry :write_text and :default.
  class ResultNode < Array
    OUTPUT_OPTIONS = [:write_text]
    #Separator used internally by to_flat_hash
    FLAT_HASH_DELIMITER = '@@@@@@'

    attr_accessor :name, :result, :options, :generated_by_leaf

    def initialize(name, result=nil, options={})
      @name = name
      @result = result
      @options = options
    end

    ##
    #Whether the text of this node should be written: the :write_text option
    #if it was given, otherwise the generated_by_leaf flag.
    def write_text
      @options[:write_text].nil? ? @generated_by_leaf : @options[:write_text]
    end

    ##
    #True if the result is a String, if this node writes its text, or if any
    #child has content.
    def has_content?
      return true if result.is_a? String
      write_text || any? { |child| child.has_content? }
    end

    ##
    #The text of this node: the result itself if it is a String, otherwise its
    #inner_html with the tags removed; entities unescaped and whitespace
    #stripped. Returns the :default option when it is set and the text is empty.
    def to_s
      raw = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
      #strip instead of strip!, so a String result is never modified in place
      text = SharedUtils.unescape_entities(raw).strip
      if (@options[:default] && ((text == '') || (text == @options[:default])))
        @options[:default]
      else
        text
      end
    end

    def to_libxml
      libxml_node = XML::Node.new(name)
      self.each { |child| libxml_node << child.to_libxml if child.has_content? }
      libxml_node << to_s if write_text
      libxml_node
    end

    #note: see ruby_extensions.rb for String#write
    def to_xml
      to_xml_lines.join("\n")
    end

    ##
    #One hash per direct child, mapping the names (as symbols) of all nodes
    #below that child to their text. Only nodes that write a non-empty text or
    #that have a :default are included; several values for the same name are
    #joined with +delimiter+.
    def to_hash(delimiter=',')
      flat_hash_inner = lambda { |e, hash|
        if ((e.write_text && !e.to_s.empty?) || e.options[:default])
          key = e.name.to_sym
          hash[key] = hash[key] ? hash[key] + delimiter + e.to_s : e.to_s
        end
        e.each { |c| flat_hash_inner.call(c, hash) }
        hash
      }
      result = []
      self.each { |e| result << flat_hash_inner.call(e, {}) }
      result
    end

    ##
    #Like to_hash, but values that to_hash would join into one string are
    #spread over separate hashes (one per value). Returns [] for a node
    #without children (this used to raise NoMethodError).
    def to_flat_hash()
      flatten_records(self.to_hash(FLAT_HASH_DELIMITER), FLAT_HASH_DELIMITER)
    end

    ##
    #Render the records as <item> blocks with one tag per key.
    #
    #With a +delimiter+ the records are flattened the same way to_flat_hash
    #does it; without one, the hashes of to_hash are rendered as they are.
    #The delimiter-less call used to lose the first record, because it was
    #removed from the list for a merge whose outcome was never used.
    def to_flat_xml(delimiter=nil)
      records = delimiter ? flatten_records(self.to_hash(delimiter), delimiter) : self.to_hash
      lines = []
      records.each do |record|
        lines << "<item>"
        record.each do |key, value|
          xml_tag = key.to_s
          value = '' if value == '#empty#'
          lines << "  <#{xml_tag}>#{REXML::Text.normalize(value)}</#{xml_tag}>"
        end
        lines << "</item>"
      end
      lines.join("\n")
    end

    ##
    #The XML representation of this node and its children with content, as an
    #array of lines; nested nodes are indented by two spaces per level.
    #A String result is written as it is, the text of to_s is HTML-escaped.
    def to_xml_lines
      lines = []
      children = self.select { |child| child.has_content? }
      if children.empty?
        if result.is_a? String
          lines << "<#{name}>#{result}</#{name}>"
        elsif write_text && !to_s.empty?
          lines << "<#{name}>#{ERB::Util.html_escape(to_s)}</#{name}>"
        elsif @options[:default]
          lines << "<#{name}>#{@options[:default]}</#{name}>"
        else
          lines << "<#{name}/>"
        end
      else
        lines << "<#{name}>"
        lines << "  #{ERB::Util.html_escape(to_s)}" if write_text && !to_s.empty?
        children.each do |child|
          lines.push(*child.to_xml_lines.map { |line| "  #{line}" })
        end
        lines << "</#{name}>"
      end
      lines
    end

    private

    #Turn the hashes of to_hash into one hash per single value.
    #
    #The keys of the first record define the columns. For every column the
    #values of all records are joined with +delimiter+ and split again, so a
    #value that already holds several delimiter-joined entries contributes
    #several rows. The columns are then transposed into rows; Array#transpose
    #requires all columns to end up with the same number of values.
    def flatten_records(records, delimiter)
      return [] if records.empty?
      keys = records.first.keys
      columns = keys.map do |key|
        records.map { |record| record[key].to_s }.join(delimiter).split(delimiter)
      end
      columns.transpose.map do |row|
        flat = {}
        keys.each_with_index { |key, position| flat[key] = row[position] }
        flat
      end
    end
  end
end

data/lib/scrubyt/output/scrubyt_result.rb ADDED Viewed

@@ -0,0 +1,42 @@
module Scrubyt
  ##
  #The result node returned for a whole extractor; besides the result tree it
  #carries the root patterns and the source file / proc attributes.
  class ScrubytResult < ResultNode
    attr_accessor :root_patterns, :source_file, :source_proc

    ##
    #Return a text describing the extractor tree: a fixed explanation followed
    #by one "[name]" entry per pattern, starting at root_patterns[0]. Patterns
    #of type :tree additionally show the xpath of their first filter. Nested
    #patterns are drawn with "|" and "+-- " connectors, indented by three
    #spaces per level.
    def export
      #Temporary solution; the real one will be back later - or not
      result = <<-EXPLANATION
     === Extractor tree ===
     export() is not working at the moment, due to the removal or ParseTree, ruby2ruby and RubyInline.
     For now, in case you are using examples, you can replace them by hand based on the output below.
     So if your pattern in the learning extractor looks like
     book "Ruby Cookbook"
     and you see the following below:
     [book] /table[1]/tr/td[2]
     then replace "Ruby Cookbook" with "/table[1]/tr/td[2]" (and all the other XPaths) and you are ready!
      EXPLANATION
      tree_builder = lambda do |node, level|
        #The root (level 0) is printed without connector lines
        if level > 0
          indent = "   " * (level - 1)
          result += "#{indent}|\n#{indent}+-- "
        end
        result += "[#{node.name}]"
        result += " #{node.filters[0].xpath}" if node.type == :tree
        result += "\n"
        node.children.each { |c| tree_builder[c, level + 1] }
      end
      tree_builder[root_patterns[0], 0]
      result + "\n"
    end
  end
end

data/lib/scrubyt/utils/compound_example_lookup.rb ADDED Viewed

@@ -0,0 +1,50 @@
module Scrubyt
  #=<tt>Lookup of compound examples</tt>
  #There are two types of string examples in scRUBYt! right now:
  #the simple example and the compound example.
  #
  #This class is responsible for finding elements matched by compound examples.
  #In the future probably more sophisticated matching algorithms will be added
  #(e.g. match the n-th which matches the text, or element that matches the
  #text but also contains a specific attribute etc.)
  #
  #The candidates are kept in a class level instance variable between the
  #steps of one lookup, so lookups must not run concurrently.
  class CompoundExampleLookup
    ##
    #Return the index-th element of +doc+ matched by all descriptors of
    #+compound_example+, or nil if there is none.
    #+next_link+ is accepted for compatibility but not used.
    def self.find_node_from_compund_example(doc, compound_example, next_link=false, index = 0)
      @partial_results = []
      self.lookup_compound_example(doc, compound_example, index)
    end

    class << self
      #Correctly spelled name; the misspelled original stays available.
      alias_method :find_node_from_compound_example, :find_node_from_compund_example
    end

    #The helpers below were preceded by a bare +private+ in the original file;
    #that keyword has no effect on methods defined with def self., so they
    #have always been public and are left that way.

    #Lookup the index-th element which is matched by this compound example
    #
    #A compound example is specified with :contains, :begins_with and
    #:ends_with descriptors - which can be both regexps or strings
    #
    #Example:
    #
    #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
    #
    #The first descriptor collects the candidates from +doc+, every further
    #descriptor only narrows them down. (Checking for an empty candidate list
    #instead, as the code used to, restarted the search on the whole document
    #whenever a descriptor had filtered out every candidate, so the earlier
    #descriptors were ignored.)
    def self.lookup_compound_example(doc, compound_example, index)
      #Never start from the leftovers of a previous lookup
      @partial_results = []
      first_descriptor = true
      compound_example.each do |k, v|
        v = Regexp.escape(v) if v.is_a? String
        case k
          when :contains
            v = /#{v}/
          when :begins_with
            v = /^\s*#{v}/
          when :ends_with
            v = /#{v}\s*$/
        end
        if first_descriptor
          @partial_results = SharedUtils.traverse_for_match(doc, v)
          first_descriptor = false
        else
          refine_partial_results(v)
        end
      end
      @partial_results[index]
    end

    #Keep only the candidates whose inner_html, with the tags removed,
    #matches +regexp+
    def self.refine_partial_results(regexp)
      @partial_results = @partial_results.select { |pr| pr.inner_html.gsub(/<.*?>/, '') =~ regexp }
    end
  end #End of class CompoundExampleLookup
end #End of module Scrubyt

data/lib/scrubyt/utils/ruby_extensions.rb ADDED Viewed

@@ -0,0 +1,85 @@
#Class level macros for attributes that are stored in the @options hash of an
#instance.
class Module
  #Define one reader per key of +key_default_hash+. A reader returns
  #@options[key] unless that is nil; otherwise it returns the default given
  #for the key, and a Proc default is instance_eval'ed on the object first.
  def option_reader(key_default_hash)
    key_default_hash.each do |name, fallback|
      define_method(name) do
        if !@options[name].nil?
          @options[name]
        elsif fallback.is_a? Proc
          instance_eval(&fallback)
        else
          fallback
        end
      end
    end
  end

  #Define a "key=" writer, storing the value in @options[key], for every
  #given key.
  def option_writer(*keys)
    keys.each do |name|
      setter = "#{name.to_s}=".to_sym
      define_method(setter) do |value|
        @options[name] = value
      end
    end
  end

  #Define a reader for +key+ with the given default, plus a writer if
  #+writable+ is set.
  def option(key, default=nil, writable=false)
    option_reader(key => default)
    return unless writable
    option_writer(key)
  end

  #Define a reader and a writer for every key => default pair.
  def option_accessor(key_default_hash)
    key_default_hash.each do |name, fallback|
      option(name, fallback, true)
    end
  end
end
#Ordering and shifting for ranges.
class Range
  #Compare two ranges by their first element only.
  #Returns nil - the usual <=> answer for incomparable objects - if +other+
  #has no +begin+, instead of raising NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:begin)
    self.begin <=> other.begin
  end

  #Shift both ends up by +amount+. An end-exclusive range (a...b) stays
  #end-exclusive; it used to come back as an inclusive range.
  def +(amount)
    Range.new(self.begin + amount, self.end + amount, exclude_end?)
  end

  #Shift both ends down by +amount+, keeping end-exclusiveness.
  def -(amount)
    Range.new(self.begin - amount, self.end - amount, exclude_end?)
  end
end
#Two-argument minimum / maximum helpers on Math.
module Math
  class << self
    #The smaller of +a+ and +b+; +b+ if neither is smaller than the other.
    def min(a, b)
      return a if a < b
      b
    end

    #The bigger of +a+ and +b+; +b+ if neither is bigger than the other.
    def max(a, b)
      return a if a > b
      b
    end
  end
end
+#dec 16: Dropped - causes some errors w/ Rails
+#just some hack here to allow current examples' syntax:
+#table_data.to_xml.write(open('result.xml', 'w'), 1)
+#class String
+#  def write(stringio, add_indent=0)
+#    stringio.write((self.split("\n").collect { |line| ('  ' * add_indent) + line }).join("\n"))
+#  end
+#end
#hack to simulate the ancestor::tag selector of XPath
module Hpricot
  class Elem
    ##
    #Walk up the parent chain, starting with this element itself.
    #
    #Without +tag+, returns an Hpricot::Elements holding this element and all
    #its ancestors, nearest first; the Hpricot::Doc at the top is not included.
    #
    #With +tag+, returns the first element on that path (this element
    #included) whose name equals +tag+. If there is none, the whole path is
    #returned, just like in the call without +tag+.
    def ancestors(tag = nil)
      element = self
      path = Hpricot::Elements.new
      #Also stop at a nil parent: an element that is not attached to a
      #document used to end in a NoMethodError on nil here
      while element && element.class != Hpricot::Doc do
        return element if (tag && (tag == element.name))
        path.push element
        element = element.parent
      end
      path
    end
  end
end