RubyGems - scrubyt - Versions diffs - 0.2.6 → 0.2.8 - Mend

scrubyt 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

data/CHANGELOG +59 -12
data/Rakefile +2 -2
data/lib/scrubyt.rb +24 -6
data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
data/lib/scrubyt/core/scraping/constraint.rb +53 -57
data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
data/lib/scrubyt/core/scraping/pattern.rb +292 -157
data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
data/lib/scrubyt/core/shared/extractor.rb +122 -163
data/lib/scrubyt/output/export.rb +59 -174
data/lib/scrubyt/output/post_processor.rb +4 -3
data/lib/scrubyt/output/result.rb +8 -9
data/lib/scrubyt/output/result_dumper.rb +81 -42
data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
data/lib/scrubyt/utils/shared_utils.rb +39 -26
data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
data/lib/scrubyt/utils/xpathutils.rb +31 -30
data/test/unittests/constraint_test.rb +11 -7
data/test/unittests/extractor_test.rb +6 -6
data/test/unittests/filter_test.rb +66 -66
metadata +22 -15
data/lib/scrubyt/core/scraping/filter.rb +0 -201

data/lib/scrubyt/output/export.rb CHANGED

@@ -1,5 +1,3 @@
-#require File.join(File.dirname(__FILE__), 'pattern.rb')
 module Scrubyt
   # =<tt>exporting previously defined extractors</tt>
   class Export
@@ -15,7 +13,7 @@ module Scrubyt
     #
     #*parameters*
     #
-    #_pattern_ - the root pattern of the extractor. This is the variable 'something' in
+    #_root_pattern_ - the root pattern of the extractor. This is the variable 'something' in
     #such a call:
     #
     #  something = Scrubyt::Extractor.define ...
@@ -63,194 +61,81 @@ module Scrubyt
     #This snippet will export the extractor to a file named 'my_super_camera_extractor.rb'.
     #After running 'my_super_camera_extractor.rb', the result will be dumped to the file
     #'/home/peter/stuff/result.xml'.
-    def self.export(input_file, pattern, output_file_name, extractor_result_file_name)
-      @result = ""
-      contents = open(input_file).read
-      wrapper_name = contents.scan(/\s+(.+)\s+=.*Extractor\.define.*/)
-      output_file = output_file_name == nil ? open("#{wrapper_name}_extractor_export.rb", 'w') :
-                                              open(output_file_name, 'w')
-      export_header(output_file)
-      export_subextractors(contents, pattern, output_file)
-      export_extractor(contents, pattern, output_file)
-      export_footer(output_file, wrapper_name, extractor_result_file_name)
-      cleanup_result
-      output_file.write(@result)
+    def self.export(root_pattern, wrapper_name, output_file_name=nil, extractor_result_file_name=nil)
+      sexp = [:block]
+      sexp << export_header(wrapper_name)
+      sexp << export_extractor(root_pattern, wrapper_name)
+      sexp << export_footer(wrapper_name, extractor_result_file_name)
+      result = RubyToRuby.new.process(sexp)
+      result.gsub! '"' + root_pattern.source_file + '"', '__FILE__'
+      output_file_name ||= "#{wrapper_name}_extractor_export.rb"
+      output_file = open(output_file_name, 'w')
+      output_file.write(result)
       output_file.close
-      @result
+      result
     end
 private
-    def self.export_header(output_file)
-      @result += "require 'rubygems'\nrequire 'scrubyt'\n\n"
-    end
-    def self.cleanup_result
-      @result.gsub!('P.') {}
-      CompoundExample::DESCRIPTORS.each {|d|
-        @result.gsub!(/,\s*:#{d.to_s}.+?'.+?'/) {}
-      }
-    end
-    def self.export_subextractors(contents, pattern, output_file)
-      all_subextractor_code = contents.scan(/.+=\s+lambda.+Extractor\.define/m)
-      return if all_subextractor_code.empty?
-      all_subextractor_code = all_subextractor_code[0].split("\n")
-      pure_subextractor_code = []
-      meaningful_code = false
-      all_subextractor_code.each do |sec|
-        meaningful_code = true if sec =~ /lambda/
-        meaningful_code = false if sec =~ /Extractor.define/
-        pure_subextractor_code << sec if meaningful_code
-      end
-      add_P pure_subextractor_code
-      substitute_examples_with_XPaths(pattern,pure_subextractor_code)
+    def self.create_sexp(code)
+      (ParseTree.new.parse_tree_for_string(code))[0]
     end
-    #OK, I have to admit: this function is powered by woodo magic. A lots of woodoo magic.
-    #Piles of tons of heaps of woodoo magic :-)
-    #
-    #The only reason I can expect it to work is that it passes all the tests of the extractors
-    #I have created so far. However at the same time  I know how to create one easily which
-    #would break the exporting, so don't experiment with this too much...
-    #
-    #The other solutions include:
-    #- serialization (yaml, pstore etc) but that would mess the code terribly up - so
-    #therefore I did not chose this solution.
-    #- defining the block as string - however, this introduces ugly %q{}s etc - all in all,
-    #this is still a more viable solution that serialization IMHO
-    #- a lot of other tricks - however, all of these introduce a lot of noise which I don't
-    #like.
-    #
-    #Conclusion: If there will be no terrible, unrepairable, uncontrollable etc. problems
-    #with this approach, it will be replaced (probably with constructing the extractor as
-    #a string). However, until that point, it will stay.
-    def self.export_extractor(contents, pattern, output_file)
-      first_line = contents.scan(/.*Extractor\.define.*/)
-      #During wrapper construction, we count the number of blocks; add one occurrence of
-      #end (to close the block of the extractor definition)
-      count = pattern.evaluation_context.block_count + 1
-      #Construct the extractor definition matching regexp based on the number of ends
-      definition = contents.scan(/Extractor\.define(?:.*?(?:\}|\s+end)){#{count.to_s}}/m)
-      #Since the regexp matching the extractor definition was multiline, get the first
-      #line separately and patch it in!
-      rows = definition[0].split("\n")
-      add_P(rows)
-      rows[0] = first_line
-      substitute_examples_with_XPaths(pattern,rows)
-    end
-    def self.substitute_examples_with_XPaths(pattern,rows)
-      #@full_definition holds the original definition (at this point, later on it will be
-      #gsub!bed and all)
-      @full_definition = rows.join("\n")
-      #This hash contains all the examples that need to be replaced with their XPath
-      #counterparts;"P.#{name}"
-      #We are relying on the convention that if an example is definied, it is always
-      #the first parameter and it is always a string
-      @name_to_xpath_map = {}
-      create_name_to_xpath_map(pattern)
-      #Replace the examples which are quoted with " and '
-      @name_to_xpath_map.each do |name, xpaths|
-        replace_example_with_xpath(name, xpaths, %q{"})
-        replace_example_with_xpath(name, xpaths, %q{'})
-      end
-      #Finally, add XPaths to pattern which had no example at the beginning (the XPath was
-      #generated from the child patterns)
-      @name_to_xpath_map.each do |name, xpaths|
-        xpaths.reverse.each do |xpath|
-          next if !@full_definition.include? "P.#{name}"
-          comma = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0].sub('do'){}.strip == '' ? '' : ','
-          if (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{')
-            @full_definition.sub!("P.#{name}") {"P.#{name}('#{xpath}')"}
-          else
-            @full_definition.sub!("P.#{name}") {"P.#{name} \"#{xpath}\"#{comma}"}
-          end
-        end
-      end
-      @result += @full_definition
+    def self.export_header(wrapper_name)
+      create_sexp "require 'rubygems'; require 'scrubyt'"
     end
-    def self.export_footer(output_file, wrapper_name, extractor_result_file_name)
+    # Builds the sexp for the last statement of the exported script: the call
+    # that dumps the extractor's XML result.
+    #
+    # _wrapper_name_ - name of the variable holding the extractor in the
+    # exported script.
+    # _extractor_result_file_name_ - when given, the exported script writes its
+    # XML to this path; when nil, it writes to $stdout.
+    #
+    # The path is embedded with String#inspect so that quotes or backslashes in
+    # the file name cannot break the generated Ruby source.
+    def self.export_footer(wrapper_name, extractor_result_file_name)
       if extractor_result_file_name
-        @result += "\n\n#{wrapper_name}.to_xml.write(open('result_of_exported_extractor.xml', 'w'), 1)"
+        create_sexp "#{wrapper_name}.to_xml.write(open(#{extractor_result_file_name.to_s.inspect}, 'w'), 1)"
       else
-        @result += "\n\n#{wrapper_name}.to_xml.write($stdout, 1)"
+        create_sexp "#{wrapper_name}.to_xml.write($stdout, 1)"
       end
     end
-    def self.add_P(rows)
-      #Lord of the hacks :-) Originally, when we have used the class P as a pattern definer,
-      #patterns could be matched very easily from the extractor definition (because they begun
-      #with 'P.'). Now that P has been removed, mimick it!
-      rows.each do |row|
-        #Do not prepend P. to comments and empty lines
-        next if (row.strip =~ /^#/ || row.strip == '')
-        #Do not prepend P. to any of the reserved keywords
-        jump_to_next = false
-        NavigationActions::KEYWORDS.each { |keyword| jump_to_next = true if row.strip =~ /^#{keyword}/ }
-        jump_to_next = true if row =~ /lambda/
-        next if jump_to_next
-        #Prepend P.s - the hairy stuff with eval et al is there to preserve the whitespace
-        row.sub!(/\s+(.*)/) { "#{' ' * eval((row.size - row.lstrip.size ).to_s)} P.#{$1}" }
-        #Don't forget also the stuff in parentheses!
-        row.gsub!(/\{\s+/) {"{P."}
-      end
-    end
-    def self.create_name_to_xpath_map(pattern)
-      puts " Cereating mapping for: #{pattern.name}"
-      @name_to_xpath_map[pattern.name] = []
-      pattern.filters.each do |filter|
-        @name_to_xpath_map[pattern.name] << filter.xpath if pattern.filters[0].xpath != nil
-      end
-      pattern.children.each {|child| create_name_to_xpath_map child}
-      if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
-        puts pattern.name
-        puts "-------"
-        puts pattern.evaluation_context.extractor.get_detail_pattern_relations.each  {|k,v|
-          if k.include? pattern
-            v.parent.children.each do |child|
-              create_name_to_xpath_map child
-            end
+    def self.export_extractor(root_pattern, wrapper_name)
+      # filter actions before and after pattern
+      pre_pattern_sexp = []
+      post_pattern_sexp = []
+      pattern_skipped = false
+      actions = ['next_page', *NavigationActions::KEYWORDS]
+      root_pattern.source_proc.to_sexp[3][1..-1].each do |sexp|
+        get_call = lambda { |sexp|
+          if sexp[0] == :fcall
+            return sexp[1].to_s
+          elsif sexp[0] == :iter || sexp[0] == :call
+            return get_call.call(sexp[1])
+          else
+            return nil
           end
         }
-        #pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
-        #  create_name_to_xpath_map child
-        #end
+        call = get_call.call(sexp)
+        if(call.nil? || actions.index(call) != nil)
+          if !pattern_skipped
+            pre_pattern_sexp.push(sexp)
+          else
+            post_pattern_sexp.push(sexp)
+          end
+        else
+          raise "Second pattern tree found while exporting." if pattern_skipped
+          pattern_skipped = true
+        end
       end
+      # build extractor content
+      inner_block = [:block]
+      inner_block.push([:block, *pre_pattern_sexp])
+      inner_block.push([:block, export_pattern(root_pattern)])
+      inner_block.push([:block, *post_pattern_sexp])
+      # build extractor
+      [:block, [:lasgn, wrapper_name, [:iter, [:call, [:colon2, [:const, :Scrubyt], :Extractor], :define], nil, inner_block]]]
     end
-    def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
-      return if name=='root'
-      return if !@full_definition.include? "P.#{name}"
-      parens = @full_definition.scan(/P.#{name}\s*\((.+?)\)/)
-      if parens.empty?
-        full_line = @full_definition.scan(/P.#{name}\W(.+)$/)[0][0]
-      else
-        full_line = parens[0][0]
-      end
-      examples = full_line.split(",")
-      examples.reject! {|exa| exa.strip!;  exa[0..0] != %q{"} && exa[0..0] != %q{'} }
-      all_xpaths = ""
-      examples.each do |e|
-        index = examples.index(e)
-        xpath = xpaths[index]
-        return if xpath == nil
-        all_xpaths += ", " if index > 0
-        all_xpaths += '"' + xpath + '"'
-      end
-      replacing_xpath = full_line.include?('{') ? "P.#{name}('#{all_xpaths}')" :
-                                                  "P.#{name} #{all_xpaths}"
-      optional_paren_escaped = parens.empty? ? '' : '\('
-      optional_paren = parens.empty? ? '' : '('
-      @full_definition.sub!(/P\.#{name}\s*#{optional_paren_escaped}#{left_delimiter}(.*)#{right_delimiter}/) do
-        @name_to_xpath_map.delete("#{name}")
-        optional_paren + replacing_xpath
-      end
+    def self.export_pattern(root_pattern)
+      root_pattern.children[0].to_sexp
     end
   end
 end

data/lib/scrubyt/output/post_processor.rb CHANGED

@@ -56,7 +56,7 @@ require 'set'
 private
     def self.ensure_presence_of_pattern(pattern)
       #holds the name of those child patterns which have to be present as children of the input parameter
-      epop_names = pattern.get_constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
+      epop_names = pattern.constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
       return if epop_names.empty?
       #all_parent_values holds instances extracted by pattern
       all_parent_values = []
@@ -95,8 +95,9 @@ private
     end
     def self.check_ancestors(parent_value, all_child_values)
-      parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
-      parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
+      parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child } if
+parent_value.is_a? Hpricot::Elem
+      parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem } if parent_value.is_a? Hpricot::Elem
     end
     def self.remove_multiple_filter_duplicates_intern(pattern)

data/lib/scrubyt/output/result.rb CHANGED

@@ -1,24 +1,23 @@
-module Scrubyt
+module Scrubyt
   ##
   #=<tt>Represents the results of a pattern</tt>
   class Result
     attr_reader :childmap, :instances
     def initialize
       @childmap ||= []
     end
     def add_result(source, result)
       @childmap.each do |hash|
         if hash.keys[0] == source
-          return if hash[source] == nil
           hash[source] << result if !hash[source].include? result
           return
         end
       end
-      @childmap << {source => [result]}
+      @childmap << {source => [result]}
     end
     def lookup(last_result)
       @childmap.each do |hashes|
         hashes.each { |key, value| return value if (key == last_result) }
@@ -33,12 +32,12 @@ end#end of module Scrubyt
   # root
   # source:         nil
   # childmap:       [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
   #table
   #  source:         doc1
   #  childmap        [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
   #row
   #  source:         table1s1, table2s1, table3s1
   #  childmap:       [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row3s1, row5s1]},
-  #                    {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row3s2, row5s2]}]
+  #                    {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row3s2, row5s2]}]

data/lib/scrubyt/output/result_dumper.rb CHANGED

@@ -12,36 +12,69 @@ module Scrubyt
       root = REXML::Element.new('root')
       doc.add_element(root)
       all_extracted_docs = pattern.last_result
-      all_extracted_docs.each do |lr|
+      [all_extracted_docs].flatten.each do |lr|
         pattern.last_result = lr
-        to_xml_recursive(pattern, root)
+        to_xml_recursive(pattern, root)
       end
       remove_empty_leaves(doc)
       @@last_doc = doc
     end
     def self.remove_empty_leaves(node)
       node.remove if  node.elements.empty? && node.text == nil
       node.elements.each {|child| remove_empty_leaves child }
     end
     ##
     #Output the text of the pattern; If this pattern is a tree, collect the text from its
     #result instance node; otherwise rely on the last_result
+    #TODO: throw this away!!!
     def self.to_text(pattern)
-       last_result = pattern.last_result
-       result = ""
-       if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_TREE
-         last_result.traverse_text { |t| result += t.to_s }
-       else
-         result = last_result
-       end
-       result
+      last_result = pattern.last_result
+      result = ""
+      if pattern.type == :tree
+        last_result.traverse_text { |t| result += t.to_s }
+      else
+        result = last_result
+      end
+      result
+    end
+    ##
+    #Flatten the extracted result into CSV-like text.
+    #
+    #The result is first rendered with to_xml; every child of the '/root'
+    #element becomes one output row. A row holds the non-empty text of that
+    #element followed by the non-empty text of its element descendants
+    #(collected depth-first). Fields are joined with ',' and rows with "\n".
+    #
+    #NOTE: fields are not quoted or escaped, so a value containing a comma or
+    #a newline is emitted as-is.
+    def self.to_csv(pattern)
+      result = []
+      flat_csv_inner = lambda {|e, parts|
+        content = e.text || ''
+        parts << content if ((e.is_a? REXML::Element) && content != '')
+        #Recurse with this lambda itself (flat_hash_inner is a local of to_hash and not visible here)
+        e.children.each {|c| flat_csv_inner.call(c, parts) if c.is_a? REXML::Element }
+        parts
+      }
+      to_xml(pattern).root.elements['/root'].each {|e| result << flat_csv_inner.call(e, []) }
+      (result.map! {|a| a.join(',')}).join("\n")
+    end
+    def self.to_hash(pattern)
+      result = []
+      flat_hash_inner = lambda {|e, parts|
+        content = e.text || ''
+        if ((e.is_a? REXML::Element) && content != '')
+          if parts[e.local_name]
+            parts[e.local_name] = parts[e.local_name] + "," + content
+          else
+            parts[e.local_name] = content
+          end
+        end
+        e.children.each {|c| flat_hash_inner.call(c, parts) if c.is_a? REXML::Element }
+        parts
+      }
+      to_xml(pattern).root.elements['/root'].each {|e| result << flat_hash_inner.call(e, {}) }
+      result
     end
     ##
-    #Print some simple statistics on the extracted results, like the count of extracted
-    #instances by each pattern
+    #Print some simple statistics on the extracted results, like the count of extracted
+    #instances by each pattern
     def self.print_statistics(pattern)
       puts "\n" * 2
       print_statistics_recursive(pattern,0)
@@ -54,20 +87,34 @@ private
         childresults = child.result.lookup(child.parent.last_result)
         #Output text for leaf nodes only; Maybe add possibility to customize this later
         if (childresults == nil)
+##TODO: is this needed for anything? I guess not! Drop it!!!!!!
+#Update: it seems the blackbox tests are not passing because of this (?) so temporarily adding it back
+##=begin
           res = ""
-          child.parent.last_result.traverse_text { |t| res += t.to_s }
-          if (child.parent.size == 0)
-            element.text = (res.gsub('&nbsp;'){' '}).strip unless element.parent.is_a? REXML::Document
+          if child.parent.last_result.is_a? String
+            res = child.parent.last_result
+          else
+            child.parent.last_result.traverse_text { |t| res += t.to_s }
+          end
+          if (child.parent.respond_to?(:size) && child.parent.size == 0) #TODO: respond_to should not be used here, it's just a quick workaround
+            element.text = SharedUtils.unescape_entities(res).strip unless element.parent.is_a? REXML::Document
           end
           next
+##=end
         end
         generate_children(child, childresults, element)
       end
     end
     def self.generate_children(child, childresults, element)
+      if childresults == nil
+        child_node = REXML::Element.new(child.name)
+        child_node.text = child.default
+        element.add_element(child_node)
+      else
         childresults.size.times do |num|
-          child.last_result = childresults[num]
+          child.last_result = childresults[num]
           res = ""
           if child.last_result.instance_of? String
             res = child.last_result
@@ -78,37 +125,29 @@ private
               child.last_result.children.each { |c| element.add_element c }
             end
           end
-          child_node = REXML::Element.new(child.name)
-          child_node.text = (res.gsub('&nbsp;'){' '}).strip if write_text_criteria_met(child)
-          element.add_element(child_node) if child.type != Scrubyt::Pattern::PATTERN_TYPE_DETAIL
+          child_node = REXML::Element.new(child.name)
+          child_node.text = SharedUtils.unescape_entities(res).strip if child.write_text
+          element.add_element(child_node) if (child.type != :detail_page && child_node.text != '')
           to_xml_recursive(child, child_node)
-        end
-    end
-    def self.write_text_criteria_met(pattern)
-      if (pattern.write_text == nil)
-        return pattern.children.size == 0
-      else
-        pattern.write_text
+        end
       end
     end
     def self.print_statistics_recursive(pattern, depth)
-      if pattern.name != 'root'
-        if pattern.type == Scrubyt::Pattern::PATTERN_TYPE_DETAIL
+      if pattern.name != 'root'
+        if pattern.type == :detail_page
           pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
             print_statistics_recursive(child, depth)
-        end
+          end
         else
-          count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
+          count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
           puts((' ' * "#{depth}".to_i) +  "#{pattern.name} extracted #{count} instances.")
         end
       end
       pattern.children.each do |child|
         print_statistics_recursive(child, depth + 4)
-      end
-    end#end of method print_statistics_recursive
-  end #end of class ResultDumper
-end #end of module Scrubyt
+      end
+      end#end of method print_statistics_recursive
+    end #end of class ResultDumper
+  end #end of module Scrubyt