scrubyt 0.1.0 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +34 -0
- data/COPYING +340 -0
- data/README +34 -5
- data/Rakefile +6 -5
- data/lib/scrubyt.rb +1 -0
- data/lib/scrubyt/constraint.rb +12 -24
- data/lib/scrubyt/constraint_adder.rb +3 -17
- data/lib/scrubyt/export.rb +33 -17
- data/lib/scrubyt/extractor.rb +74 -23
- data/lib/scrubyt/filter.rb +52 -37
- data/lib/scrubyt/pattern.rb +74 -30
- data/lib/scrubyt/post_processor.rb +58 -0
- data/lib/scrubyt/result.rb +2 -2
- data/lib/scrubyt/result_dumper.rb +6 -0
- data/lib/scrubyt/xpathutils.rb +52 -15
- data/test/unittests/constraint_test.rb +0 -3
- data/test/unittests/extractor_test.rb +11 -13
- data/test/unittests/xpathutils_test.rb +31 -31
- metadata +8 -5
    
        data/lib/scrubyt.rb
    CHANGED
    
    
    
        data/lib/scrubyt/constraint.rb
    CHANGED
    
    | @@ -36,12 +36,11 @@ module Scrubyt | |
| 36 36 | 
             
                #2b) Do it on the XML level - most probably this solution will be implemented
         | 
| 37 37 |  | 
| 38 38 | 
             
                # Different constraint types
         | 
| 39 | 
            -
                 | 
| 40 | 
            -
                 | 
| 41 | 
            -
                 | 
| 42 | 
            -
                 | 
| 43 | 
            -
                 | 
| 44 | 
            -
                CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 5
         | 
| 39 | 
            +
                CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN = 0
         | 
| 40 | 
            +
                CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE = 1
         | 
| 41 | 
            +
                CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE = 2
         | 
| 42 | 
            +
                CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ANCESTOR_NODE = 3
         | 
| 43 | 
            +
                CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_NODE = 4
         | 
| 45 44 |  | 
| 46 45 |  | 
| 47 46 | 
             
                attr_reader :type, :target, :parent_filter
         | 
| @@ -52,22 +51,12 @@ module Scrubyt | |
| 52 51 | 
             
                #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
         | 
| 53 52 | 
             
                #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
         | 
| 54 53 | 
             
                #(just by looking at the wrapper model, the ancestor pattern is always present)
         | 
| 55 | 
            -
                # | 
| 56 | 
            -
                 | 
| 57 | 
            -
             | 
| 54 | 
            +
                #Note that from this type of constraint there is no 'ensure_absence' version, since
         | 
| 55 | 
            +
                #I could not think about an use case for that 
         | 
| 56 | 
            +
                def self.add_ensure_presence_of_pattern(parent_filter, ancestor)
         | 
| 57 | 
            +
                  Constraint.new(parent_filter, ancestor, CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN)
         | 
| 58 58 | 
             
                end
         | 
| 59 59 |  | 
| 60 | 
            -
                #Add 'ensure presence of ancestor pattern' constraint
         | 
| 61 | 
            -
                
         | 
| 62 | 
            -
                #If this type of constraint is added to a pattern, it must NOT have an ancestor pattern
         | 
| 63 | 
            -
                #(child pattern, or child pattern of a child pattern, etc.) denoted by "ancestor"
         | 
| 64 | 
            -
                #'Has an ancestor pattern' means that the ancestor pattern actually extracts something
         | 
| 65 | 
            -
                #(just by looking at the wrapper model, the ancestor pattern is always present)
         | 
| 66 | 
            -
                #ON result level!!!
         | 
| 67 | 
            -
                def self.add_ensure_absence_of_ancestor_pattern(parent_filter, ancestor)
         | 
| 68 | 
            -
                  Constraint.new(parent_filter, ancestor, CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_PATTERN)
         | 
| 69 | 
            -
                end  
         | 
| 70 | 
            -
                
         | 
| 71 60 | 
             
                #Add 'ensure absence of attribute' constraint
         | 
| 72 61 |  | 
| 73 62 | 
             
                #If this type of constraint is added to a pattern, the HTML node it targets
         | 
| @@ -127,10 +116,9 @@ module Scrubyt | |
| 127 116 | 
             
                #content of the pattern
         | 
| 128 117 | 
             
                def check(result)
         | 
| 129 118 | 
             
                  case @type
         | 
| 130 | 
            -
                     | 
| 131 | 
            -
             | 
| 132 | 
            -
             | 
| 133 | 
            -
                      puts "CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ANCESTOR_PATTERN"
         | 
| 119 | 
            +
                    #checked after evaluation, so here always return true
         | 
| 120 | 
            +
                    when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN
         | 
| 121 | 
            +
                      return true
         | 
| 134 122 | 
             
                    when CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_ATTRIBUTE
         | 
| 135 123 | 
             
                      attribute_present(result)
         | 
| 136 124 | 
             
                    when CONSTRAINT_TYPE_ENSURE_ABSENCE_OF_ATTRIBUTE
         | 
| @@ -11,15 +11,8 @@ module Scrubyt | |
| 11 11 | 
             
              #functions with their documentation in Scrubyt::Constraint.rb
         | 
| 12 12 | 
             
              class ConstraintAdder
         | 
| 13 13 |  | 
| 14 | 
            -
                def self. | 
| 15 | 
            -
                   | 
| 16 | 
            -
                  pattern.filters[0].ensure_presence_of_ancestor_pattern(ancestor_node_name)
         | 
| 17 | 
            -
                  pattern #To make chaining possible
         | 
| 18 | 
            -
                end
         | 
| 19 | 
            -
                
         | 
| 20 | 
            -
                def self.ensure_absence_of_ancestor_pattern(pattern, ancestor_node_name)
         | 
| 21 | 
            -
                  data = self.prepare_ensure_ancestor_pattern(pattern, sym_root, sym_ancestor)
         | 
| 22 | 
            -
                  pattern.filters[0].ensure_absence_of_ancestor_pattern(ancestor_node_name)
         | 
| 14 | 
            +
                def self.ensure_presence_of_pattern(pattern, ancestor_node_name)    
         | 
| 15 | 
            +
                  pattern.filters[0].ensure_presence_of_pattern(ancestor_node_name)
         | 
| 23 16 | 
             
                  pattern #To make chaining possible
         | 
| 24 17 | 
             
                end
         | 
| 25 18 |  | 
| @@ -74,13 +67,6 @@ private | |
| 74 67 | 
             
                    end
         | 
| 75 68 | 
             
                  end
         | 
| 76 69 | 
             
                  return attribute_pairs
         | 
| 77 | 
            -
                end | 
| 78 | 
            -
                
         | 
| 79 | 
            -
                def self.prepare_ensure_ancestor_pattern(pattern, root, ancestor)
         | 
| 80 | 
            -
                  context_pattern = find_by_name(pattern.root_pattern, root)
         | 
| 81 | 
            -
                  target_pattern = find_by_name(pattern.root_pattern, ancestor)
         | 
| 82 | 
            -
                  return [context_pattern, target_pattern]    
         | 
| 83 | 
            -
                end
         | 
| 84 | 
            -
              
         | 
| 70 | 
            +
                end #end of method prepare_attributes      
         | 
| 85 71 | 
             
              end #end of class ConstraintAddere
         | 
| 86 72 | 
             
            end #end of module Scrubyt
         | 
    
        data/lib/scrubyt/export.rb
    CHANGED
    
    | @@ -80,7 +80,7 @@ module Scrubyt | |
| 80 80 |  | 
| 81 81 | 
             
            private
         | 
| 82 82 | 
             
                def self.export_header(output_file)
         | 
| 83 | 
            -
                  @result += "require ' | 
| 83 | 
            +
                  @result += "require 'rubygems'\nrequire 'scrubyt'\n\n"
         | 
| 84 84 | 
             
                end
         | 
| 85 85 |  | 
| 86 86 | 
             
                def self.cleanup_result
         | 
| @@ -142,19 +142,21 @@ private | |
| 142 142 | 
             
                  @name_to_xpath_map = {}
         | 
| 143 143 | 
             
                  create_name_to_xpath_map(pattern)
         | 
| 144 144 | 
             
                  #Replace the examples which are quoted with " and '
         | 
| 145 | 
            -
                  @name_to_xpath_map.each do |name,  | 
| 146 | 
            -
                    replace_example_with_xpath(name,  | 
| 147 | 
            -
                    replace_example_with_xpath(name,  | 
| 145 | 
            +
                  @name_to_xpath_map.each do |name, xpaths| 
         | 
| 146 | 
            +
                    replace_example_with_xpath(name, xpaths, %q{"})
         | 
| 147 | 
            +
                    replace_example_with_xpath(name, xpaths, %q{'})
         | 
| 148 148 | 
             
                  end
         | 
| 149 149 | 
             
                  #Finally, add XPaths to pattern which had no example at the beginning (the XPath was
         | 
| 150 150 | 
             
                  #generated from the child patterns
         | 
| 151 | 
            -
                  @name_to_xpath_map.each do |name,  | 
| 152 | 
            -
                     | 
| 153 | 
            -
             | 
| 154 | 
            -
                      @full_definition. | 
| 155 | 
            -
             | 
| 156 | 
            -
                       | 
| 157 | 
            -
             | 
| 151 | 
            +
                  @name_to_xpath_map.each do |name, xpaths| 
         | 
| 152 | 
            +
                    xpaths.each do |xpath|
         | 
| 153 | 
            +
                      comma = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0].sub('do'){}.strip == '' ? '' : ','
         | 
| 154 | 
            +
                      if (@full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]).include?('{')
         | 
| 155 | 
            +
                        @full_definition.sub!("P.#{name}") {"P.#{name}('#{xpath}')"}
         | 
| 156 | 
            +
                      else
         | 
| 157 | 
            +
                        @full_definition.sub!("P.#{name}") {"P.#{name} \"#{xpath}\"#{comma}"}
         | 
| 158 | 
            +
                      end
         | 
| 159 | 
            +
                    end
         | 
| 158 160 | 
             
                  end
         | 
| 159 161 | 
             
                  @result += @full_definition
         | 
| 160 162 | 
             
                end    
         | 
| @@ -169,18 +171,32 @@ private | |
| 169 171 |  | 
| 170 172 |  | 
| 171 173 | 
             
                def self.create_name_to_xpath_map(pattern)
         | 
| 172 | 
            -
                  @name_to_xpath_map[pattern.name] =  | 
| 174 | 
            +
                  @name_to_xpath_map[pattern.name] = []
         | 
| 175 | 
            +
                  pattern.filters.each do |filter|
         | 
| 176 | 
            +
                    @name_to_xpath_map[pattern.name] << filter.xpath if pattern.filters[0].xpath != nil
         | 
| 177 | 
            +
                  end
         | 
| 173 178 | 
             
                  pattern.children.each {|child| create_name_to_xpath_map child}      
         | 
| 174 179 | 
             
                end
         | 
| 175 180 |  | 
| 176 | 
            -
                def self.replace_example_with_xpath(name,  | 
| 177 | 
            -
                   | 
| 178 | 
            -
             | 
| 179 | 
            -
             | 
| 181 | 
            +
                def self.replace_example_with_xpath(name, xpaths, left_delimiter, right_delimiter=left_delimiter)
         | 
| 182 | 
            +
                  return if name=='root'
         | 
| 183 | 
            +
                  full_line = @full_definition.scan(Regexp.new("P.#{name}(.+)$"))[0][0]
         | 
| 184 | 
            +
                  examples = full_line.split(",")
         | 
| 185 | 
            +
                  examples.reject! {|exa| exa.strip!;  exa[0..0] != %q{"} && exa[0..0] != %q{'} }
         | 
| 186 | 
            +
                  all_xpaths = ""
         | 
| 187 | 
            +
                  examples.each do |e|
         | 
| 188 | 
            +
                    index = examples.index(e)
         | 
| 189 | 
            +
                    xpath = xpaths[index]
         | 
| 190 | 
            +
                    return if xpath == nil
         | 
| 191 | 
            +
                    all_xpaths += ", " if index > 0
         | 
| 192 | 
            +
                    all_xpaths += '"' + xpath + '"'
         | 
| 193 | 
            +
                  end
         | 
| 194 | 
            +
                  replacing_xpath = full_line.include?('{') ? "P.#{name}('#{all_xpaths}')" :
         | 
| 195 | 
            +
                                                              "P.#{name} #{all_xpaths}"
         | 
| 180 196 | 
             
                  @full_definition.sub!(/P\.#{name}\s+#{left_delimiter}(.*)#{right_delimiter}/) do
         | 
| 181 197 | 
             
                    @name_to_xpath_map.delete("#{name}")
         | 
| 182 198 | 
             
                    replacing_xpath
         | 
| 183 | 
            -
                  end
         | 
| 199 | 
            +
                  end                                                  
         | 
| 184 200 | 
             
                end
         | 
| 185 201 |  | 
| 186 202 | 
             
              end
         | 
    
        data/lib/scrubyt/extractor.rb
    CHANGED
    
    | @@ -4,6 +4,7 @@ require 'rubygems' | |
| 4 4 | 
             
            require 'mechanize'
         | 
| 5 5 | 
             
            require 'hpricot'
         | 
| 6 6 | 
             
            require 'pp'
         | 
| 7 | 
            +
            require 'set'
         | 
| 7 8 |  | 
| 8 9 | 
             
            module Scrubyt
         | 
| 9 10 | 
             
            ##
         | 
| @@ -43,6 +44,8 @@ module Scrubyt | |
| 43 44 | 
             
                  else 
         | 
| 44 45 | 
             
                    evaluate_wrapper(root_pattern)
         | 
| 45 46 | 
             
                  end
         | 
| 47 | 
            +
                  ensure_all_postconditions(root_pattern)
         | 
| 48 | 
            +
                  PostProcessor.remove_multiple_filter_duplicates(root_pattern)
         | 
| 46 49 | 
             
                  #Return the root pattern
         | 
| 47 50 | 
             
                  root_pattern
         | 
| 48 51 | 
             
                end
         | 
| @@ -104,39 +107,35 @@ module Scrubyt | |
| 104 107 |  | 
| 105 108 | 
             
              ##
         | 
| 106 109 | 
             
              #Action to fetch a document (either a file or a http address)
         | 
| 107 | 
            -
              #
         | 
| 110 | 
            +
              # 
         | 
| 108 111 | 
             
              #*parameters*
         | 
| 109 112 | 
             
              #
         | 
| 110 113 | 
             
              #_doc_url_ - the url or file name to fetch
         | 
| 111 114 | 
             
              def self.fetch(doc_url, mechanize_doc=nil)
         | 
| 112 | 
            -
                puts "fetching: #{doc_url}"
         | 
| 113 115 | 
             
                if (mechanize_doc == nil)
         | 
| 114 116 | 
             
                  @@current_doc_url = doc_url    
         | 
| 115 117 | 
             
                  @@current_doc_protocol = ((doc_url =~ /^http/ || doc_url =~ /^www/) ? :http : :file)
         | 
| 116 118 | 
             
                  if @@base_dir == nil
         | 
| 117 119 | 
             
                    @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == :file
         | 
| 118 | 
            -
                  else
         | 
| 120 | 
            +
                  else        
         | 
| 119 121 | 
             
                    @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
         | 
| 120 122 | 
             
                  end
         | 
| 121 123 |  | 
| 122 | 
            -
                  if @@host_name  | 
| 123 | 
            -
                    if  | 
| 124 | 
            -
                      @@ | 
| 125 | 
            -
                      @@ | 
| 126 | 
            -
                    end
         | 
| 127 | 
            -
                  else
         | 
| 128 | 
            -
                    @@current_doc_url = (@@host_name + doc_url) if doc_url !~ /#{@@host_name}/
         | 
| 124 | 
            +
                  if @@host_name != nil
         | 
| 125 | 
            +
                    if doc_url !~ /#{@@host_name}/
         | 
| 126 | 
            +
                      @@current_doc_url = (@@host_name + doc_url) 
         | 
| 127 | 
            +
                      @@current_doc_url.gsub!(/([^:])\/\//) {"#{$1}/"}
         | 
| 128 | 
            +
                    end  
         | 
| 129 129 | 
             
                  end
         | 
| 130 | 
            -
                  
         | 
| 130 | 
            +
                  puts "[ACTION] fetching document: #{@@current_doc_url}"
         | 
| 131 131 | 
             
                  @@mechanize_doc = @@agent.get(@@current_doc_url) if @@current_doc_protocol == :http
         | 
| 132 132 | 
             
                else
         | 
| 133 133 | 
             
                  @@current_doc_url = doc_url
         | 
| 134 134 | 
             
                  @@mechanize_doc = mechanize_doc
         | 
| 135 | 
            +
                  @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0]
         | 
| 136 | 
            +
                  @@host_name = doc_url if @@host_name == nil    
         | 
| 135 137 | 
             
                end
         | 
| 136 | 
            -
                @@hpricot_doc =  | 
| 137 | 
            -
                out = open('kamaty.html', 'w')
         | 
| 138 | 
            -
                out.write @@hpricot_doc.to_s
         | 
| 139 | 
            -
                out.close
         | 
| 138 | 
            +
                @@hpricot_doc = Hpricot(open(@@current_doc_url))#.to_original_html
         | 
| 140 139 | 
             
              end
         | 
| 141 140 |  | 
| 142 141 | 
             
              ##
         | 
| @@ -149,7 +148,7 @@ module Scrubyt | |
| 149 148 | 
             
              #
         | 
| 150 149 | 
             
              #_query_string_ - the string that should be entered into the textfield
         | 
| 151 150 | 
             
              def self.fill_textfield(textfield_name, query_string)
         | 
| 152 | 
            -
                puts  | 
| 151 | 
            +
                puts "[ACTION] typing #{query_string} into the textfield named '#{textfield_name}'"
         | 
| 153 152 | 
             
                textfield = (@@hpricot_doc/"input[@name=#{textfield_name}]").map()[0]
         | 
| 154 153 | 
             
                formname = Scrubyt::XPathUtils.traverse_up_until_name(textfield, 'form').attributes['name']
         | 
| 155 154 | 
             
                @@current_form = @@mechanize_doc.forms.with.name(formname).first
         | 
| @@ -158,16 +157,16 @@ module Scrubyt | |
| 158 157 |  | 
| 159 158 | 
             
              #Submit the last form; 
         | 
| 160 159 | 
             
              def self.submit    
         | 
| 161 | 
            -
                puts ' | 
| 160 | 
            +
                puts '[ACTION] submitting form...'
         | 
| 162 161 | 
             
                result_page = @@agent.submit(@@current_form)#, @@current_form.buttons.first)
         | 
| 163 162 | 
             
                @@current_doc_url = result_page.uri.to_s
         | 
| 164 163 | 
             
                fetch(@@current_doc_url, result_page)
         | 
| 165 164 | 
             
              end
         | 
| 166 165 |  | 
| 167 | 
            -
              def self.click_link(link_text) | 
| 168 | 
            -
                puts  | 
| 169 | 
            -
                puts /^#{Regexp.escape(link_text)}$/
         | 
| 170 | 
            -
                p /^#{Regexp.escape(link_text)}$/
         | 
| 166 | 
            +
              def self.click_link(link_text)
         | 
| 167 | 
            +
                puts "[ACTION] clicking link: #{link_text}"
         | 
| 168 | 
            +
                #puts /^#{Regexp.escape(link_text)}$/
         | 
| 169 | 
            +
                #p /^#{Regexp.escape(link_text)}$/
         | 
| 171 170 | 
             
                link = @@mechanize_doc.links.text(/^#{Regexp.escape(link_text)}$/)
         | 
| 172 171 | 
             
                result_page = @@agent.click(link)
         | 
| 173 172 | 
             
                @@current_doc_url = result_page.uri.to_s
         | 
| @@ -178,10 +177,62 @@ module Scrubyt | |
| 178 177 | 
             
            #############  
         | 
| 179 178 |  | 
| 180 179 | 
             
            private
         | 
| 180 | 
            +
              def self.ensure_all_postconditions(pattern)
         | 
| 181 | 
            +
                ensure_postconditions(pattern)
         | 
| 182 | 
            +
                pattern.children.each {|child| ensure_all_postconditions(child)}
         | 
| 183 | 
            +
              end
         | 
| 184 | 
            +
              
         | 
| 185 | 
            +
              def self.ensure_postconditions(pattern)
         | 
| 186 | 
            +
                #holds the name of those child patterns which have to be present as children of the input parameter  
         | 
| 187 | 
            +
                epop_names = pattern.get_constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
         | 
| 188 | 
            +
                return if epop_names.empty?
         | 
| 189 | 
            +
                #all_parent_values holds instances extracted by pattern    
         | 
| 190 | 
            +
                all_parent_values = []
         | 
| 191 | 
            +
                pattern.result.childmap.each { |h| all_parent_values << h.values }
         | 
| 192 | 
            +
                all_parent_values.flatten!
         | 
| 193 | 
            +
                #indices of result instances (of pattern) we are going to remove
         | 
| 194 | 
            +
                results_to_remove = Set.new
         | 
| 195 | 
            +
                pattern.children.each do |child_pattern|
         | 
| 196 | 
            +
                  #all_child_values holds instances extracted by child_pattern
         | 
| 197 | 
            +
                  all_child_values = []
         | 
| 198 | 
            +
                  child_pattern.result.childmap.each { |h| all_child_values << h.values }
         | 
| 199 | 
            +
                  all_child_values.flatten!
         | 
| 200 | 
            +
                
         | 
| 201 | 
            +
                  #populate results_to_remove
         | 
| 202 | 
            +
                  i = 0      
         | 
| 203 | 
            +
                  all_parent_values.each do |parent_value|
         | 
| 204 | 
            +
                    #Hey! Not just the direct children but all the ancestors
         | 
| 205 | 
            +
                    @found_ancestor = false
         | 
| 206 | 
            +
                    check_ancestors(parent_value, all_child_values)
         | 
| 207 | 
            +
                    
         | 
| 208 | 
            +
                    results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
         | 
| 209 | 
            +
                    i += 1
         | 
| 210 | 
            +
                  end
         | 
| 211 | 
            +
                end   
         | 
| 212 | 
            +
                #based on results_to_remove, populate the array 'rejected' which holds the actual instances
         | 
| 213 | 
            +
                #(and not indices, as in the case of results_to_remove!). In other words, we are mapping 
         | 
| 214 | 
            +
                #results_to_remove indices to their actual instances
         | 
| 215 | 
            +
                rejected = []    
         | 
| 216 | 
            +
                i = -1 
         | 
| 217 | 
            +
                pattern.result.childmap.each do |h|
         | 
| 218 | 
            +
                  h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
         | 
| 219 | 
            +
                end    
         | 
| 220 | 
            +
                
         | 
| 221 | 
            +
                #Correct the statistics
         | 
| 222 | 
            +
                pattern.get_instance_count[pattern.name] -= rejected.size
         | 
| 223 | 
            +
                
         | 
| 224 | 
            +
                #Finally, do the actual delete!
         | 
| 225 | 
            +
                pattern.result.childmap.each { |h| h.each { |k,v| rejected.each  { |r| v.delete(r)} } }    
         | 
| 226 | 
            +
              end
         | 
| 227 | 
            +
              
         | 
| 228 | 
            +
              def self.check_ancestors(parent_value, all_child_values)
         | 
| 229 | 
            +
                parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
         | 
| 230 | 
            +
                parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
         | 
| 231 | 
            +
              end
         | 
| 232 | 
            +
              
         | 
| 181 233 | 
             
                def self.evaluate_wrapper(pattern)
         | 
| 182 234 | 
             
                  pattern.evaluate
         | 
| 183 235 | 
             
                  pattern.children.each { |child| evaluate_wrapper child } 
         | 
| 184 | 
            -
                end
         | 
| 185 | 
            -
                
         | 
| 236 | 
            +
                end #end of method evaluate_wrapper    
         | 
| 186 237 | 
             
              end #end of class Extractor
         | 
| 187 238 | 
             
            end #end of module Scrubyt
         | 
    
        data/lib/scrubyt/filter.rb
    CHANGED
    
    | @@ -2,7 +2,7 @@ module Scrubyt | |
| 2 2 | 
             
              ##
         | 
| 3 3 | 
             
              #=<tt>Filter out relevant pieces from the parent pattern</tt>
         | 
| 4 4 | 
             
              #
         | 
| 5 | 
            -
              #A Scrubyt  | 
| 5 | 
            +
              #A Scrubyt extractor is almost like a waterfall: water is pouring from the top until
         | 
| 6 6 | 
             
              #it reaches the bottom. The biggest difference is that instead of water, a HTML 
         | 
| 7 7 | 
             
              #document travels through the space.
         | 
| 8 8 | 
             
              #
         | 
| @@ -15,12 +15,12 @@ module Scrubyt | |
| 15 15 | 
             
              #The working of a filter will be explained most easily by the help of an example.
         | 
| 16 16 | 
             
              #Let's consider that we would like to extract information from a webshop; Concretely
         | 
| 17 17 | 
             
              #we are interested in the name of the items and the URL pointing to the image of the 
         | 
| 18 | 
            -
              #item
         | 
| 18 | 
            +
              #item.
         | 
| 19 19 | 
             
              #
         | 
| 20 | 
            -
              #To accomplish this | 
| 20 | 
            +
              #To accomplish this, first we select the items with the pattern item (a pattern is 
         | 
| 21 21 | 
             
              #a logical grouping of fillters; see Pattern documentation) Then our new 
         | 
| 22 | 
            -
              #context is the result extracted by the item pattern; For every pattern, further
         | 
| 23 | 
            -
              #extract the name and the image of the item; and finally,  | 
| 22 | 
            +
              #context is the result extracted by the 'item' pattern; For every 'item' pattern, further
         | 
| 23 | 
            +
              #extract the name and the image of the item; and finally, extract the href attribute
         | 
| 24 24 | 
             
              #of the image. Let's see an illustration:
         | 
| 25 25 | 
             
              #
         | 
| 26 26 | 
             
              #   root             --> This pattern is called a 'root pattern', It is invisible to you
         | 
| @@ -46,15 +46,18 @@ module Scrubyt | |
| 46 46 | 
             
                #Regexp example, like /\d+@*\d+[a-z]/
         | 
| 47 47 | 
             
                EXAMPLE_TYPE_REGEXP = 4
         | 
| 48 48 |  | 
| 49 | 
            -
                attr_accessor :example_type, :parent_pattern, :temp_sink,  | 
| 49 | 
            +
                attr_accessor :example_type, :parent_pattern, :temp_sink, 
         | 
| 50 | 
            +
                              :constraints, :xpath, :regexp, :example, :source, :sink
         | 
| 50 51 |  | 
| 51 | 
            -
                def initialize(parent_pattern, *args)
         | 
| 52 | 
            +
                def initialize(parent_pattern, example=nil, *args)
         | 
| 52 53 | 
             
                  @parent_pattern = parent_pattern
         | 
| 53 54 | 
             
                  #If the example type is not explicitly defined in the pattern definition,
         | 
| 54 55 | 
             
                  #try to determine it automatically from the example
         | 
| 55 | 
            -
                  @example_type = (args[0] == nil ? Filter.determine_example_type( | 
| 56 | 
            +
                  @example_type = (args[0] == nil ? Filter.determine_example_type(example) :
         | 
| 56 57 | 
             
                                                    args[0][:example_type])
         | 
| 57 | 
            -
                  @ | 
| 58 | 
            +
                  @sink = []                  #output of a filter
         | 
| 59 | 
            +
                  @source = []                #input of a filter                                        
         | 
| 60 | 
            +
                  @example = example
         | 
| 58 61 | 
             
                  @xpath = nil #The xpath to evaluate this filter
         | 
| 59 62 | 
             
                  #temp sinks are used for the initial run when determining the XPaths for examples;
         | 
| 60 63 | 
             
                  @temp_sink = nil 
         | 
| @@ -64,14 +67,15 @@ module Scrubyt | |
| 64 67 | 
             
                #Evaluate this filter. This method should not be called directly - as the pattern hierarchy 
         | 
| 65 68 | 
             
                #is evaluated, every pattern evaluates its filters and then they are calling this method
         | 
| 66 69 | 
             
                def evaluate(source)
         | 
| 70 | 
            +
                  @parent_pattern.root_pattern.already_evaluated_sources ||= {}
         | 
| 67 71 | 
             
                  case @parent_pattern.type
         | 
| 68 | 
            -
                    when Scrubyt::Pattern::PATTERN_TYPE_TREE
         | 
| 72 | 
            +
                    when Scrubyt::Pattern::PATTERN_TYPE_TREE      
         | 
| 69 73 | 
             
                      result = source/@xpath
         | 
| 70 74 | 
             
                      result.class == Hpricot::Elements ? result.map : [result]
         | 
| 71 75 | 
             
                    when Scrubyt::Pattern::PATTERN_TYPE_ATTRIBUTE  
         | 
| 72 | 
            -
                      [source.attributes[@ | 
| 76 | 
            +
                      [source.attributes[@example]]
         | 
| 73 77 | 
             
                    when Scrubyt::Pattern::PATTERN_TYPE_REGEXP 
         | 
| 74 | 
            -
                      source.inner_text.scan(@ | 
| 78 | 
            +
                      source.inner_text.scan(@example).flatten
         | 
| 75 79 | 
             
                  end      
         | 
| 76 80 | 
             
                end
         | 
| 77 81 |  | 
| @@ -81,38 +85,49 @@ module Scrubyt | |
| 81 85 | 
             
                def generate_XPath_for_example
         | 
| 82 86 | 
             
                  case @example_type
         | 
| 83 87 | 
             
                    when EXAMPLE_TYPE_XPATH
         | 
| 84 | 
            -
                      @xpath = @ | 
| 88 | 
            +
                      @xpath = @example
         | 
| 85 89 | 
             
                    when EXAMPLE_TYPE_STRING
         | 
| 86 | 
            -
                      @temp_sink = XPathUtils.find_node_from_text( @parent_pattern.root_pattern.source[0], @ | 
| 90 | 
            +
                      @temp_sink = XPathUtils.find_node_from_text( @parent_pattern.root_pattern.filters[0].source[0], @example )
         | 
| 87 91 | 
             
                      @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
         | 
| 88 92 | 
             
                                                             XPathUtils.generate_XPath(@temp_sink, nil, true)
         | 
| 89 | 
            -
             | 
| 90 | 
            -
                       | 
| 91 | 
            -
                       | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 93 | 
            +
                      puts @xpath                                                 
         | 
| 94 | 
            +
                    when EXAMPLE_TYPE_CHILDREN          
         | 
| 95 | 
            +
                      current_example_index = 0
         | 
| 96 | 
            +
                      loop do
         | 
| 97 | 
            +
                        all_child_temp_sinks = []
         | 
| 98 | 
            +
                        @parent_pattern.children.each do |child_pattern|
         | 
| 99 | 
            +
                          all_child_temp_sinks << child_pattern.filters[current_example_index].temp_sink
         | 
| 94 100 | 
             
                        end
         | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 101 | 
            -
             | 
| 101 | 
            +
                        
         | 
| 102 | 
            +
                        result = all_child_temp_sinks.pop
         | 
| 103 | 
            +
                        if all_child_temp_sinks.empty?
         | 
| 104 | 
            +
                          result = result.parent
         | 
| 105 | 
            +
                        else
         | 
| 106 | 
            +
                          all_child_temp_sinks.each do |child_sink|              
         | 
| 107 | 
            +
                            result = XPathUtils.lowest_common_ancestor(result, child_sink)
         | 
| 108 | 
            +
                          end
         | 
| 102 109 | 
             
                        end
         | 
| 110 | 
            +
                        xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(result, nil, false) :
         | 
| 111 | 
            +
                                                               XPathUtils.generate_XPath(result, nil, true)
         | 
| 112 | 
            +
                        if @parent_pattern.filters.size < current_example_index + 1
         | 
| 113 | 
            +
                          @parent_pattern.filters << Scrubyt::Filter.new(@parent_pattern)
         | 
| 114 | 
            +
                        end  
         | 
| 115 | 
            +
                        @parent_pattern.filters[current_example_index].xpath = xpath
         | 
| 116 | 
            +
                        @parent_pattern.filters[current_example_index].temp_sink = result
         | 
| 117 | 
            +
                        @parent_pattern.children.each do |child_pattern|
         | 
| 118 | 
            +
                              child_pattern.filters[current_example_index].xpath = 
         | 
| 119 | 
            +
                                child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result) :
         | 
| 120 | 
            +
                                                           XPathUtils.generate_relative_XPath(child_pattern.filters[current_example_index].temp_sink, result)                                            
         | 
| 121 | 
            +
                        end                                                
         | 
| 122 | 
            +
                        if @parent_pattern.children[0].examples == nil
         | 
| 123 | 
            +
                          break if @parent_pattern.children[0].filters.size == current_example_index+1
         | 
| 124 | 
            +
                        else
         | 
| 125 | 
            +
                          break if @parent_pattern.children[0].examples.size == current_example_index+1
         | 
| 126 | 
            +
                        end             
         | 
| 127 | 
            +
                        current_example_index += 1
         | 
| 103 128 | 
             
                      end
         | 
| 104 | 
            -
                      @temp_sink = result
         | 
| 105 | 
            -
                      @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
         | 
| 106 | 
            -
                                                             XPathUtils.generate_XPath(@temp_sink, nil, true)
         | 
| 107 | 
            -
                      @parent_pattern.children.each do |child_pattern|
         | 
| 108 | 
            -
                        child_pattern.filters.each do |filter|
         | 
| 109 | 
            -
                            filter.xpath = 
         | 
| 110 | 
            -
                              child_pattern.generalize ? XPathUtils.generate_generalized_relative_XPath(filter.temp_sink, result) :              
         | 
| 111 | 
            -
                                                         XPathUtils.generate_relative_XPath(filter.temp_sink, result)                                            
         | 
| 112 | 
            -
                        end
         | 
| 113 | 
            -
                      end          
         | 
| 114 129 | 
             
                    when EXAMPLE_TYPE_IMAGE
         | 
| 115 | 
            -
                      @temp_sink = XPathUtils.find_image(@parent_pattern.root_pattern.source[0], @ | 
| 130 | 
            +
                      @temp_sink = XPathUtils.find_image(@parent_pattern.root_pattern.filters[0].source[0], @example)
         | 
| 116 131 | 
             
                      @xpath = XPathUtils.generate_XPath(@temp_sink, nil, false)
         | 
| 117 132 | 
             
                  end
         | 
| 118 133 | 
             
                end
         |