scrubyt 0.3.0 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -6
- data/Rakefile +22 -10
- data/lib/scrubyt.rb +9 -4
- data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
- data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/pattern.rb +25 -18
- data/lib/scrubyt/core/shared/extractor.rb +109 -128
- data/lib/scrubyt/logging.rb +146 -8
- data/lib/scrubyt/output/export.rb +60 -44
- data/lib/scrubyt/output/result_node.rb +34 -3
- data/lib/scrubyt/output/scrubyt_result.rb +18 -9
- data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
- metadata +52 -6
- data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67
| @@ -14,17 +14,25 @@ module Scrubyt | |
| 14 14 |  | 
| 15 15 | 
             
            private
         | 
| 16 16 | 
             
                def download_file(source)
         | 
| 17 | 
            -
                   | 
| 17 | 
            +
                  return '' if source.size < 4
         | 
| 18 | 
            +
                  host_name = (source =~ /^http/ ? source : @parent_pattern.extractor.get_host_name)
         | 
| 18 19 | 
             
                  outfile = nil
         | 
| 20 | 
            +
                  host_name += "/" if host_name[-1..-1] != "/"
         | 
| 19 21 | 
             
                  base_url = host_name.scan(/http:\/\/(.+?)\//)[0][0]
         | 
| 20 | 
            -
                  return '' if source.size < 4
         | 
| 21 22 | 
             
                  file_name = source.scan(/.+\/(.*)/)[0][0]
         | 
| 23 | 
            +
                  return nil if @parent_pattern.except.include? file_name
         | 
| 22 24 | 
             
                  Net::HTTP.start(base_url) { |http|
         | 
| 23 | 
            -
                     | 
| 24 | 
            -
                     | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
             | 
| 25 | 
            +
                    Scrubyt.log :INFO, "downloading: #{source.scan(/\s*(.+)/)[0][0]}"
         | 
| 26 | 
            +
                    begin
         | 
| 27 | 
            +
                      ua = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)'
         | 
| 28 | 
            +
                      path = host_name.scan(/http:\/\/#{base_url}(.+)\//)
         | 
| 29 | 
            +
                      resp = http.get(path, {'User-Agent'=> ua})
         | 
| 30 | 
            +
                      outfile = DownloadFilter.find_nonexisting_file_name(File.join(@example, file_name))
         | 
| 31 | 
            +
                      FileUtils.mkdir_p @example
         | 
| 32 | 
            +
                      open(outfile, 'wb') {|f| f.write(resp.body) }
         | 
| 33 | 
            +
                    rescue Timeout::Error
         | 
| 34 | 
            +
                      outfile = "[FAILED]#{file_name}"
         | 
| 35 | 
            +
                    end
         | 
| 28 36 | 
             
                   }
         | 
| 29 37 | 
             
                   outfile.scan(/.+\/(.*)/)[0][0]
         | 
| 30 38 | 
             
                end
         | 
| @@ -34,11 +42,21 @@ private | |
| 34 42 | 
             
                  loop do
         | 
| 35 43 | 
             
                    if File.exists? file_name
         | 
| 36 44 | 
             
                      if already_found
         | 
| 37 | 
            -
                         | 
| 38 | 
            -
             | 
| 45 | 
            +
                        if file_name.include?('.')
         | 
| 46 | 
            +
                          last_no = file_name.scan(/_(\d+)\./)[0][0]
         | 
| 47 | 
            +
                          file_name.sub!(/_#{last_no}\./) {"_#{(last_no.to_i+1).to_s}."}
         | 
| 48 | 
            +
                        else
         | 
| 49 | 
            +
                          last_no = file_name.scan(/_(\d+)$/)[0][0]
         | 
| 50 | 
            +
                          file_name.sub!(/_#{last_no}$/) {"_#{(last_no.to_i+1).to_s}"}
         | 
| 51 | 
            +
                        end
         | 
| 39 52 | 
             
                      else
         | 
| 40 | 
            -
                        file_name. | 
| 41 | 
            -
             | 
| 53 | 
            +
                        if file_name.include?('.')
         | 
| 54 | 
            +
                          file_name.sub!(/\./) {"_1\."}
         | 
| 55 | 
            +
                          already_found = true
         | 
| 56 | 
            +
                        else
         | 
| 57 | 
            +
                          file_name << '_1'
         | 
| 58 | 
            +
                          already_found = true
         | 
| 59 | 
            +
                        end
         | 
| 42 60 | 
             
                      end
         | 
| 43 61 | 
             
                    else
         | 
| 44 62 | 
             
                      break
         | 
| @@ -1,17 +1,17 @@ | |
| 1 1 | 
             
            module Scrubyt
         | 
| 2 2 | 
             
              class RegexpFilter < BaseFilter
         | 
| 3 | 
            -
             | 
| 3 | 
            +
             | 
| 4 4 | 
             
                def evaluate(source)
         | 
| 5 5 | 
             
                  if source.is_a? String
         | 
| 6 6 | 
             
                    source.scan(@example).flatten
         | 
| 7 7 | 
             
                  else
         | 
| 8 | 
            -
                    source. | 
| 9 | 
            -
                  end | 
| 8 | 
            +
                    source.inner_html.gsub(/<.*?>/, '').scan(@example).flatten
         | 
| 9 | 
            +
                  end
         | 
| 10 10 | 
             
                end
         | 
| 11 | 
            -
             | 
| 11 | 
            +
             | 
| 12 12 | 
             
                def to_sexp
         | 
| 13 13 | 
             
                  [:lit, @example]
         | 
| 14 14 | 
             
                end
         | 
| 15 | 
            -
             | 
| 15 | 
            +
             | 
| 16 16 | 
             
              end #End of class RegexpFilter
         | 
| 17 17 | 
             
            end #End of module Scrubyt
         | 
| @@ -0,0 +1,14 @@ | |
| 1 | 
            +
            module Scrubyt
         | 
| 2 | 
            +
              class ScriptFilter < BaseFilter
         | 
| 3 | 
            +
             | 
| 4 | 
            +
                def evaluate(source)
         | 
| 5 | 
            +
                  param = source
         | 
| 6 | 
            +
                  param = source.inner_html.gsub(/<.*?>/, "") unless source.is_a? String
         | 
| 7 | 
            +
                  @example.call param
         | 
| 8 | 
            +
                end
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                def to_sexp
         | 
| 11 | 
            +
                  [:str, "FIXME!!! Can't dump Proc"]
         | 
| 12 | 
            +
                end #end of method to_sexp
         | 
| 13 | 
            +
              end #End of class ScriptFilter
         | 
| 14 | 
            +
            end #End of module Scrubyt
         | 
| @@ -0,0 +1,38 @@ | |
| 1 | 
            +
            module Scrubyt
         | 
| 2 | 
            +
              class TextFilter < BaseFilter
         | 
| 3 | 
            +
             | 
| 4 | 
            +
                def evaluate(source)
         | 
| 5 | 
            +
                    return find_string(source) if @example =~ /^find\(/
         | 
| 6 | 
            +
                    final_element_name = @example.scan(/^(.+?)\[/)[0][0]
         | 
| 7 | 
            +
                    text = Regexp.escape(@example.scan(/\[(.+?)\]/)[0][0])
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                    index = @example.scan(/\]:(.+)/).flatten
         | 
| 10 | 
            +
                    index = 0 if index.empty?
         | 
| 11 | 
            +
                    index = index[0].to_i unless index[0] == "all"
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                    result = (index.is_a? Fixnum) ? (SharedUtils.traverse_for_match(source,/#{text}/)[index]) : (SharedUtils.traverse_for_match(source,/#{text}/))
         | 
| 14 | 
            +
                    return "" unless result
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                    if index[0] == "all"
         | 
| 17 | 
            +
                      result.inject([]) {|a,r| a << XPathUtils.traverse_up_until_name(r,final_element_name); a}
         | 
| 18 | 
            +
                    else
         | 
| 19 | 
            +
                      [XPathUtils.traverse_up_until_name(result,final_element_name)]
         | 
| 20 | 
            +
                    end
         | 
| 21 | 
            +
                end
         | 
| 22 | 
            +
             | 
| 23 | 
            +
                def find_string(source)
         | 
| 24 | 
            +
                  str = @example.scan(/find\((.+)\)/).flatten[0]
         | 
| 25 | 
            +
                  strings_to_find = str.include? ('|') ? str.split('|') : [str]
         | 
| 26 | 
            +
                  strings_to_find.each do |s|
         | 
| 27 | 
            +
                    result = SharedUtils.traverse_for_match(source,/#{s}/i)
         | 
| 28 | 
            +
                    return [s] unless result.empty?
         | 
| 29 | 
            +
                  end
         | 
| 30 | 
            +
                  return []
         | 
| 31 | 
            +
                end
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                def to_sexp
         | 
| 34 | 
            +
                  [:str, @example]
         | 
| 35 | 
            +
                end #end of method to_sexp
         | 
| 36 | 
            +
              end #End of class TextFilter
         | 
| 37 | 
            +
            end #End of module Scrubyt
         | 
| 38 | 
            +
             | 
| @@ -38,7 +38,7 @@ module Scrubyt | |
| 38 38 | 
             
                  return if @temp_sink.is_a? String
         | 
| 39 39 | 
             
                  return if @example =~ /.+\[.+\]$/
         | 
| 40 40 |  | 
| 41 | 
            -
                  text = SharedUtils.prepare_text_for_comparison(@temp_sink. | 
| 41 | 
            +
                  text = SharedUtils.prepare_text_for_comparison(@temp_sink.inner_html.gsub(/<.*?>/, ''))
         | 
| 42 42 | 
             
                  match_range = @temp_sink.match_data.begin(0)..@temp_sink.match_data.end(0)
         | 
| 43 43 | 
             
                  return if match_range == (0..text.length)
         | 
| 44 44 |  | 
| @@ -64,7 +64,7 @@ module Scrubyt | |
| 64 64 | 
             
                  when EXAMPLE_TYPE_XPATH
         | 
| 65 65 | 
             
                    @xpath = @example
         | 
| 66 66 | 
             
                  when EXAMPLE_TYPE_STRING
         | 
| 67 | 
            -
                    @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern. | 
| 67 | 
            +
                    @temp_sink = SimpleExampleLookup.find_node_from_text(@parent_pattern.extractor.get_hpricot_doc,
         | 
| 68 68 | 
             
                                                                         @example,
         | 
| 69 69 | 
             
                                                                         next_page_example)
         | 
| 70 70 | 
             
                    return if @temp_sink == nil
         | 
| @@ -116,10 +116,10 @@ module Scrubyt | |
| 116 116 | 
             
                      current_example_index += 1
         | 
| 117 117 | 
             
                    end
         | 
| 118 118 | 
             
                  when EXAMPLE_TYPE_IMAGE
         | 
| 119 | 
            -
                    @temp_sink = XPathUtils.find_image(@parent_pattern. | 
| 119 | 
            +
                    @temp_sink = XPathUtils.find_image(@parent_pattern.extractor.get_hpricot_doc, @example)
         | 
| 120 120 | 
             
                    @xpath = XPathUtils.generate_XPath(@temp_sink, nil, true)
         | 
| 121 121 | 
             
                  when EXAMPLE_TYPE_COMPOUND
         | 
| 122 | 
            -
                    @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern. | 
| 122 | 
            +
                    @temp_sink = CompoundExampleLookup.find_node_from_compund_example(@parent_pattern.extractor.get_hpricot_doc,
         | 
| 123 123 | 
             
                                                                                      @example,
         | 
| 124 124 | 
             
                                                                                      next_page_example)
         | 
| 125 125 | 
             
                    @xpath = @parent_pattern.generalize ? XPathUtils.generate_XPath(@temp_sink, nil, false) :
         | 
| @@ -128,7 +128,7 @@ module Scrubyt | |
| 128 128 | 
             
                end
         | 
| 129 129 |  | 
| 130 130 | 
             
                def generate_relative_XPath(parent_xpath)
         | 
| 131 | 
            -
                  parent_xpath = XPathUtils.to_full_XPath(@parent_pattern. | 
| 131 | 
            +
                  parent_xpath = XPathUtils.to_full_XPath(@parent_pattern.extractor.get_hpricot_doc,
         | 
| 132 132 | 
             
                                                          parent_xpath,
         | 
| 133 133 | 
             
                                                          @parent_pattern.parent.generalize) if parent_xpath =~ /(\[@.+=.+\])$/
         | 
| 134 134 | 
             
                  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
         | 
| @@ -33,7 +33,7 @@ module Scrubyt | |
| 33 33 | 
             
                #    # write out the HTML subtree beginning at the matched element
         | 
| 34 34 | 
             
                #    PATTERN_TYPE_HTML_SUBTREE = :PATTERN_TYPE_HTML_SUBTREE
         | 
| 35 35 |  | 
| 36 | 
            -
                VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree]
         | 
| 36 | 
            +
                VALID_PATTERN_TYPES = [:tree, :attribute, :regexp, :detail_page, :download, :html_subtree, :constant, :script, :text]
         | 
| 37 37 |  | 
| 38 38 | 
             
                #The pattern can be either a model pattern (in this case it is
         | 
| 39 39 | 
             
                #written to the output) or a temp pattern (in this case it is skipped)
         | 
| @@ -46,27 +46,25 @@ module Scrubyt | |
| 46 46 | 
             
                #    #of the pattrern which was skipped
         | 
| 47 47 | 
             
                #    OUTPUT_TYPE_TEMP = :OUTPUT_TYPE_TEMP
         | 
| 48 48 |  | 
| 49 | 
            -
                VALID_OUTPUT_TYPES = [:model, :temp]
         | 
| 49 | 
            +
                VALID_OUTPUT_TYPES = [:model, :temp, :page_list]
         | 
| 50 50 |  | 
| 51 51 | 
             
                #These options can be set upon wrapper creation
         | 
| 52 | 
            -
                PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve]
         | 
| 52 | 
            +
                PATTERN_OPTIONS = [:generalize, :type, :output_type, :references, :limit, :default, :resolve, :except, :example_type]
         | 
| 53 53 | 
             
                VALID_OPTIONS = PATTERN_OPTIONS + Scrubyt::CompoundExample::DESCRIPTORS + Scrubyt::ResultNode::OUTPUT_OPTIONS
         | 
| 54 54 |  | 
| 55 | 
            -
                attr_accessor(:name, :options, :children, :constraints, :filters, :parent,
         | 
| 56 | 
            -
                              : | 
| 57 | 
            -
                              :indices_to_extract, :referenced_extractor, :referenced_pattern,
         | 
| 58 | 
            -
                              :source_file, :source_proc, :modifier_calls)
         | 
| 55 | 
            +
                attr_accessor(:name, :options, :children, :constraints, :filters, :parent, :extractor,
         | 
| 56 | 
            +
                              :indices_to_extract, :referenced_extractor, :referenced_pattern, :modifier_calls)
         | 
| 59 57 |  | 
| 60 58 | 
             
                attr_reader(:next_page_url, :result_indexer)
         | 
| 61 59 |  | 
| 62 60 | 
             
                option_reader(:type => :tree, :output_type => :model, :generalize => false,
         | 
| 63 61 | 
             
                              :write_text => lambda { @children.size == 0 }, :limit => nil,
         | 
| 64 | 
            -
             | 
| 62 | 
            +
                              :default => nil, :resolve => :full, :except => nil, :example_type => nil)
         | 
| 65 63 |  | 
| 66 | 
            -
                def initialize(name, args=[],  | 
| 64 | 
            +
                def initialize(name, args=[], extractor=nil, parent=nil, &block)
         | 
| 67 65 | 
             
                  #init attributes
         | 
| 68 66 | 
             
                  @name = name
         | 
| 69 | 
            -
                  @ | 
| 67 | 
            +
                  @extractor = extractor
         | 
| 70 68 | 
             
                  @parent = parent
         | 
| 71 69 | 
             
                  @options = {}
         | 
| 72 70 | 
             
                  @children = []
         | 
| @@ -83,6 +81,7 @@ module Scrubyt | |
| 83 81 | 
             
                  #perform checks for special cases
         | 
| 84 82 | 
             
                  examples = check_if_shortcut_pattern() if examples == nil
         | 
| 85 83 | 
             
                  check_if_detail_page(block)
         | 
| 84 | 
            +
                  @options[:output_type] = :page_list if name == 'page_list'
         | 
| 86 85 |  | 
| 87 86 | 
             
                  #create filters
         | 
| 88 87 | 
             
                  if examples == nil
         | 
| @@ -97,7 +96,7 @@ module Scrubyt | |
| 97 96 | 
             
                  #@generalize was not set up explicitly
         | 
| 98 97 | 
             
                  if @options[:generalize].nil?
         | 
| 99 98 | 
             
                    @options[:generalize] = true if parent.nil?
         | 
| 100 | 
            -
                    @options[:generalize] = false if filters[0].example =~ /.+\[[a-zA-Z].+\]$/
         | 
| 99 | 
            +
                    @options[:generalize] = false if ((filters[0].example.is_a? String) && (filters[0].example =~ /.+\[[a-zA-Z].+\]$/))
         | 
| 101 100 | 
             
                  end
         | 
| 102 101 |  | 
| 103 102 | 
             
                  #parse child patterns if available
         | 
| @@ -160,7 +159,6 @@ module Scrubyt | |
| 160 159 | 
             
                  if @name =~ /.+_detail/
         | 
| 161 160 | 
             
                    @options[:type] = :detail_page
         | 
| 162 161 | 
             
                    @referenced_extractor = block
         | 
| 163 | 
            -
                    Scrubyt::Extractor.add_detail_extractor_to_pattern_name(block, self)
         | 
| 164 162 | 
             
                  end
         | 
| 165 163 | 
             
                end
         | 
| 166 164 |  | 
| @@ -194,7 +192,7 @@ module Scrubyt | |
| 194 192 | 
             
                        end
         | 
| 195 193 | 
             
                      else
         | 
| 196 194 | 
             
                        #create child pattern
         | 
| 197 | 
            -
                        child = Scrubyt::Pattern.new(method_name.to_s, args, @current. | 
| 195 | 
            +
                        child = Scrubyt::Pattern.new(method_name.to_s, args, @current.extractor, @current, &block)
         | 
| 198 196 | 
             
                        @current.children << child
         | 
| 199 197 | 
             
                        child
         | 
| 200 198 | 
             
                      end
         | 
| @@ -213,7 +211,7 @@ module Scrubyt | |
| 213 211 | 
             
                #
         | 
| 214 212 | 
             
                # camera_data.item[1].item_name[0]
         | 
| 215 213 | 
             
                def method_missing(method_name, *args, &block)
         | 
| 216 | 
            -
                  if @ | 
| 214 | 
            +
                  if @extractor.evaluating_extractor_definition
         | 
| 217 215 | 
             
                    @modifier_calls << [method_name, [:array, *args.collect { |arg| [:lit, arg] }]]
         | 
| 218 216 | 
             
                  end
         | 
| 219 217 |  | 
| @@ -294,9 +292,17 @@ module Scrubyt | |
| 294 292 | 
             
                    result_nodes << node
         | 
| 295 293 | 
             
                  end
         | 
| 296 294 | 
             
                  if result_nodes.empty?
         | 
| 297 | 
            -
                    result_nodes << ResultNode.new(@name | 
| 295 | 
            +
                    result_nodes << ResultNode.new(@name, @options[:default], @options) if @options[:default]
         | 
| 296 | 
            +
                  end
         | 
| 297 | 
            +
                  case output_type
         | 
| 298 | 
            +
                    when :model
         | 
| 299 | 
            +
                      return result_nodes
         | 
| 300 | 
            +
                    when :page_list
         | 
| 301 | 
            +
                      result_nodes.each do |result_node|
         | 
| 302 | 
            +
                        @extractor.add_to_next_page_list result_node
         | 
| 303 | 
            +
                      end
         | 
| 304 | 
            +
                      return []
         | 
| 298 305 | 
             
                  end
         | 
| 299 | 
            -
                  result_nodes
         | 
| 300 306 | 
             
                end
         | 
| 301 307 |  | 
| 302 308 | 
             
                def to_sexp
         | 
| @@ -310,8 +316,7 @@ module Scrubyt | |
| 310 316 |  | 
| 311 317 | 
             
                  if type == :detail_page
         | 
| 312 318 | 
             
                    #add detail page extractor
         | 
| 313 | 
            -
                     | 
| 314 | 
            -
                    sexp = [:iter, sexp, nil, [:block, detail_root.to_sexp]]
         | 
| 319 | 
            +
                    sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
         | 
| 315 320 | 
             
                  else
         | 
| 316 321 | 
             
                    #add child block if the pattern has children
         | 
| 317 322 | 
             
                    sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
         | 
| @@ -363,6 +368,8 @@ module Scrubyt | |
| 363 368 | 
             
                  elsif (args[0].is_a? Hash)
         | 
| 364 369 | 
             
                    examples = (args.select {|e| e.is_a? Hash}).select {|e| CompoundExample.compound_example?(e)}
         | 
| 365 370 | 
             
                    examples = nil if examples == []
         | 
| 371 | 
            +
                  elsif (args[0].is_a? Proc)
         | 
| 372 | 
            +
                    examples = [args[0]]
         | 
| 366 373 | 
             
                  end
         | 
| 367 374 |  | 
| 368 375 | 
             
                  @has_examples = !examples.nil?
         | 
| @@ -7,166 +7,147 @@ module Scrubyt | |
| 7 7 | 
             
              #
         | 
| 8 8 | 
             
              #Originally also the navigation actions were here, but since the class got too
         | 
| 9 9 | 
             
              #big, they were factored out into their own module, NavigationActions.
         | 
| 10 | 
            -
              class Extractor | 
| 10 | 
            +
              class Extractor
         | 
| 11 | 
            +
                include FetchAction
         | 
| 12 | 
            +
                
         | 
| 13 | 
            +
                attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url
         | 
| 14 | 
            +
                
         | 
| 11 15 | 
             
                #The definition of the extractor is passed through this method
         | 
| 12 16 | 
             
                def self.define(mode=nil, &extractor_definition)
         | 
| 17 | 
            +
                  extractor = self.new(mode, extractor_definition)
         | 
| 18 | 
            +
                  extractor.result
         | 
| 19 | 
            +
                end
         | 
| 20 | 
            +
                
         | 
| 21 | 
            +
                def self.load(filename)
         | 
| 22 | 
            +
                  define(&eval(IO.read(filename)))
         | 
| 23 | 
            +
                end
         | 
| 24 | 
            +
                
         | 
| 25 | 
            +
                def initialize(mode, extractor_definition)
         | 
| 26 | 
            +
                  @mode = mode
         | 
| 27 | 
            +
                  @root_patterns = []
         | 
| 28 | 
            +
                  @next_page_pattern = nil
         | 
| 29 | 
            +
                  #      @hpricot_doc = nil
         | 
| 30 | 
            +
                  #      @hpricot_doc_url = nil
         | 
| 31 | 
            +
                  @evaluating_extractor_definition = false
         | 
| 32 | 
            +
                  @next_page_list = []
         | 
| 33 | 
            +
                  @processed_pages = []
         | 
| 34 | 
            +
                  
         | 
| 13 35 | 
             
                  backtrace = SharedUtils.get_backtrace
         | 
| 14 36 | 
             
                  parts = backtrace[1].split(':')
         | 
| 15 37 | 
             
                  source_file = parts[0]
         | 
| 16 38 |  | 
| 17 | 
            -
                   | 
| 18 | 
            -
                  #We are keeping the relations between the detail patterns and their root patterns
         | 
| 19 | 
            -
                  @@detail_extractor_to_pattern_name = {}
         | 
| 20 | 
            -
                  @@detail_pattern_relations = {} 
         | 
| 21 | 
            -
                  #root pattern -> URIBuilder mapping
         | 
| 22 | 
            -
                  @@next_patterns = {}
         | 
| 23 | 
            -
                  mode_name = (mode == :production ? 'Production' : 'Learning')
         | 
| 39 | 
            +
                  Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'
         | 
| 24 40 |  | 
| 25 | 
            -
                   | 
| 26 | 
            -
             | 
| 27 | 
            -
                   | 
| 28 | 
            -
                   | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 41 | 
            +
                  @evaluating_extractor_definition = true
         | 
| 42 | 
            +
                  context = Object.new
         | 
| 43 | 
            +
                  context.extend NavigationActions
         | 
| 44 | 
            +
                  context.instance_eval do
         | 
| 45 | 
            +
                    def extractor=(value)
         | 
| 46 | 
            +
                      @extractor = value
         | 
| 47 | 
            +
                    end
         | 
| 48 | 
            +
                    
         | 
| 49 | 
            +
                    def next_page(*args)
         | 
| 50 | 
            +
                      @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
         | 
| 51 | 
            +
                    end
         | 
| 52 | 
            +
                    
         | 
| 53 | 
            +
                    def method_missing(method_name, *args, &block)
         | 
| 54 | 
            +
                      root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
         | 
| 55 | 
            +
                      @extractor.root_patterns << root_pattern
         | 
| 56 | 
            +
                      root_pattern
         | 
| 57 | 
            +
                    end
         | 
| 58 | 
            +
                  end
         | 
| 59 | 
            +
                  context.extractor = self
         | 
| 60 | 
            +
                  context.instance_eval(&extractor_definition)
         | 
| 61 | 
            +
                  @evaluating_extractor_definition = false
         | 
| 62 | 
            +
                  
         | 
| 63 | 
            +
                  if @root_patterns.empty?
         | 
| 37 64 | 
             
                    # TODO: this should be an exception
         | 
| 38 65 | 
             
                    Scrubyt.log :ERROR, 'No extractor defined, exiting...'
         | 
| 39 66 | 
             
                    exit
         | 
| 40 67 | 
             
                  end
         | 
| 41 | 
            -
             | 
| 42 | 
            -
                  root_pattern.source_file = source_file
         | 
| 43 | 
            -
                  root_pattern.source_proc = extractor_definition
         | 
| 68 | 
            +
                  
         | 
| 44 69 | 
             
                  #Once all is set up, evaluate the extractor from the root pattern!
         | 
| 45 | 
            -
                  root_results = evaluate_extractor | 
| 46 | 
            -
             | 
| 47 | 
            -
                   | 
| 48 | 
            -
                   | 
| 49 | 
            -
                   | 
| 50 | 
            -
             | 
| 70 | 
            +
                  root_results = evaluate_extractor
         | 
| 71 | 
            +
                  
         | 
| 72 | 
            +
                  @result = ScrubytResult.new('root')
         | 
| 73 | 
            +
                  @result.push(*root_results)
         | 
| 74 | 
            +
                  @result.root_patterns = @root_patterns
         | 
| 75 | 
            +
                  @result.source_file = source_file
         | 
| 76 | 
            +
                  @result.source_proc = extractor_definition
         | 
| 77 | 
            +
                  
         | 
| 51 78 | 
             
                  #Return the root pattern
         | 
| 52 79 | 
             
                  Scrubyt.log :INFO, 'Extraction finished succesfully!'
         | 
| 53 | 
            -
                  scrubyt_result
         | 
| 54 | 
            -
                end
         | 
| 55 | 
            -
                
         | 
| 56 | 
            -
                #Evaluate a subexttractor (i.e. an extractor on a detail page).
         | 
| 57 | 
            -
                #The url passed to this function is automatically loaded.
         | 
| 58 | 
            -
                #The definition of the subextractor is passed as a block
         | 
| 59 | 
            -
                #
         | 
| 60 | 
            -
                #!!!! THIS CODE IS A MESS, IT needs to be refactored ASAP....
         | 
| 61 | 
            -
                def self.evaluate_subextractor(url, parent_pattern, resolve)
         | 
| 62 | 
            -
                  if @@detail_pattern_relations.keys.include? @@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]
         | 
| 63 | 
            -
                    detail_root = @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
         | 
| 64 | 
            -
                    detail_root.last_result = nil
         | 
| 65 | 
            -
                    FetchAction.store_page
         | 
| 66 | 
            -
                    @@original_evaluation_context.push @@evaluation_context
         | 
| 67 | 
            -
                    @@host_stack.push FetchAction.get_host_name
         | 
| 68 | 
            -
                    @@evaluation_context = EvaluationContext.new
         | 
| 69 | 
            -
                    FetchAction.restore_host_name
         | 
| 70 | 
            -
                    fetch url, :resolve => resolve
         | 
| 71 | 
            -
                    @@evaluation_context.extractor = self
         | 
| 72 | 
            -
                    @@evaluation_context.root_pattern = detail_root      
         | 
| 73 | 
            -
                    root_results = evaluate_extractor detail_root      
         | 
| 74 | 
            -
                    @@evaluation_context = @@original_evaluation_context.pop
         | 
| 75 | 
            -
                    FetchAction.restore_page
         | 
| 76 | 
            -
                    FetchAction.store_host_name(@@host_stack.pop)
         | 
| 77 | 
            -
                    root_results
         | 
| 78 | 
            -
                  else      
         | 
| 79 | 
            -
                    @@original_evaluation_context ||= []
         | 
| 80 | 
            -
                    @@host_stack ||= []
         | 
| 81 | 
            -
                    FetchAction.store_page
         | 
| 82 | 
            -
                    @@original_evaluation_context.push @@evaluation_context
         | 
| 83 | 
            -
                    @@host_stack.push FetchAction.get_host_name
         | 
| 84 | 
            -
                    @@evaluation_context = EvaluationContext.new
         | 
| 85 | 
            -
                    FetchAction.restore_host_name      
         | 
| 86 | 
            -
                    fetch url, :resolve => resolve
         | 
| 87 | 
            -
                    class_eval(&parent_pattern.referenced_extractor)
         | 
| 88 | 
            -
                    root_pattern = @@evaluation_context.root_pattern
         | 
| 89 | 
            -
                    @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]] = root_pattern
         | 
| 90 | 
            -
                    root_results = evaluate_extractor(root_pattern)
         | 
| 91 | 
            -
                    @@evaluation_context = @@original_evaluation_context.pop
         | 
| 92 | 
            -
                    FetchAction.restore_page
         | 
| 93 | 
            -
                    FetchAction.store_host_name(@@host_stack.pop)
         | 
| 94 | 
            -
                    root_results
         | 
| 95 | 
            -
                  end
         | 
| 96 | 
            -
                end
         | 
| 97 | 
            -
                
         | 
| 98 | 
            -
                #build the current wrapper    
         | 
| 99 | 
            -
                def self.method_missing(method_name, *args, &block)
         | 
| 100 | 
            -
                  if NavigationActions::KEYWORDS.include? method_name.to_s
         | 
| 101 | 
            -
                    NavigationActions.send(method_name, *args)
         | 
| 102 | 
            -
                    return
         | 
| 103 | 
            -
                  end
         | 
| 104 | 
            -
             | 
| 105 | 
            -
                  if method_name.to_s == 'next_page'
         | 
| 106 | 
            -
                    pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context)
         | 
| 107 | 
            -
                    pattern.evaluation_context = @@evaluation_context
         | 
| 108 | 
            -
                    
         | 
| 109 | 
            -
                    @@evaluation_context.setup_uri_builder(pattern, args)
         | 
| 110 | 
            -
                    @@next_patterns[@@last_root_pattern] = @@evaluation_context.uri_builder
         | 
| 111 | 
            -
                  else
         | 
| 112 | 
            -
                    raise "Only one root pattern allowed" if !@@evaluation_context.root_pattern.nil?
         | 
| 113 | 
            -
                    #Create a root pattern
         | 
| 114 | 
            -
                    @@evaluation_context.extractor = self
         | 
| 115 | 
            -
                    root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @@evaluation_context, root_pattern, &block)
         | 
| 116 | 
            -
                    @@last_root_pattern = root_pattern
         | 
| 117 | 
            -
                    @@evaluation_context.root_pattern = root_pattern
         | 
| 118 | 
            -
                    root_pattern
         | 
| 119 | 
            -
                  end
         | 
| 120 80 | 
             
                end
         | 
| 121 81 |  | 
| 122 | 
            -
                def  | 
| 123 | 
            -
                   | 
| 124 | 
            -
                end
         | 
| 125 | 
            -
             | 
| 126 | 
            -
                def self.get_detail_extractor(parent_pattern)
         | 
| 127 | 
            -
                  @@detail_pattern_relations[@@detail_extractor_to_pattern_name[parent_pattern.referenced_extractor]]
         | 
| 128 | 
            -
                end
         | 
| 129 | 
            -
             | 
| 130 | 
            -
                def self.get_hpricot_doc
         | 
| 131 | 
            -
                  NavigationActions.get_hpricot_doc
         | 
| 82 | 
            +
                def get_hpricot_doc
         | 
| 83 | 
            +
                  FetchAction.get_hpricot_doc
         | 
| 132 84 | 
             
                end
         | 
| 133 85 |  | 
| 134 | 
            -
                def  | 
| 135 | 
            -
                   | 
| 86 | 
            +
                def get_current_doc_url
         | 
| 87 | 
            +
                  FetchAction.get_current_doc_url
         | 
| 136 88 | 
             
                end
         | 
| 137 89 |  | 
| 138 | 
            -
                def  | 
| 139 | 
            -
                   | 
| 90 | 
            +
                def get_detail_pattern_relations
         | 
| 91 | 
            +
                  @detail_pattern_relations
         | 
| 140 92 | 
             
                end
         | 
| 141 93 |  | 
| 142 | 
            -
                def  | 
| 143 | 
            -
                   | 
| 94 | 
            +
                def get_mode
         | 
| 95 | 
            +
                  @mode
         | 
| 144 96 | 
             
                end
         | 
| 145 97 |  | 
| 146 | 
            -
                def  | 
| 147 | 
            -
                   | 
| 98 | 
            +
                def get_original_host_name
         | 
| 99 | 
            +
                  @original_host_name
         | 
| 148 100 | 
             
                end
         | 
| 149 101 |  | 
| 150 | 
            -
                def  | 
| 151 | 
            -
                   | 
| 102 | 
            +
                def add_to_next_page_list(result_node)
         | 
| 103 | 
            +
                  if result_node.result.is_a? Hpricot::Elem
         | 
| 104 | 
            +
                    node = XPathUtils.find_nearest_node_with_attribute(result_node.result, 'href')
         | 
| 105 | 
            +
                    return if node == nil || node.attributes['href'] == nil
         | 
| 106 | 
            +
                    href = node.attributes['href'].gsub('&') {'&'}
         | 
| 107 | 
            +
                  elsif result_node.result.is_a? String
         | 
| 108 | 
            +
                    href = result_node.result
         | 
| 109 | 
            +
                  end
         | 
| 110 | 
            +
                  url = href #TODO need absolute address here 1/4
         | 
| 111 | 
            +
                  @next_page_list << url
         | 
| 152 112 | 
             
                end
         | 
| 153 113 |  | 
| 154 | 
            -
                 | 
| 155 | 
            -
                
         | 
| 156 | 
            -
                def self.evaluate_extractor(root_pattern)
         | 
| 114 | 
            +
                def evaluate_extractor
         | 
| 157 115 | 
             
                  root_results = []
         | 
| 158 | 
            -
                   | 
| 159 | 
            -
             | 
| 116 | 
            +
                  current_page_count = 1
         | 
| 117 | 
            +
                  catch :quit_next_page_loop do
         | 
| 160 118 | 
             
                    loop do
         | 
| 161 | 
            -
                       | 
| 162 | 
            -
                       | 
| 163 | 
            -
                       | 
| 119 | 
            +
                      url = get_current_doc_url #TODO need absolute address here 2/4
         | 
| 120 | 
            +
                      puts url
         | 
| 121 | 
            +
                      @processed_pages << url
         | 
| 122 | 
            +
                      @root_patterns.each do |root_pattern|
         | 
| 123 | 
            +
                        root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
         | 
| 124 | 
            +
                      end
         | 
| 125 | 
            +
                      
         | 
| 126 | 
            +
                      while @processed_pages.include? url #TODO need absolute address here 3/4
         | 
| 127 | 
            +
                        if !@next_page_pattern.nil?
         | 
| 128 | 
            +
                          throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
         | 
| 129 | 
            +
                          throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
         | 
| 130 | 
            +
                          xpath = @next_page_pattern.filters[0].xpath
         | 
| 131 | 
            +
                          node = (get_hpricot_doc/xpath).map.last
         | 
| 132 | 
            +
                          node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
         | 
| 133 | 
            +
                          throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
         | 
| 134 | 
            +
                          href = node.attributes['href'].gsub('&') {'&'}
         | 
| 135 | 
            +
                          throw :quit_next_page_loop if href == nil
         | 
| 136 | 
            +
                          url = href #TODO need absolute address here 4/4
         | 
| 137 | 
            +
                        else
         | 
| 138 | 
            +
                          throw :quit_next_page_loop if @next_page_list.empty?
         | 
| 139 | 
            +
                          url = @next_page_list.pop
         | 
| 140 | 
            +
                        end
         | 
| 141 | 
            +
                      end
         | 
| 142 | 
            +
             | 
| 143 | 
            +
                      restore_host_name
         | 
| 144 | 
            +
                      FetchAction.fetch(url)
         | 
| 145 | 
            +
                      
         | 
| 146 | 
            +
                      current_page_count += 1
         | 
| 164 147 | 
             
                    end
         | 
| 165 | 
            -
                  else
         | 
| 166 | 
            -
                    root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
         | 
| 167 148 | 
             
                  end
         | 
| 168 149 | 
             
                  root_results
         | 
| 169 150 | 
             
                end
         | 
| 170 151 |  | 
| 171 | 
            -
              end | 
| 172 | 
            -
            end | 
| 152 | 
            +
              end
         | 
| 153 | 
            +
            end
         |