scrubyt 0.3.0 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -6
- data/Rakefile +22 -10
- data/lib/scrubyt.rb +9 -4
- data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
- data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/pattern.rb +25 -18
- data/lib/scrubyt/core/shared/extractor.rb +109 -128
- data/lib/scrubyt/logging.rb +146 -8
- data/lib/scrubyt/output/export.rb +60 -44
- data/lib/scrubyt/output/result_node.rb +34 -3
- data/lib/scrubyt/output/scrubyt_result.rb +18 -9
- data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
- metadata +52 -6
- data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67
    
        metadata
    CHANGED
    
    | @@ -3,9 +3,9 @@ rubygems_version: 0.9.0 | |
| 3 3 | 
             
            specification_version: 1
         | 
| 4 4 | 
             
            name: scrubyt
         | 
| 5 5 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 6 | 
            -
              version: 0.3.0 | 
| 7 | 
            -
            date: 2007- | 
| 8 | 
            -
            summary: A powerful Web-scraping framework
         | 
| 6 | 
            +
              version: 0.3.4
         | 
| 7 | 
            +
            date: 2007-09-26 00:00:00 +02:00
         | 
| 8 | 
            +
            summary: A powerful Web-scraping framework built on Mechanize and Hpricot
         | 
| 9 9 | 
             
            require_paths: 
         | 
| 10 10 | 
             
            - lib
         | 
| 11 11 | 
             
            email: peter@rubyrailways.com
         | 
| @@ -15,7 +15,7 @@ description: scRUBYt! is an easy to learn and use, yet powerful and effective we | |
| 15 15 | 
             
            autorequire: 
         | 
| 16 16 | 
             
            default_executable: 
         | 
| 17 17 | 
             
            bindir: bin
         | 
| 18 | 
            -
            has_rdoc:  | 
| 18 | 
            +
            has_rdoc: false
         | 
| 19 19 | 
             
            required_ruby_version: !ruby/object:Gem::Version::Requirement 
         | 
| 20 20 | 
             
              requirements: 
         | 
| 21 21 | 
             
              - - ">"
         | 
| @@ -61,9 +61,10 @@ files: | |
| 61 61 | 
             
            - lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
         | 
| 62 62 | 
             
            - lib/scrubyt/core/scraping/filters/detail_page_filter.rb
         | 
| 63 63 | 
             
            - lib/scrubyt/core/scraping/filters/download_filter.rb
         | 
| 64 | 
            -
            - lib/scrubyt/core/ | 
| 64 | 
            +
            - lib/scrubyt/core/scraping/filters/text_filter.rb
         | 
| 65 | 
            +
            - lib/scrubyt/core/scraping/filters/constant_filter.rb
         | 
| 66 | 
            +
            - lib/scrubyt/core/scraping/filters/script_filter.rb
         | 
| 65 67 | 
             
            - lib/scrubyt/core/shared/extractor.rb
         | 
| 66 | 
            -
            - lib/scrubyt/core/shared/evaluation_context.rb
         | 
| 67 68 | 
             
            test_files: []
         | 
| 68 69 |  | 
| 69 70 | 
             
            rdoc_options: []
         | 
| @@ -95,3 +96,48 @@ dependencies: | |
| 95 96 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 96 97 | 
             
                    version: 0.6.3
         | 
| 97 98 | 
             
                version: 
         | 
| 99 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 100 | 
            +
              name: ParseTreeReloaded
         | 
| 101 | 
            +
              version_requirement: 
         | 
| 102 | 
            +
              version_requirements: !ruby/object:Gem::Version::Requirement 
         | 
| 103 | 
            +
                requirements: 
         | 
| 104 | 
            +
                - - ">"
         | 
| 105 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 106 | 
            +
                    version: 0.0.0
         | 
| 107 | 
            +
                version: 
         | 
| 108 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 109 | 
            +
              name: RubyInlineAcceleration
         | 
| 110 | 
            +
              version_requirement: 
         | 
| 111 | 
            +
              version_requirements: !ruby/object:Gem::Version::Requirement 
         | 
| 112 | 
            +
                requirements: 
         | 
| 113 | 
            +
                - - ">"
         | 
| 114 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 115 | 
            +
                    version: 0.0.0
         | 
| 116 | 
            +
                version: 
         | 
| 117 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 118 | 
            +
              name: RubyInline
         | 
| 119 | 
            +
              version_requirement: 
         | 
| 120 | 
            +
              version_requirements: !ruby/object:Gem::Version::Requirement 
         | 
| 121 | 
            +
                requirements: 
         | 
| 122 | 
            +
                - - "="
         | 
| 123 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 124 | 
            +
                    version: 3.6.3
         | 
| 125 | 
            +
                version: 
         | 
| 126 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 127 | 
            +
              name: ParseTree
         | 
| 128 | 
            +
              version_requirement: 
         | 
| 129 | 
            +
              version_requirements: !ruby/object:Gem::Version::Requirement 
         | 
| 130 | 
            +
                requirements: 
         | 
| 131 | 
            +
                - - "="
         | 
| 132 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 133 | 
            +
                    version: 1.7.1
         | 
| 134 | 
            +
                version: 
         | 
| 135 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 136 | 
            +
              name: ruby2ruby
         | 
| 137 | 
            +
              version_requirement: 
         | 
| 138 | 
            +
              version_requirements: !ruby/object:Gem::Version::Requirement 
         | 
| 139 | 
            +
                requirements: 
         | 
| 140 | 
            +
                - - "="
         | 
| 141 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 142 | 
            +
                    version: 1.1.6
         | 
| 143 | 
            +
                version: 
         | 
| @@ -1,57 +0,0 @@ | |
| 1 | 
            -
            module Scrubyt
         | 
| 2 | 
            -
              ##
         | 
| 3 | 
            -
              #=<tt>Holding the evaluation context of the extraction process</tt>
         | 
| 4 | 
            -
              #
         | 
| 5 | 
            -
              #Every kind of data that is shared among patterns during the extraction process
         | 
| 6 | 
            -
              #is held in this class, so it can be looked up anytime.
         | 
| 7 | 
            -
              #
         | 
| 8 | 
            -
              #This class provides also some high-level basic functionality in navigation, like
         | 
| 9 | 
            -
              #crawling to new pages, attaching document to the root pattern once arrived at the
         | 
| 10 | 
            -
              #desired page etc.
         | 
| 11 | 
            -
              #
         | 
| 12 | 
            -
              #It can be viewed as a glue between Extractor and NavigationActions as well - these
         | 
| 13 | 
            -
              #two classes need to communicate frequently as well as share different information
         | 
| 14 | 
            -
              #and this is accomplished through EvaluationContext.
         | 
| 15 | 
            -
              class EvaluationContext
         | 
| 16 | 
            -
                attr_accessor :root_pattern, :document_index, :extractor, :uri_builder, :evaluating_extractor_definition
         | 
| 17 | 
            -
             | 
| 18 | 
            -
                def initialize
         | 
| 19 | 
            -
                  @root_pattern = nil
         | 
| 20 | 
            -
                  @next_page = nil
         | 
| 21 | 
            -
                  @document_index = 0
         | 
| 22 | 
            -
                  @extractor = nil
         | 
| 23 | 
            -
                  @evaluating_extractor_definition = false
         | 
| 24 | 
            -
                end
         | 
| 25 | 
            -
             | 
| 26 | 
            -
                ##
         | 
| 27 | 
            -
                #Crawl to a new page. This function should not be called from the outside - it is automatically called
         | 
| 28 | 
            -
                #if the next_page pattern is defined
         | 
| 29 | 
            -
                def crawl_to_new_page(uri_builder)
         | 
| 30 | 
            -
                  #puts "Crawling to new page!"
         | 
| 31 | 
            -
                  #puts "example #{uri_builder.next_page_example}"
         | 
| 32 | 
            -
                  temp_document = uri_builder.next_page_example ?
         | 
| 33 | 
            -
                                    generate_next_page_link(uri_builder) :
         | 
| 34 | 
            -
                                    uri_builder.generate_next_uri
         | 
| 35 | 
            -
                  return false if temp_document == nil
         | 
| 36 | 
            -
                  FetchAction.restore_host_name
         | 
| 37 | 
            -
                  @extractor.fetch(temp_document)
         | 
| 38 | 
            -
                  return true
         | 
| 39 | 
            -
                end
         | 
| 40 | 
            -
             | 
| 41 | 
            -
                def generate_next_page_link(uri_builder)
         | 
| 42 | 
            -
                  return nil unless uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
         | 
| 43 | 
            -
                  xpath = uri_builder.next_page_pattern.filters[0].xpath
         | 
| 44 | 
            -
                  node = (@extractor.get_hpricot_doc/xpath).map.last
         | 
| 45 | 
            -
                  node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
         | 
| 46 | 
            -
                  return nil if node == nil || node.attributes['href'] == nil
         | 
| 47 | 
            -
                  node.attributes['href'].gsub('&amp;') {'&'}
         | 
| 48 | 
            -
                end
         | 
| 49 | 
            -
             | 
| 50 | 
            -
                def setup_uri_builder(pattern,args)
         | 
| 51 | 
            -
                  if args[0] =~ /^http.+/
         | 
| 52 | 
            -
                    args.insert(0, @extractor.get_current_doc_url) if args[1] !~ /^http.+/
         | 
| 53 | 
            -
                  end
         | 
| 54 | 
            -
                  @uri_builder = URIBuilder.new(pattern,args)
         | 
| 55 | 
            -
                end
         | 
| 56 | 
            -
              end #end of class EvaluationContext
         | 
| 57 | 
            -
            end #end of module Scrubyt
         | 
| @@ -1,67 +0,0 @@ | |
| 1 | 
            -
            module Scrubyt
         | 
| 2 | 
            -
              ##
         | 
| 3 | 
            -
              #=<tt>Build URIs from different parameters</tt>
         | 
| 4 | 
            -
              #
         | 
| 5 | 
            -
              #When crawling to further pages which are machine-generated
         | 
| 6 | 
            -
              #(most typically "next" pages) we need to detect the pattern
         | 
| 7 | 
            -
              #and generate the next URI based on the detected rule. This
         | 
| 8 | 
            -
              #class provides methods to build URIs based on different criteria.
         | 
| 9 | 
            -
              #
         | 
| 10 | 
            -
              #The other possibility is to use constant objects ('Next' links,
         | 
| 11 | 
            -
              #or image links (like right arrow) pointing to the next page).
         | 
| 12 | 
            -
              #URIBuilder supports both possibilities.
         | 
| 13 | 
            -
              class URIBuilder
         | 
| 14 | 
            -
                attr_reader :next_page_example, :next_page_pattern, :limit, :next_param, :next_increment, :increment, :current_uri
         | 
| 15 | 
            -
                
         | 
| 16 | 
            -
                def initialize(pattern,args)
         | 
| 17 | 
            -
                  if args[0] =~ /^http.+/
         | 
| 18 | 
            -
                    #Figure out how are the URLs generated based on the next URL
         | 
| 19 | 
            -
                    get_next_param(string_diff(args[0], args[1]))
         | 
| 20 | 
            -
                    @increment = 0
         | 
| 21 | 
            -
                    @current_uri = args[1]
         | 
| 22 | 
            -
                    @limit = args[2][:limit] if args.size > 2
         | 
| 23 | 
            -
                  else
         | 
| 24 | 
            -
                    #Otherwise, do this in the 'classic' way (by clicking on the "next" link)
         | 
| 25 | 
            -
                    @next_page_pattern = pattern
         | 
| 26 | 
            -
                    @next_page_example = args[0]
         | 
| 27 | 
            -
                    @limit = args[1][:limit] if args.size > 1
         | 
| 28 | 
            -
                  end
         | 
| 29 | 
            -
                end
         | 
| 30 | 
            -
                
         | 
| 31 | 
            -
                #Used when generating the next URI (as opposed to 'clicking' the next link)
         | 
| 32 | 
            -
                def generate_next_uri      
         | 
| 33 | 
            -
                  @increment += @next_increment
         | 
| 34 | 
            -
                  return @current_uri if @increment == @next_increment
         | 
| 35 | 
            -
                  @next_increment = 1 if @next_increment == 2
         | 
| 36 | 
            -
                  if @current_uri !~ /#{@next_param}/
         | 
| 37 | 
            -
                    @current_uri += (@next_param + '=' + @next_increment.to_s)
         | 
| 38 | 
            -
                  else
         | 
| 39 | 
            -
                    @current_uri = @current_uri.sub(/#{@next_param}=#{@increment-@next_increment}/) do
         | 
| 40 | 
            -
                      "#{@next_param}=#{@increment}"
         | 
| 41 | 
            -
                    end
         | 
| 42 | 
            -
                  end
         | 
| 43 | 
            -
                end 
         | 
| 44 | 
            -
                
         | 
| 45 | 
            -
            private
         | 
| 46 | 
            -
                def get_next_param(pair)
         | 
| 47 | 
            -
                  param_and_value = pair.split('=')
         | 
| 48 | 
            -
                  @next_param = param_and_value[0]
         | 
| 49 | 
            -
                  @next_increment = param_and_value[1].to_i
         | 
| 50 | 
            -
                end
         | 
| 51 | 
            -
                
         | 
| 52 | 
            -
                def find_difference_index(s1,s2)
         | 
| 53 | 
            -
                  cmp = s2.scan(/./).zip(s1.scan(/./))    
         | 
| 54 | 
            -
                  i = 0
         | 
| 55 | 
            -
                  loop do
         | 
| 56 | 
            -
                    return i if cmp[i][0] != cmp[i][1]
         | 
| 57 | 
            -
                    i+=1
         | 
| 58 | 
            -
                  end
         | 
| 59 | 
            -
                end
         | 
| 60 | 
            -
             | 
| 61 | 
            -
                def string_diff(s1,s2)
         | 
| 62 | 
            -
                  s2[find_difference_index(s1, s2)..s2.size-find_difference_index(s1.reverse, s2.reverse)-1]
         | 
| 63 | 
            -
                end #end of method string_diff
         | 
| 64 | 
            -
              end #end of class URIBuilder
         | 
| 65 | 
            -
            end #end of module Scrubyt 
         | 
| 66 | 
            -
             | 
| 67 | 
            -
             |