rdig 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +14 -0
- data/doc/examples/config.rb +43 -7
- data/lib/rdig.rb +35 -20
- data/lib/rdig/content_extractors.rb +75 -37
- data/lib/rdig/crawler.rb +18 -91
- data/lib/rdig/documents.rb +133 -0
- data/lib/rdig/file.rb +18 -0
- data/lib/rdig/index.rb +6 -4
- data/lib/rdig/url_filters.rb +42 -9
- data/test/fixtures/pdf/simple.pdf +0 -0
- data/test/unit/crawler_fs_test.rb +32 -0
- data/test/unit/file_document_test.rb +34 -0
- data/test/unit/html_content_extractor_test.rb +14 -24
- data/test/unit/pdf_content_extractor_test.rb +3 -3
- data/test/unit/url_filters_test.rb +38 -38
- data/test/unit/word_content_extractor_test.rb +1 -1
- metadata +8 -4
- data/lib/rdig/http_client.rb +0 -22
    
        data/CHANGES
    CHANGED
    
    | @@ -1,3 +1,17 @@ | |
| 1 | 
            +
            0.3.0
         | 
| 2 | 
            +
            - file system crawling
         | 
| 3 | 
            +
            - optional url rewriting before indexing, e.g. for linking to results 
         | 
| 4 | 
            +
              via http and building the index directly from the file system
         | 
| 5 | 
            +
            - PDF title extraction with pdfinfo
         | 
| 6 | 
            +
            - removed dependency on mkmf which doesn't seem to exist in Ruby 1.8.2
         | 
| 7 | 
            +
            - made content extractors more flexible - instances now use a given 
         | 
| 8 | 
            +
              configuration instead of the global one. This allows the 
         | 
| 9 | 
            +
              WordContentExtractor to use an HtmlContentExtractor with its own 
         | 
| 10 | 
            +
              configuration that is independent of the global config.
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            0.2.1
         | 
| 13 | 
            +
            - Bugfix release
         | 
| 14 | 
            +
             | 
| 1 15 | 
             
            0.2.0
         | 
| 2 16 | 
             
            - add pdf and Word content extraction capabilities using the tools
         | 
| 3 17 | 
             
              from the xpdf-utils and wv packages
         | 
    
        data/doc/examples/config.rb
    CHANGED
    
    | @@ -1,25 +1,36 @@ | |
| 1 1 | 
             
            RDig.configuration do |cfg|
         | 
| 2 2 |  | 
| 3 3 | 
             
              ##################################################################
         | 
| 4 | 
            -
              # options you should  | 
| 4 | 
            +
              # options you really should set
         | 
| 5 5 |  | 
| 6 6 | 
             
              # provide one or more URLs for the crawler to start from
         | 
| 7 7 | 
             
              cfg.crawler.start_urls = [ 'http://www.example.com/' ]
         | 
| 8 8 |  | 
| 9 | 
            +
              # use something like this for crawling a file system:
         | 
| 10 | 
            +
              # cfg.crawler.start_urls = [ 'file:///home/bob/documents/' ]
         | 
| 11 | 
            +
              # beware, mixing file and http crawling is not possible and might result in
         | 
| 12 | 
            +
              # unpredictable results.
         | 
| 13 | 
            +
             | 
| 9 14 | 
             
              # limit the crawl to these hosts. The crawler will never
         | 
| 10 15 | 
             
              # follow any links pointing to hosts other than those given here.
         | 
| 16 | 
            +
              # ignored for file system crawling
         | 
| 11 17 | 
             
              cfg.crawler.include_hosts = [ 'www.example.com' ]
         | 
| 12 18 |  | 
| 13 19 | 
             
              # this is the path where the index will be stored
         | 
| 14 20 | 
             
              # caution, existing contents of this directory will be deleted!
         | 
| 15 | 
            -
              cfg. | 
| 21 | 
            +
              cfg.indexer.path        = '/path/to/index'
         | 
| 16 22 |  | 
| 17 23 | 
             
              ##################################################################
         | 
| 18 24 | 
             
              # options you might want to set, the given values are the defaults
         | 
| 25 | 
            +
             | 
| 26 | 
            +
              # set to true to get stack traces on errors
         | 
| 27 | 
            +
              # cfg.verbose = false
         | 
| 19 28 |  | 
| 20 29 | 
             
              # content extraction options
         | 
| 21 30 |  | 
| 22 | 
            -
              # provide a method that  | 
| 31 | 
            +
              # provide a method that returns the title of an html document
         | 
| 32 | 
            +
              # this method may either return a tag to extract the title from, 
         | 
| 33 | 
            +
              # or a ready-to-index string.
         | 
| 23 34 | 
             
              # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
         | 
| 24 35 |  | 
| 25 36 | 
             
              # provide a method that selects the tag containing the page content you 
         | 
| @@ -29,8 +40,12 @@ RDig.configuration do |cfg| | |
| 29 40 |  | 
| 30 41 | 
             
              # crawler options
         | 
| 31 42 |  | 
| 32 | 
            -
              #  | 
| 33 | 
            -
              #  | 
| 43 | 
            +
              # Notice: for file system crawling the include/exclude_document patterns are 
         | 
| 44 | 
            +
              # applied to the full path of _files_ only (like /home/bob/test.pdf), 
         | 
| 45 | 
            +
              # for http to full URIs (like http://example.com/index.html).
         | 
| 46 | 
            +
              
         | 
| 47 | 
            +
              # nil (include all documents) or an array of Regexps 
         | 
| 48 | 
            +
              # matching the URLs you want to index.
         | 
| 34 49 | 
             
              # cfg.crawler.include_documents = nil
         | 
| 35 50 |  | 
| 36 51 | 
             
              # nil (no documents excluded) or an array of Regexps 
         | 
| @@ -40,14 +55,35 @@ RDig.configuration do |cfg| | |
| 40 55 | 
             
              # included by the inclusion patterns.
         | 
| 41 56 | 
             
              # cfg.crawler.exclude_documents = nil
         | 
| 42 57 |  | 
| 43 | 
            -
              # number of  | 
| 58 | 
            +
              # number of document fetching threads to use. Should be raised only if 
         | 
| 59 | 
            +
              # your CPU has idle time when indexing.
         | 
| 44 60 | 
             
              # cfg.crawler.num_threads = 2
         | 
| 61 | 
            +
              # suggested setting for file system crawling:
         | 
| 62 | 
            +
              # cfg.crawler.num_threads = 1
         | 
| 45 63 |  | 
| 46 64 | 
             
              # maximum number of http redirections to follow
         | 
| 47 65 | 
             
              # cfg.crawler.max_redirects = 5
         | 
| 48 66 |  | 
| 49 67 | 
             
              # number of seconds to wait with an empty url queue before 
         | 
| 50 | 
            -
              # finishing the crawl. Set to a higher number  | 
| 68 | 
            +
              # finishing the crawl. Set to a higher number when experiencing incomplete
         | 
| 69 | 
            +
              # crawls on slow sites. Don't set to 0, even when crawling a local fs.
         | 
| 51 70 | 
             
              # cfg.crawler.wait_before_leave = 10
         | 
| 71 | 
            +
             | 
| 72 | 
            +
              # indexer options
         | 
| 73 | 
            +
             | 
| 74 | 
            +
              # create a new index on each run. Will append to the index if false. Use false when
         | 
| 75 | 
            +
              # building a single index from multiple runs, e.g. one across a website and the
         | 
| 76 | 
            +
              # other a tree in a local file system
         | 
| 77 | 
            +
              # cfg.index.create = true
         | 
| 78 | 
            +
             | 
| 79 | 
            +
              # rewrite document uris before indexing them. This is useful if you're
         | 
| 80 | 
            +
              # indexing on disk, but the documents should be accessible via http, e.g. from 
         | 
| 81 | 
            +
              # a web based search application. By default, no rewriting takes place.
         | 
| 82 | 
            +
              # example:
         | 
| 83 | 
            +
              # cfg.index.rewrite_uri = lambda { |uri| 
         | 
| 84 | 
            +
              #   uri.path.gsub!(/^\/base\//, '/virtual_dir/')
         | 
| 85 | 
            +
              #   uri.scheme = 'http'
         | 
| 86 | 
            +
              #   uri.host = 'www.mydomain.com'
         | 
| 87 | 
            +
              # }
         | 
| 52 88 |  | 
| 53 89 | 
             
            end
         | 
    
        data/lib/rdig.rb
    CHANGED
    
    | @@ -24,7 +24,7 @@ | |
| 24 24 | 
             
            #++
         | 
| 25 25 | 
             
            #
         | 
| 26 26 |  | 
| 27 | 
            -
            RDIGVERSION = '0. | 
| 27 | 
            +
            RDIGVERSION = '0.3.0'
         | 
| 28 28 |  | 
| 29 29 |  | 
| 30 30 | 
             
            require 'thread'
         | 
| @@ -38,28 +38,28 @@ require 'set' | |
| 38 38 | 
             
            require 'net/http'
         | 
| 39 39 | 
             
            require 'getoptlong'
         | 
| 40 40 | 
             
            require 'tempfile'
         | 
| 41 | 
            -
             | 
| 42 | 
            -
            # programs:
         | 
| 43 | 
            -
            require 'mkmf'      
         | 
| 41 | 
            +
            require 'open-uri'
         | 
| 44 42 |  | 
| 45 43 | 
             
            begin
         | 
| 46 | 
            -
              require 'rubyful_soup'
         | 
| 47 44 | 
             
              require 'ferret'
         | 
| 45 | 
            +
              require 'rubyful_soup'
         | 
| 48 46 | 
             
            rescue LoadError
         | 
| 49 47 | 
             
              require 'rubygems'
         | 
| 50 | 
            -
              require 'rubyful_soup'
         | 
| 51 48 | 
             
              require 'ferret'
         | 
| 49 | 
            +
              require 'rubyful_soup'
         | 
| 52 50 | 
             
            end
         | 
| 53 51 |  | 
| 54 52 | 
             
            require 'htmlentities/htmlentities'
         | 
| 55 | 
            -
             | 
| 56 | 
            -
            require 'rdig/http_client'
         | 
| 53 | 
            +
                
         | 
| 57 54 | 
             
            require 'rdig/content_extractors'
         | 
| 58 55 | 
             
            require 'rdig/url_filters'
         | 
| 59 56 | 
             
            require 'rdig/search'
         | 
| 60 57 | 
             
            require 'rdig/index'
         | 
| 58 | 
            +
            require 'rdig/file'
         | 
| 59 | 
            +
            require 'rdig/documents'
         | 
| 61 60 | 
             
            require 'rdig/crawler'
         | 
| 62 61 |  | 
| 62 | 
            +
             | 
| 63 63 | 
             
            $KCODE = 'u'
         | 
| 64 64 | 
             
            require 'jcode'
         | 
| 65 65 |  | 
| @@ -68,17 +68,30 @@ module RDig | |
| 68 68 |  | 
| 69 69 | 
             
              class << self
         | 
| 70 70 |  | 
| 71 | 
            -
                # the filter  | 
| 71 | 
            +
                # the filter chains are for limiting the set of indexed documents.
         | 
| 72 | 
            +
                # there are two chain types - one for http, and one for file system
         | 
| 73 | 
            +
                # crawling.
         | 
| 74 | 
            +
                # a document has to survive all filters in the chain to get indexed.
         | 
| 72 75 | 
             
                def filter_chain
         | 
| 73 | 
            -
                  @filter_chain ||=  | 
| 74 | 
            -
                     | 
| 75 | 
            -
                    : | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 76 | 
            +
                  @filter_chain ||= {
         | 
| 77 | 
            +
                    # filter chain for http crawling
         | 
| 78 | 
            +
                    :http => [
         | 
| 79 | 
            +
                      :scheme_filter_http,
         | 
| 80 | 
            +
                      :fix_relative_uri,
         | 
| 81 | 
            +
                      :normalize_uri,
         | 
| 82 | 
            +
                      { :hostname_filter => :include_hosts },
         | 
| 83 | 
            +
                      { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
         | 
| 84 | 
            +
                      { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
         | 
| 85 | 
            +
                      RDig::UrlFilters::VisitedUrlFilter 
         | 
| 86 | 
            +
                    ],
         | 
| 87 | 
            +
                    # filter chain for file system crawling
         | 
| 88 | 
            +
                    :file => [
         | 
| 89 | 
            +
                      :scheme_filter_file,
         | 
| 90 | 
            +
                      { RDig::UrlFilters::PathInclusionFilter => :include_documents },
         | 
| 91 | 
            +
                      { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
         | 
| 92 | 
            +
                    ]
         | 
| 93 | 
            +
                  }
         | 
| 94 | 
            +
                     
         | 
| 82 95 | 
             
                end
         | 
| 83 96 |  | 
| 84 97 | 
             
                def application
         | 
| @@ -86,7 +99,7 @@ module RDig | |
| 86 99 | 
             
                end
         | 
| 87 100 |  | 
| 88 101 | 
             
                def searcher
         | 
| 89 | 
            -
                  @searcher ||= Search::Searcher.new(config. | 
| 102 | 
            +
                  @searcher ||= Search::Searcher.new(config.index)
         | 
| 90 103 | 
             
                end
         | 
| 91 104 |  | 
| 92 105 | 
             
                # RDig configuration
         | 
| @@ -124,7 +137,7 @@ module RDig | |
| 124 137 | 
             
                          }
         | 
| 125 138 | 
             
                        )
         | 
| 126 139 | 
             
                      ),
         | 
| 127 | 
            -
                      : | 
| 140 | 
            +
                      :index                 => OpenStruct.new( 
         | 
| 128 141 | 
             
                        :path                => "index/", 
         | 
| 129 142 | 
             
                        :create              => true,
         | 
| 130 143 | 
             
                        :handle_parse_errors => true,
         | 
| @@ -224,6 +237,8 @@ module RDig | |
| 224 237 |  | 
| 225 238 | 
             
                  end    
         | 
| 226 239 |  | 
| 240 | 
            +
                  puts "using Ferret #{Ferret::VERSION}"
         | 
| 241 | 
            +
             | 
| 227 242 | 
             
                  if options.query
         | 
| 228 243 | 
             
                    # query the index
         | 
| 229 244 | 
             
                    puts "executing query >#{options.query}<"
         | 
| @@ -54,7 +54,9 @@ module RDig | |
| 54 54 |  | 
| 55 55 | 
             
                  def self.extractors; @@extractors ||= [] end
         | 
| 56 56 | 
             
                  def self.extractor_instances
         | 
| 57 | 
            -
                    @@extractor_instances ||= extractors.map { |ex_class|  | 
| 57 | 
            +
                    @@extractor_instances ||= extractors.map { |ex_class| 
         | 
| 58 | 
            +
                      ex_class.new(RDig.configuration.content_extraction) 
         | 
| 59 | 
            +
                    }
         | 
| 58 60 | 
             
                  end
         | 
| 59 61 |  | 
| 60 62 | 
             
                  def self.process(content, content_type)
         | 
| @@ -65,6 +67,10 @@ module RDig | |
| 65 67 | 
             
                    nil
         | 
| 66 68 | 
             
                  end
         | 
| 67 69 |  | 
| 70 | 
            +
                  def initialize(config)
         | 
| 71 | 
            +
                    @config = config
         | 
| 72 | 
            +
                  end
         | 
| 73 | 
            +
             | 
| 68 74 | 
             
                  def can_do(content_type)
         | 
| 69 75 | 
             
                    content_type =~ @pattern
         | 
| 70 76 | 
             
                  end
         | 
| @@ -91,60 +97,88 @@ module RDig | |
| 91 97 | 
             
                    file.delete
         | 
| 92 98 | 
             
                  end
         | 
| 93 99 |  | 
| 94 | 
            -
                   | 
| 95 | 
            -
             | 
| 96 | 
            -
                      @available = !find_executable(@executable).nil?
         | 
| 97 | 
            -
                    end
         | 
| 98 | 
            -
                    @available
         | 
| 99 | 
            -
                  end
         | 
| 100 | 
            -
             | 
| 100 | 
            +
                  # setting @available according to presence of external executables
         | 
| 101 | 
            +
                  # in initializer of ContentExtractor is needed to make this work
         | 
| 101 102 | 
             
                  def can_do(content_type)
         | 
| 102 | 
            -
                    available and super(content_type)
         | 
| 103 | 
            +
                    @available and super(content_type)
         | 
| 103 104 | 
             
                  end
         | 
| 104 105 | 
             
                end
         | 
| 105 106 |  | 
| 106 107 | 
             
                # Extract text from pdf content.
         | 
| 107 108 | 
             
                #
         | 
| 108 | 
            -
                # Requires the pdftotext  | 
| 109 | 
            +
                # Requires the pdftotext and pdfinfo utilities from the 
         | 
| 110 | 
            +
                # xpdf-utils package
         | 
| 109 111 | 
             
                # (on debian and friends do 'apt-get install xpdf-utils')
         | 
| 110 112 | 
             
                #
         | 
| 111 | 
            -
                # TODO: use pdfinfo to get title from document
         | 
| 112 113 | 
             
                class PdfContentExtractor < ContentExtractor
         | 
| 113 114 | 
             
                  include ExternalAppHelper
         | 
| 114 115 |  | 
| 115 | 
            -
                  def initialize
         | 
| 116 | 
            -
                     | 
| 116 | 
            +
                  def initialize(config)
         | 
| 117 | 
            +
                    super(config)
         | 
| 117 118 | 
             
                    @pattern = /^application\/pdf/
         | 
| 119 | 
            +
                    @pdftotext = 'pdftotext'
         | 
| 120 | 
            +
                    @pdfinfo = 'pdfinfo'
         | 
| 121 | 
            +
                    @available = true
         | 
| 122 | 
            +
                    [ @pdftotext, @pdfinfo].each { |program|
         | 
| 123 | 
            +
                      unless %x{#{program} -h 2>&1} =~ /Copyright 1996/ 
         | 
| 124 | 
            +
                        @available = false 
         | 
| 125 | 
            +
                        break
         | 
| 126 | 
            +
                      end
         | 
| 127 | 
            +
                    }
         | 
| 118 128 | 
             
                  end
         | 
| 119 | 
            -
             | 
| 129 | 
            +
             
         | 
| 130 | 
            +
                  def process(content)
         | 
| 131 | 
            +
                    result = {}
         | 
| 132 | 
            +
                    as_file(content) do |file|
         | 
| 133 | 
            +
                      result[:content] = get_content(file.path).strip
         | 
| 134 | 
            +
                      result[:title] = get_title(file.path)
         | 
| 135 | 
            +
                    end
         | 
| 136 | 
            +
                    result
         | 
| 137 | 
            +
                  end
         | 
| 138 | 
            +
             | 
| 120 139 | 
             
                  def get_content(path_to_tempfile)
         | 
| 121 | 
            -
                    %x{#{@ | 
| 140 | 
            +
                    %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
         | 
| 141 | 
            +
                  end
         | 
| 142 | 
            +
                  
         | 
| 143 | 
            +
                  # extracts the title from pdf meta data
         | 
| 144 | 
            +
                  # needs pdfinfo
         | 
| 145 | 
            +
                  # returns the title or nil if no title was found
         | 
| 146 | 
            +
                  def get_title(path_to_tempfile)
         | 
| 147 | 
            +
                    %x{#{@pdfinfo} -enc UTF-8 '#{path_to_tempfile}'} =~ /title:\s+(.*)$/i ? $1.strip : nil
         | 
| 148 | 
            +
                  rescue
         | 
| 122 149 | 
             
                  end
         | 
| 123 150 | 
             
                end
         | 
| 124 151 |  | 
| 125 152 | 
             
                # Extract text from word documents
         | 
| 126 153 | 
             
                #
         | 
| 127 | 
            -
                # Requires the  | 
| 128 | 
            -
                # (on debian and friends do 'apt-get install  | 
| 154 | 
            +
                # Requires the wvHtml utility
         | 
| 155 | 
            +
                # (on debian and friends do 'apt-get install wv')
         | 
| 129 156 | 
             
                class WordContentExtractor < ContentExtractor
         | 
| 130 157 | 
             
                  include ExternalAppHelper
         | 
| 131 158 |  | 
| 132 | 
            -
                  def initialize
         | 
| 133 | 
            -
                     | 
| 159 | 
            +
                  def initialize(config)
         | 
| 160 | 
            +
                    super(config)
         | 
| 161 | 
            +
                    @wvhtml = 'wvHtml'
         | 
| 134 162 | 
             
                    @pattern = /^application\/msword/
         | 
| 135 | 
            -
                     | 
| 163 | 
            +
                    # html extractor for parsing wvHtml output
         | 
| 164 | 
            +
                    @html_extractor = HtmlContentExtractor.new(OpenStruct.new(
         | 
| 165 | 
            +
                        :html => OpenStruct.new(
         | 
| 166 | 
            +
                          :content_tag_selector => lambda { |tagsoup|
         | 
| 167 | 
            +
                            tagsoup.html.body
         | 
| 168 | 
            +
                          },
         | 
| 169 | 
            +
                          :title_tag_selector         => lambda { |tagsoup|
         | 
| 170 | 
            +
                            tagsoup.html.head.title
         | 
| 171 | 
            +
                          }
         | 
| 172 | 
            +
                        )))
         | 
| 173 | 
            +
             | 
| 174 | 
            +
                    # TODO: better: if $?.exitstatus == 127 (not found)
         | 
| 175 | 
            +
                    @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
         | 
| 136 176 | 
             
                  end
         | 
| 137 177 |  | 
| 138 178 | 
             
                  def process(content)
         | 
| 139 179 | 
             
                    result = {}
         | 
| 140 | 
            -
                    as_file(content) do | | 
| 141 | 
            -
                       | 
| 142 | 
            -
                      outfile.close
         | 
| 143 | 
            -
                      %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
         | 
| 144 | 
            -
                      File.open(outfile.path) do |html|
         | 
| 145 | 
            -
                        result = @html_extractor.process(html.read)
         | 
| 146 | 
            -
                      end
         | 
| 147 | 
            -
                      outfile.delete
         | 
| 180 | 
            +
                    as_file(content) do |file|  
         | 
| 181 | 
            +
                      result = @html_extractor.process(%x{#{@wvhtml} --charset=UTF-8 '#{file.path}' -})
         | 
| 148 182 | 
             
                    end
         | 
| 149 183 | 
             
                    return result || {}
         | 
| 150 184 | 
             
                  end
         | 
| @@ -154,7 +188,8 @@ module RDig | |
| 154 188 | 
             
                # extracts title, content and links from html documents
         | 
| 155 189 | 
             
                class HtmlContentExtractor < ContentExtractor
         | 
| 156 190 |  | 
| 157 | 
            -
                  def initialize
         | 
| 191 | 
            +
                  def initialize(config)
         | 
| 192 | 
            +
                    super(config)
         | 
| 158 193 | 
             
                    @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
         | 
| 159 194 | 
             
                  end
         | 
| 160 195 |  | 
| @@ -181,9 +216,10 @@ module RDig | |
| 181 216 | 
             
                  # children.
         | 
| 182 217 | 
             
                  def extract_content(tag_soup)
         | 
| 183 218 | 
             
                    content = ''
         | 
| 184 | 
            -
                    content_element(tag_soup) | 
| 219 | 
            +
                    ce = content_element(tag_soup)
         | 
| 220 | 
            +
                    ce.children { |child| 
         | 
| 185 221 | 
             
                      extract_text(child, content)
         | 
| 186 | 
            -
                    }
         | 
| 222 | 
            +
                    } unless ce.nil?
         | 
| 187 223 | 
             
                    return content.strip
         | 
| 188 224 | 
             
                  end
         | 
| 189 225 |  | 
| @@ -197,18 +233,20 @@ module RDig | |
| 197 233 |  | 
| 198 234 | 
             
                  # Extracts the title from the given html tree
         | 
| 199 235 | 
             
                  def extract_title(tagsoup)
         | 
| 200 | 
            -
                    title = ''
         | 
| 201 236 | 
             
                    the_title_tag = title_tag(tagsoup)
         | 
| 202 237 | 
             
                    if the_title_tag.is_a? String
         | 
| 203 238 | 
             
                      the_title_tag
         | 
| 204 239 | 
             
                    else
         | 
| 205 | 
            -
                       | 
| 240 | 
            +
                      title = ''
         | 
| 241 | 
            +
                      extract_text(the_title_tag, title)
         | 
| 242 | 
            +
                      title.strip
         | 
| 206 243 | 
             
                    end
         | 
| 207 244 | 
             
                  end
         | 
| 208 245 |  | 
| 209 246 | 
             
                  # Recursively extracts all text contained in the given element, 
         | 
| 210 247 | 
             
                  # and appends it to content.
         | 
| 211 248 | 
             
                  def extract_text(element, content='')
         | 
| 249 | 
            +
                    return nil if element.nil?
         | 
| 212 250 | 
             
                    if element.is_a? NavigableString
         | 
| 213 251 | 
             
                      value = strip_comments(element)
         | 
| 214 252 | 
             
                      value.strip!
         | 
| @@ -234,8 +272,8 @@ module RDig | |
| 234 272 | 
             
                  # This may return a string, e.g. an attribute value selected from a meta
         | 
| 235 273 | 
             
                  # tag, too.
         | 
| 236 274 | 
             
                  def title_tag(tagsoup)
         | 
| 237 | 
            -
                    if  | 
| 238 | 
            -
                       | 
| 275 | 
            +
                    if @config.html.title_tag_selector
         | 
| 276 | 
            +
                      @config.html.title_tag_selector.call(tagsoup)
         | 
| 239 277 | 
             
                    else 
         | 
| 240 278 | 
             
                      tagsoup.html.head.title
         | 
| 241 279 | 
             
                    end
         | 
| @@ -243,8 +281,8 @@ module RDig | |
| 243 281 |  | 
| 244 282 | 
             
                  # Retrieve the root element to extract document content from
         | 
| 245 283 | 
             
                  def content_element(tagsoup)
         | 
| 246 | 
            -
                    if  | 
| 247 | 
            -
                       | 
| 284 | 
            +
                    if @config.html.content_tag_selector
         | 
| 285 | 
            +
                      @config.html.content_tag_selector.call(tagsoup)
         | 
| 248 286 | 
             
                    else
         | 
| 249 287 | 
             
                      tagsoup.html.body
         | 
| 250 288 | 
             
                    end
         | 
    
        data/lib/rdig/crawler.rb
    CHANGED
    
    | @@ -9,30 +9,28 @@ module RDig | |
| 9 9 | 
             
                end
         | 
| 10 10 |  | 
| 11 11 | 
             
                def run
         | 
| 12 | 
            -
                   | 
| 13 | 
            -
                   | 
| 12 | 
            +
                  raise 'no start urls given!' if RDig.config.crawler.start_urls.empty?
         | 
| 13 | 
            +
                  @indexer = Index::Indexer.new(RDig.config.index)
         | 
| 14 | 
            +
                  
         | 
| 15 | 
            +
                  # check whether we are indexing on-disk or via http
         | 
| 16 | 
            +
                  url_type = RDig.config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
         | 
| 17 | 
            +
                  chain_config = RDig.filter_chain[url_type]
         | 
| 18 | 
            +
                  
         | 
| 19 | 
            +
                  filterchain = UrlFilters::FilterChain.new(chain_config)
         | 
| 14 20 | 
             
                  RDig.config.crawler.start_urls.each { |url| add_url(url, filterchain) }
         | 
| 15 21 |  | 
| 16 22 | 
             
                  num_threads = RDig.config.crawler.num_threads
         | 
| 17 23 | 
             
                  group = ThreadsWait.new
         | 
| 18 24 | 
             
                  num_threads.times { |i|
         | 
| 19 25 | 
             
                    group.join_nowait Thread.new("fetcher #{i}") {
         | 
| 20 | 
            -
                      filterchain = UrlFilters::FilterChain.new( | 
| 26 | 
            +
                      filterchain = UrlFilters::FilterChain.new(chain_config)
         | 
| 21 27 | 
             
                      while (doc = @documents.pop) != :exit
         | 
| 22 28 | 
             
                        process_document doc, filterchain
         | 
| 23 29 | 
             
                      end
         | 
| 24 30 | 
             
                    }
         | 
| 25 31 | 
             
                  }
         | 
| 26 32 |  | 
| 27 | 
            -
                  #  | 
| 28 | 
            -
                  # t1 pops the start url from the queue which now is empty
         | 
| 29 | 
            -
                  # as the queue is empty now, t2 blocks until t1 adds the links 
         | 
| 30 | 
            -
                  # retrieved from his document.
         | 
| 31 | 
            -
                  #
         | 
| 32 | 
            -
                  # But we need the 'queue empty' condition as a sign for us to stop
         | 
| 33 | 
            -
                  # waiting for new entries, too.
         | 
| 34 | 
            -
                  
         | 
| 35 | 
            -
                  # check every now and then for an empty queue
         | 
| 33 | 
            +
                  # check for an empty queue every now and then 
         | 
| 36 34 | 
             
                  sleep_interval = RDig.config.crawler.wait_before_leave
         | 
| 37 35 | 
             
                  begin 
         | 
| 38 36 | 
             
                    sleep sleep_interval
         | 
| @@ -54,22 +52,10 @@ module RDig | |
| 54 52 | 
             
                  } unless doc.content[:links].nil?
         | 
| 55 53 |  | 
| 56 54 | 
             
                  return unless @etag_filter.apply(doc)
         | 
| 57 | 
            -
                   | 
| 58 | 
            -
                  when :success
         | 
| 59 | 
            -
                    if doc.content
         | 
| 60 | 
            -
                      if doc.content[:links]
         | 
| 61 | 
            -
                        doc.content[:links].each { |url| add_url(url, filterchain, doc) }
         | 
| 62 | 
            -
                      end
         | 
| 63 | 
            -
                      @indexer << doc
         | 
| 64 | 
            -
                      #else
         | 
| 65 | 
            -
                      #puts "success but no content: #{doc.uri.to_s}"
         | 
| 66 | 
            -
                    end
         | 
| 67 | 
            -
                  when :redirect
         | 
| 68 | 
            -
                    # links contains the url we were redirected to
         | 
| 69 | 
            -
                    doc.content[:links].each { |url| add_url(url, filterchain, doc) }
         | 
| 70 | 
            -
                  end
         | 
| 55 | 
            +
                  @indexer << doc if doc.needs_indexing?
         | 
| 71 56 | 
             
                rescue
         | 
| 72 57 | 
             
                  puts "error processing document #{doc.uri.to_s}: #{$!}"
         | 
| 58 | 
            +
                  puts "Trace: #{$!.backtrace.join("\n")}" if RDig::config.verbose
         | 
| 73 59 | 
             
                end
         | 
| 74 60 |  | 
| 75 61 |  | 
| @@ -78,82 +64,23 @@ module RDig | |
| 78 64 | 
             
                # processing
         | 
| 79 65 | 
             
                def add_url(url, filterchain, referring_document = nil)
         | 
| 80 66 | 
             
                  return if url.nil? || url.empty?
         | 
| 81 | 
            -
                  if referring_document
         | 
| 82 | 
            -
                    doc = Document. | 
| 83 | 
            -
                    # keep redirect count
         | 
| 84 | 
            -
                    if referring_document.status == :redirect
         | 
| 85 | 
            -
                      doc.redirections = referring_document.redirections + 1
         | 
| 86 | 
            -
                    end
         | 
| 67 | 
            +
                  if referring_document and referring_document.uri.scheme =~ /^https?/i
         | 
| 68 | 
            +
                    doc = Document.create(url, referring_document.uri)
         | 
| 87 69 | 
             
                  else
         | 
| 88 | 
            -
                    doc = Document. | 
| 70 | 
            +
                    doc = Document.create(url)
         | 
| 89 71 | 
             
                  end
         | 
| 90 72 |  | 
| 91 73 | 
             
                  doc = filterchain.apply(doc)
         | 
| 92 74 |  | 
| 93 75 | 
             
                  if doc
         | 
| 94 | 
            -
                     | 
| 95 | 
            -
                    # | 
| 96 | 
            -
                    #puts "skipping url #{url}"
         | 
| 76 | 
            +
                    @documents << doc
         | 
| 77 | 
            +
                    puts "added url #{url}" if RDig::config.verbose
         | 
| 97 78 | 
             
                  end
         | 
| 98 | 
            -
                  @documents << doc if doc
         | 
| 99 79 | 
             
                end
         | 
| 100 80 |  | 
| 101 81 | 
             
              end
         | 
| 102 82 |  | 
| 103 83 |  | 
| 104 | 
            -
              class Document
         | 
| 105 | 
            -
                include HttpClient
         | 
| 106 | 
            -
             | 
| 107 | 
            -
                attr_reader :content
         | 
| 108 | 
            -
                attr_reader :content_type
         | 
| 109 | 
            -
                attr_reader :uri
         | 
| 110 | 
            -
                attr_reader :referring_uri
         | 
| 111 | 
            -
                attr_reader :status
         | 
| 112 | 
            -
                attr_reader :etag
         | 
| 113 | 
            -
                attr_accessor :redirections
         | 
| 114 | 
            -
                
         | 
| 115 | 
            -
                # url: url of this document, may be relative to the referring doc or host.
         | 
| 116 | 
            -
                # referrer: uri of the document we retrieved this link from
         | 
| 117 | 
            -
                def initialize(url, referrer = nil)
         | 
| 118 | 
            -
                  @redirections = 0
         | 
| 119 | 
            -
                  begin
         | 
| 120 | 
            -
                    @uri = URI.parse(url)
         | 
| 121 | 
            -
                  rescue URI::InvalidURIError
         | 
| 122 | 
            -
                    raise "Cannot create document using invalid URL: #{url}"
         | 
| 123 | 
            -
                  end
         | 
| 124 | 
            -
                  @referring_uri = referrer
         | 
| 125 | 
            -
                end
         | 
| 126 | 
            -
             | 
| 127 | 
            -
                def has_content?
         | 
| 128 | 
            -
                  !self.content.nil?
         | 
| 129 | 
            -
                end
         | 
| 130 | 
            -
             | 
| 131 | 
            -
                def title; @content[:title] end
         | 
| 132 | 
            -
                def body; @content[:content] end
         | 
| 133 | 
            -
                def url; @uri.to_s end
         | 
| 134 | 
            -
             | 
| 135 | 
            -
                def fetch
         | 
| 136 | 
            -
                  puts "fetching #{@uri.to_s}"
         | 
| 137 | 
            -
                  response = do_get(@uri)
         | 
| 138 | 
            -
                  case response
         | 
| 139 | 
            -
                  when Net::HTTPSuccess
         | 
| 140 | 
            -
                    @content_type = response['content-type']
         | 
| 141 | 
            -
                    @raw_body = response.body
         | 
| 142 | 
            -
                    @etag = response['etag']
         | 
| 143 | 
            -
                    # todo externalize this (another chain ?)
         | 
| 144 | 
            -
                    @content = ContentExtractors.process(@raw_body, @content_type)
         | 
| 145 | 
            -
                    @status = :success
         | 
| 146 | 
            -
                  when Net::HTTPRedirection
         | 
| 147 | 
            -
                    @status = :redirect
         | 
| 148 | 
            -
                    @content = { :links => [ response['location'] ] }
         | 
| 149 | 
            -
                  else
         | 
| 150 | 
            -
                    puts "don't know what to do with response: #{response}"
         | 
| 151 | 
            -
                  end
         | 
| 152 | 
            -
                   
         | 
| 153 | 
            -
                end
         | 
| 154 | 
            -
             | 
| 155 | 
            -
              end
         | 
| 156 | 
            -
              
         | 
| 157 84 | 
             
              # checks fetched documents' E-Tag headers against the list of E-Tags
         | 
| 158 85 | 
             
              # of the documents already indexed.
         | 
| 159 86 | 
             
              # This is supposed to help against double-indexing documents which can 
         | 
| @@ -169,7 +96,7 @@ module RDig | |
| 169 96 | 
             
                end
         | 
| 170 97 |  | 
| 171 98 | 
             
                def apply(document)
         | 
| 172 | 
            -
                  return document unless document.etag 
         | 
| 99 | 
            +
                  return document unless (document.respond_to?(:etag) && document.etag)
         | 
| 173 100 | 
             
                  synchronize do
         | 
| 174 101 | 
             
                    @etags.add?(document.etag) ? document : nil 
         | 
| 175 102 | 
             
                  end
         | 
| @@ -0,0 +1,133 @@ | |
| 1 | 
            +
            module RDig

              #
              # Document base class.
              #
              # Holds the parsed URI of a document and, once a subclass' +fetch+ has
              # run, the content hash (keys read here: :title, :content, :links).
              #
              class Document

                attr_reader :uri
                attr_reader :content
                attr_reader :content_type

                # Factory building the Document subclass matching the given url.
                #
                # url::          url of the document, may be relative to the referrer
                # referrer_uri:: URI of the document the link was found in, or nil
                #
                # Returns an HttpDocument, a FileDocument, or nil when the url is
                # neither http(s) nor file based (or is not allowed for this referrer).
                def self.create(url, referrer_uri = nil)
                  # a referrer is a clear enough hint to create an HttpDocument
                  if referrer_uri && referrer_uri.scheme =~ /^https?$/i
                    return HttpDocument.new(:url => url, :referrer => referrer_uri)
                  end

                  case url
                  when /^https?:\/\//i
                    HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
                  when /^file:\/\//i
                    # files don't have referrers - the check for nil prevents us from being
                    # tricked into indexing local files by file:// links in the web site
                    # we index.
                    FileDocument.new(:url => url) if referrer_uri.nil?
                  end
                end

                # args[:url]: url of this document, may be relative to the referring
                #             doc or host.
                #
                # Raises a RuntimeError naming the offending url if it cannot be parsed.
                def initialize(args)
                  begin
                    @uri = URI.parse(args[:url])
                  rescue URI::InvalidURIError
                    # the url lives in args; there is no local variable named +url+ here
                    raise "Cannot create document using invalid URL: #{args[:url]}"
                  end
                end

                # Accessors into the content hash; only valid once content has been set.
                def title; @content[:title] end
                def body; @content[:content] end
                def links; @content[:links] end

                # Truthy if content is present and carries a title or a body.
                def needs_indexing?
                  has_content? && (title || body)
                end

                # true once content has been assigned (an empty hash counts as content).
                def has_content?
                  !self.content.nil?
                end

              end


              #
              # Document in a File system
              #
              class FileDocument < Document
                # args[:url]: file:// url of the file or directory
                def initialize(args={})
                  super(args)
                end

                # Lists the entries directly below +path+ as file:// urls.
                # Directories are always included, plain files only if their
                # extension is a key of File::FILE_EXTENSION_MIME_TYPES.
                def self.find_files(path)
                  # the pattern does not depend on the entry, so build it only once
                  known_type = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
                  links = []
                  Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
                    # Skip files not matching known mime types
                    if File.directory?(filename) || filename =~ known_type
                      links << "file://#{filename}"
                    end
                  end
                  links
                end

                # true if the uri's path points to a regular file
                def file?
                  File.file? @uri.path
                end

                # Fills @content: a link collection for directories, the extracted
                # content for files. Always leaves @content non-nil on return.
                def fetch
                  if File.directory? @uri.path
                    # directories are treated like a link collection
                    @content = { :links => self.class.find_files(@uri.path) }
                  else
                    # process this file's contents; File.open (rather than Kernel#open)
                    # makes sure the path is only ever treated as a file name
                    File.open(@uri.path) do |file|
                      @content_type = file.content_type
                      @content = ContentExtractors.process(file.read, @content_type)
                      @content[:links] = nil if @content # don't follow links inside files
                    end
                  end
                  @content ||= {}
                end

              end


              #
              # Remote Document to be retrieved by HTTP
              #
              class HttpDocument < Document

                attr_reader :referring_uri
                attr_reader :status
                attr_reader :etag

                # args[:url]:      url of this document, may be relative to the
                #                  referring doc or host.
                # args[:referrer]: uri of the document we retrieved this link from
                def initialize(args={})
                  super(args)
                  @referring_uri = args[:referrer]
                end

                # Retrieves the document and fills @etag, @content_type, @content and
                # @status on a 200 response. Errors are swallowed (reported only in
                # verbose mode); @content is always non-nil afterwards.
                def fetch
                  puts "fetching #{@uri.to_s}" if RDig::config.verbose
                  # open via the URI object (open-uri) instead of handing the crawled
                  # url string to Kernel#open
                  @uri.open do |doc|
                    case doc.status.first.to_i
                    when 200
                      @etag = doc.meta['etag']
                      # puts "etag: #{@etag}"
                      @content_type = doc.content_type
                      @content = ContentExtractors.process(doc.read, @content_type)
                      @status = :success
                    when 404
                      # NOTE(review): open-uri presumably raises for error statuses
                      # before we get here, in which case the rescue below reports it
                      puts "got 404 for #{@uri}"
                    else
                      puts "don't know what to do with response: #{doc.status.join(' : ')}"
                    end
                  end
                rescue
                  puts "error fetching #{@uri.to_s}: #{$!}" if RDig::config.verbose
                ensure
                  @content ||= {}
                end

              end
            end
         | 
    
        data/lib/rdig/file.rb
    ADDED
    
    | @@ -0,0 +1,18 @@ | |
| 1 | 
            +
            # Reopens the core File class to add a content_type lookup that is
            # driven by the file name's extension.
            class File

              # Known file extensions (lower case, without the leading dot) mapped
              # to their mime types.
              FILE_EXTENSION_MIME_TYPES = {
                'doc'  => 'application/msword',
                'html' => 'text/html',
                'htm'  => 'text/html',
                #'.odt'  => 'application/vnd.oasis.opendocument.text',
                'pdf'  => 'application/pdf',
                'txt'  => 'text/plain',
              }

              # Mime type for this file, looked up case-insensitively by extension;
              # 'application/octet-stream' when the extension is not listed.
              def content_type
                extension = File.extname(path).downcase.gsub(/^\./,'')
                FILE_EXTENSION_MIME_TYPES.fetch(extension, 'application/octet-stream')
              end

            end
         | 
    
        data/lib/rdig/index.rb
    CHANGED
    
    | @@ -6,7 +6,7 @@ module RDig | |
| 6 6 | 
             
                  include MonitorMixin, Ferret::Index, Ferret::Document
         | 
| 7 7 |  | 
| 8 8 | 
             
                  def initialize(settings)
         | 
| 9 | 
            -
                     | 
| 9 | 
            +
                    @config = settings
         | 
| 10 10 | 
             
                    @index_writer = IndexWriter.new(settings.path,
         | 
| 11 11 | 
             
                                                    :create   => settings.create,
         | 
| 12 12 | 
             
                                                    :analyzer => settings.analyzer)
         | 
| @@ -14,10 +14,12 @@ module RDig | |
| 14 14 | 
             
                  end
         | 
| 15 15 |  | 
| 16 16 | 
             
                  def add_to_index(document)
         | 
| 17 | 
            -
                    puts "add to index: #{document.uri.to_s}"
         | 
| 17 | 
            +
                    puts "add to index: #{document.uri.to_s}" if RDig::config.verbose
         | 
| 18 18 | 
             
                    doc = Ferret::Document::Document.new
         | 
| 19 | 
            -
                     | 
| 20 | 
            -
             | 
| 19 | 
            +
                    @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
         | 
| 20 | 
            +
                    
         | 
| 21 | 
            +
                    doc << Field.new("url", document.uri.to_s, 
         | 
| 22 | 
            +
                                    Field::Store::YES, Field::Index::TOKENIZED)
         | 
| 21 23 | 
             
                    doc << Field.new("title", document.title, 
         | 
| 22 24 | 
             
                                    Field::Store::YES, Field::Index::TOKENIZED)
         | 
| 23 25 | 
             
                    doc << Field.new("data",  document.body, 
         | 
    
        data/lib/rdig/url_filters.rb
    CHANGED
    
    | @@ -82,7 +82,7 @@ module RDig | |
| 82 82 |  | 
| 83 83 |  | 
| 84 84 | 
             
                # base class for url inclusion / exclusion filters
         | 
| 85 | 
            -
                class  | 
| 85 | 
            +
                class PatternFilter
         | 
| 86 86 | 
             
                  # takes an Array of Regexps, or nil to disable the filter
         | 
| 87 87 | 
             
                  def initialize(args=nil)
         | 
| 88 88 | 
             
                    unless args.nil?
         | 
| @@ -98,8 +98,8 @@ module RDig | |
| 98 98 | 
             
                    end
         | 
| 99 99 | 
             
                  end
         | 
| 100 100 | 
             
                end
         | 
| 101 | 
            -
                class UrlExclusionFilter <  | 
| 102 | 
            -
                  # returns nil if any of the patterns matches it's  | 
| 101 | 
            +
                class UrlExclusionFilter < PatternFilter
         | 
| 102 | 
            +
                  # returns nil if any of the patterns matches its URI,
         | 
| 103 103 | 
             
                  # the document itself otherwise
         | 
| 104 104 | 
             
                  def apply(document)
         | 
| 105 105 | 
             
                    return document unless @patterns
         | 
| @@ -109,9 +109,9 @@ module RDig | |
| 109 109 | 
             
                    return document
         | 
| 110 110 | 
             
                  end
         | 
| 111 111 | 
             
                end
         | 
| 112 | 
            -
                class UrlInclusionFilter <  | 
| 113 | 
            -
                  # returns  | 
| 114 | 
            -
                  #  | 
| 112 | 
            +
                class UrlInclusionFilter < PatternFilter
         | 
| 113 | 
            +
                  # returns the document if any of the patterns matches its URI,
         | 
| 114 | 
            +
                  # nil otherwise
         | 
| 115 115 | 
             
                  def apply(document)
         | 
| 116 116 | 
             
                    return document unless @patterns
         | 
| 117 117 | 
             
                    @patterns.each { |p|
         | 
| @@ -121,21 +121,42 @@ module RDig | |
| 121 121 | 
             
                  end
         | 
| 122 122 | 
             
                end
         | 
| 123 123 |  | 
| 124 | 
            -
             | 
| 124 | 
            +
                # Exclusion filter working on the path of a document's URI.
                # Yields nil as soon as one of the patterns matches the path and the
                # document itself otherwise. Documents are passed through unchecked
                # when no patterns are set or when document.file? is false.
                class PathExclusionFilter < PatternFilter
                  def apply(document)
                    if @patterns && document.file?
                      excluded = @patterns.any? { |pattern| document.uri.path =~ pattern }
                      return nil if excluded
                    end
                    document
                  end
                end
         | 
| 135 | 
            +
                # Inclusion filter working on the path of a document's URI.
                # Yields the document if at least one pattern matches the path and
                # nil otherwise. Documents are passed through unchecked when no
                # patterns are set or when document.file? is false.
                class PathInclusionFilter < PatternFilter
                  def apply(document)
                    applicable = @patterns && document.file?
                    return document unless applicable
                    included = @patterns.any? { |pattern| document.uri.path =~ pattern }
                    included ? document : nil
                  end
                end
         | 
| 125 146 |  | 
| 126 147 |  | 
| 127 148 | 
             
                # checks redirect count of the given document
         | 
| 128 149 | 
             
                # takes it out of the chain if number of redirections exceeds the
         | 
| 129 150 | 
             
                # max_redirects setting
         | 
| 130 151 | 
             
                def UrlFilters.maximum_redirect_filter(document, max_redirects)
         | 
| 131 | 
            -
                  return nil if document.redirections > max_redirects
         | 
| 152 | 
            +
                  return nil if document.respond_to?(:redirections) && document.redirections > max_redirects
         | 
| 132 153 | 
             
                  return document
         | 
| 133 154 | 
             
                end
         | 
| 134 155 |  | 
| 135 156 | 
             
                # expands both href="/path/xyz.html" and href="affe.html"
         | 
| 136 157 | 
             
                # to full urls
         | 
| 137 158 | 
             
                def UrlFilters.fix_relative_uri(document)
         | 
| 138 | 
            -
                  return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^ | 
| 159 | 
            +
                  #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
         | 
| 139 160 | 
             
                  ref = document.referring_uri
         | 
| 140 161 | 
             
                  return document unless ref
         | 
| 141 162 | 
             
                  uri = document.uri
         | 
| @@ -150,6 +171,9 @@ module RDig | |
| 150 171 | 
             
                    uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
         | 
| 151 172 | 
             
                  end 
         | 
| 152 173 | 
             
                  return document
         | 
| 174 | 
            +
                rescue
         | 
| 175 | 
            +
                  p document
         | 
| 176 | 
            +
                  p document.uri
         | 
| 153 177 | 
             
                end
         | 
| 154 178 |  | 
| 155 179 | 
             
                def UrlFilters.hostname_filter(document, include_hosts)
         | 
| @@ -167,5 +191,14 @@ module RDig | |
| 167 191 | 
             
                  return document
         | 
| 168 192 | 
             
                end
         | 
| 169 193 |  | 
| 194 | 
            +
                # Passes the document on if its URI has no scheme at all or the
                # scheme is 'file' (any case); answers nil for every other scheme.
                def UrlFilters.scheme_filter_file(document)
                  scheme = document.uri.scheme
                  (scheme.nil? || scheme =~ /^file$/i) ? document : nil
                end
         | 
| 198 | 
            +
                # Passes the document on if its URI has no scheme at all or the
                # scheme is 'http' or 'https' (any case); answers nil otherwise.
                def UrlFilters.scheme_filter_http(document)
                  scheme = document.uri.scheme
                  (scheme.nil? || scheme =~ /^https?$/i) ? document : nil
                end
         | 
| 202 | 
            +
             | 
| 170 203 | 
             
              end
         | 
| 171 204 | 
             
            end
         | 
| Binary file | 
| @@ -0,0 +1,32 @@ | |
| 1 | 
            +
            require 'test_helper'
         | 
| 2 | 
            +
            class CrawlerFsTest < Test::Unit::TestCase
         | 
| 3 | 
            +
              include TestHelper
         | 
| 4 | 
            +
             | 
| 5 | 
            +
              def setup
         | 
| 6 | 
            +
                @fixture_path = File.expand_path(File.join(File.dirname(__FILE__), '../fixtures/'))
         | 
| 7 | 
            +
                index_dir = 'tmp/test-index'
         | 
| 8 | 
            +
                Dir.mkdir index_dir unless File.directory? index_dir
         | 
| 9 | 
            +
                RDig.configuration do |cfg|
         | 
| 10 | 
            +
                  @old_crawler_cfg = cfg.crawler.clone
         | 
| 11 | 
            +
                  cfg.crawler.start_urls = [ "file://#{@fixture_path}" ]
         | 
| 12 | 
            +
                  cfg.crawler.num_threads = 1
         | 
| 13 | 
            +
                  cfg.crawler.wait_before_leave = 1
         | 
| 14 | 
            +
                  cfg.index.path = index_dir
         | 
| 15 | 
            +
                  cfg.verbose = true
         | 
| 16 | 
            +
                end
         | 
| 17 | 
            +
              end
         | 
| 18 | 
            +
             | 
| 19 | 
            +
              def teardown
         | 
| 20 | 
            +
                RDig.configuration do |cfg|
         | 
| 21 | 
            +
                  cfg.crawler = @old_crawler_cfg
         | 
| 22 | 
            +
                end
         | 
| 23 | 
            +
              end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
              def test_crawl
         | 
| 26 | 
            +
                crawler = Crawler.new
         | 
| 27 | 
            +
                crawler.run
         | 
| 28 | 
            +
              end
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| @@ -0,0 +1,34 @@ | |
| 1 | 
            +
            require 'test_helper'
         | 
| 2 | 
            +
            class FileDocumentTest < Test::Unit::TestCase
         | 
| 3 | 
            +
              include TestHelper
         | 
| 4 | 
            +
             | 
| 5 | 
            +
              def setup
         | 
| 6 | 
            +
                @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
         | 
| 7 | 
            +
              end
         | 
| 8 | 
            +
             | 
| 9 | 
            +
              def test_find_files
         | 
| 10 | 
            +
                links = FileDocument.find_files(@fixture_path)
         | 
| 11 | 
            +
                assert_equal 3, links.size
         | 
| 12 | 
            +
                links = FileDocument.find_files("#{@fixture_path}/html")
         | 
| 13 | 
            +
                assert_equal 3, links.size
         | 
| 14 | 
            +
              end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
              def test_fetch_directory
         | 
| 17 | 
            +
                dir = Document.create("file://#{@fixture_path}")
         | 
| 18 | 
            +
                dir.fetch
         | 
| 19 | 
            +
                assert_equal 3, dir.links.size
         | 
| 20 | 
            +
                dir = Document.create("file://#{@fixture_path}/pdf")
         | 
| 21 | 
            +
                dir.fetch
         | 
| 22 | 
            +
                assert_equal 1, dir.links.size
         | 
| 23 | 
            +
              end
         | 
| 24 | 
            +
             | 
| 25 | 
            +
              def test_fetch_content
         | 
| 26 | 
            +
                file = Document.create("file://#{@fixture_path}/pdf/simple.pdf")
         | 
| 27 | 
            +
                file.fetch
         | 
| 28 | 
            +
                assert file.needs_indexing?
         | 
| 29 | 
            +
                assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', file.body
         | 
| 30 | 
            +
              end
         | 
| 31 | 
            +
              
         | 
| 32 | 
            +
            end
         | 
| 33 | 
            +
             | 
| 34 | 
            +
             | 
| @@ -3,13 +3,9 @@ class HtmlContentExtractorTest < Test::Unit::TestCase | |
| 3 3 | 
             
              include TestHelper
         | 
| 4 4 |  | 
| 5 5 | 
             
              def setup
         | 
| 6 | 
            -
                @ | 
| 6 | 
            +
                @config = OpenStruct.new(:html => RDig.config.content_extraction.html.clone)
         | 
| 7 | 
            +
                @extractor = ContentExtractors::HtmlContentExtractor.new(@config)
         | 
| 7 8 | 
             
                @nbsp = [160].pack('U') # non breaking space
         | 
| 8 | 
            -
                @config_backup = RDig.config.content_extraction.html.clone
         | 
| 9 | 
            -
              end
         | 
| 10 | 
            -
             | 
| 11 | 
            -
              def teardown
         | 
| 12 | 
            -
                RDig.config.content_extraction.html = @config_backup
         | 
| 13 9 | 
             
              end
         | 
| 14 10 |  | 
| 15 11 | 
             
              def test_can_do
         | 
| @@ -41,13 +37,11 @@ class HtmlContentExtractorTest < Test::Unit::TestCase | |
| 41 37 | 
             
              end
         | 
| 42 38 |  | 
| 43 39 | 
             
              def test_custom_content_element
         | 
| 44 | 
            -
                 | 
| 45 | 
            -
                   | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
                   | 
| 49 | 
            -
                    tagsoup.find('div', :attrs => { 'id', 'content' })
         | 
| 50 | 
            -
                  end
         | 
| 40 | 
            +
                @config.html.title_tag_selector = lambda do |tagsoup|
         | 
| 41 | 
            +
                  tagsoup.find('h1', :attrs => { 'class', 'title' })
         | 
| 42 | 
            +
                end
         | 
| 43 | 
            +
                @config.html.content_tag_selector = lambda do |tagsoup|
         | 
| 44 | 
            +
                  tagsoup.find('div', :attrs => { 'id', 'content' })
         | 
| 51 45 | 
             
                end
         | 
| 52 46 | 
             
                result = @extractor.process(html_doc('custom_tag_selectors'))
         | 
| 53 47 | 
             
                assert_equal 'Sample Title in h1', result[:title]
         | 
| @@ -61,23 +55,19 @@ class HtmlContentExtractorTest < Test::Unit::TestCase | |
| 61 55 |  | 
| 62 56 |  | 
| 63 57 | 
             
              def test_title_from_dcmeta
         | 
| 64 | 
            -
                 | 
| 65 | 
            -
                   | 
| 66 | 
            -
                    tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
         | 
| 67 | 
            -
                  end
         | 
| 58 | 
            +
                @config.html.title_tag_selector = lambda do |tagsoup|
         | 
| 59 | 
            +
                  tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
         | 
| 68 60 | 
             
                end
         | 
| 69 61 | 
             
                result = @extractor.process(html_doc('custom_tag_selectors'))
         | 
| 70 62 | 
             
                assert_equal 'Title from DC meta data', result[:title]
         | 
| 71 63 | 
             
              end
         | 
| 72 64 |  | 
| 73 65 | 
             
              def test_preprocessed_title
         | 
| 74 | 
            -
                 | 
| 75 | 
            -
                   | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
                    title =~ /^(.*)meta data$/ ? $1.strip : title.strip
         | 
| 80 | 
            -
                  end
         | 
| 66 | 
            +
                @config.html.title_tag_selector = lambda do |tagsoup|
         | 
| 67 | 
            +
                  title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
         | 
| 68 | 
            +
                  # use only a portion of the title tag's contents if it matches our
         | 
| 69 | 
            +
                  # regexp:
         | 
| 70 | 
            +
                  title =~ /^(.*)meta data$/ ? $1.strip : title.strip
         | 
| 81 71 | 
             
                end
         | 
| 82 72 | 
             
                result = @extractor.process(html_doc('custom_tag_selectors'))
         | 
| 83 73 | 
             
                assert_equal 'Title from DC', result[:title]
         | 
| @@ -3,7 +3,7 @@ class PdfContentExtractorTest < Test::Unit::TestCase | |
| 3 3 | 
             
              include TestHelper
         | 
| 4 4 |  | 
| 5 5 | 
             
              def setup
         | 
| 6 | 
            -
                @ce = ContentExtractors::PdfContentExtractor.new
         | 
| 6 | 
            +
                @ce = ContentExtractors::PdfContentExtractor.new(RDig.configuration.content_extraction)
         | 
| 7 7 | 
             
              end
         | 
| 8 8 |  | 
| 9 9 | 
             
              def test_can_do
         | 
| @@ -23,10 +23,10 @@ class PdfContentExtractorTest < Test::Unit::TestCase | |
| 23 23 | 
             
              private
         | 
| 24 24 | 
             
              def check_content(result)
         | 
| 25 25 | 
             
                assert_not_nil result
         | 
| 26 | 
            -
                 | 
| 26 | 
            +
                assert_equal 'PDF Test', result[:title]
         | 
| 27 27 | 
             
                assert_nil result[:links]
         | 
| 28 28 | 
             
                assert_not_nil result[:content]
         | 
| 29 | 
            -
                assert_equal 'This | 
| 29 | 
            +
                assert_equal 'This is for testing PDF extraction. Some Ümläuts and a €uro. Another Paragraph.', result[:content]
         | 
| 30 30 | 
             
              end
         | 
| 31 31 |  | 
| 32 32 | 
             
            end
         | 
| @@ -13,17 +13,17 @@ class UrlFilterTest < Test::Unit::TestCase | |
| 13 13 | 
             
                ]
         | 
| 14 14 | 
             
                chain = UrlFilters::FilterChain.new(cfg)
         | 
| 15 15 |  | 
| 16 | 
            -
                assert_nil chain.apply(Document. | 
| 17 | 
            -
                assert_not_nil chain.apply(Document. | 
| 18 | 
            -
                assert_nil chain.apply(Document. | 
| 16 | 
            +
                assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
         | 
| 17 | 
            +
                assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
         | 
| 18 | 
            +
                assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
         | 
| 19 19 | 
             
              end
         | 
| 20 20 |  | 
| 21 21 | 
             
              # test default chain config
         | 
| 22 22 | 
             
              def test_default_filterchain
         | 
| 23 | 
            -
                chain = UrlFilters::FilterChain.new(RDig.filter_chain)
         | 
| 24 | 
            -
                assert_nil chain.apply(Document. | 
| 25 | 
            -
                assert_not_nil chain.apply(Document. | 
| 26 | 
            -
                assert_nil chain.apply(Document. | 
| 23 | 
            +
                chain = UrlFilters::FilterChain.new(RDig.filter_chain[:http])
         | 
| 24 | 
            +
                assert_nil chain.apply(Document.create("http://www.example.com/affe.htm"))
         | 
| 25 | 
            +
                assert_not_nil chain.apply(Document.create("http://localhost:3000/affe.html"))
         | 
| 26 | 
            +
                assert_nil chain.apply(Document.create("http://localhost.com/affe.html"))
         | 
| 27 27 | 
             
              end
         | 
| 28 28 |  | 
| 29 29 | 
             
              # check lookup of chain parameters from config
         | 
| @@ -38,59 +38,59 @@ class UrlFilterTest < Test::Unit::TestCase | |
| 38 38 | 
             
                ]
         | 
| 39 39 | 
             
                chain = UrlFilters::FilterChain.new(cfg)
         | 
| 40 40 |  | 
| 41 | 
            -
                assert_nil chain.apply(Document. | 
| 42 | 
            -
                assert_not_nil chain.apply(Document. | 
| 43 | 
            -
                assert_nil chain.apply(Document. | 
| 41 | 
            +
                assert_nil chain.apply(Document.create("http://test.host/affe.htm"))
         | 
| 42 | 
            +
                assert_not_nil chain.apply(Document.create("http://test.host/affe.html"))
         | 
| 43 | 
            +
                assert_nil chain.apply(Document.create("http://test.host.com/affe.html"))
         | 
| 44 44 | 
             
              end
         | 
| 45 45 |  | 
| 46 46 | 
             
              def test_urlpattern_filter
         | 
| 47 47 | 
             
                f = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
         | 
| 48 | 
            -
                assert_nil f.apply(Document. | 
| 49 | 
            -
                assert_not_nil f.apply(Document. | 
| 48 | 
            +
                assert_nil f.apply(Document.create("http://test.host/affe.htm"))
         | 
| 49 | 
            +
                assert_not_nil f.apply(Document.create("http://test.host/affe.html"))
         | 
| 50 50 | 
             
                f = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
         | 
| 51 | 
            -
                assert_not_nil f.apply(Document. | 
| 52 | 
            -
                assert_nil f.apply(Document. | 
| 53 | 
            -
                assert_nil f.apply(Document. | 
| 51 | 
            +
                assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
         | 
| 52 | 
            +
                assert_nil f.apply(Document.create("http://test.host/affe.html"))
         | 
| 53 | 
            +
                assert_nil f.apply(Document.create("http://test.host/affe.aspx"))
         | 
| 54 54 | 
             
                f = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
         | 
| 55 | 
            -
                assert_nil f.apply(Document. | 
| 56 | 
            -
                assert_not_nil f.apply(Document. | 
| 57 | 
            -
                assert_not_nil f.apply(Document. | 
| 58 | 
            -
                assert_not_nil f.apply(Document. | 
| 55 | 
            +
                assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
         | 
| 56 | 
            +
                assert_not_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
         | 
| 57 | 
            +
                assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
         | 
| 58 | 
            +
                assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
         | 
| 59 59 | 
             
                f = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
         | 
| 60 | 
            -
                assert_nil f.apply(Document. | 
| 61 | 
            -
                assert_nil f.apply(Document. | 
| 62 | 
            -
                assert_not_nil f.apply(Document. | 
| 63 | 
            -
                assert_not_nil f.apply(Document. | 
| 60 | 
            +
                assert_nil f.apply(Document.create("http://test.host/dir1/affe.aspx"))
         | 
| 61 | 
            +
                assert_nil f.apply(Document.create("http://test.host/dir2/dir1/affe.htm"))
         | 
| 62 | 
            +
                assert_not_nil f.apply(Document.create("http://test.host/affe.htm"))
         | 
| 63 | 
            +
                assert_not_nil f.apply(Document.create("http://test.host/dir2/affe.htm"))
         | 
| 64 64 | 
             
              end
         | 
| 65 65 |  | 
| 66 66 | 
             
              def test_hostname_filter
         | 
| 67 67 | 
             
                include_hosts = [ 'test.host', 'localhost' ]
         | 
| 68 | 
            -
                assert_nil UrlFilters.hostname_filter(Document. | 
| 69 | 
            -
                assert_not_nil UrlFilters.hostname_filter(Document. | 
| 70 | 
            -
                assert_not_nil UrlFilters.hostname_filter(Document. | 
| 68 | 
            +
                assert_nil UrlFilters.hostname_filter(Document.create('http://google.com/'), include_hosts)
         | 
| 69 | 
            +
                assert_not_nil UrlFilters.hostname_filter(Document.create('http://test.host/file.html'), include_hosts)
         | 
| 70 | 
            +
                assert_not_nil UrlFilters.hostname_filter(Document.create('http://localhost/file.html'), include_hosts)
         | 
| 71 71 | 
             
              end
         | 
| 72 72 |  | 
| 73 73 | 
             
              def test_fix_relative_uri
         | 
| 74 | 
            -
                doc = Document. | 
| 74 | 
            +
                doc = Document.create('http://test.host/dir/file.html')
         | 
| 75 75 | 
             
                assert_equal('http://test.host/dir/another.html',
         | 
| 76 | 
            -
                              UrlFilters.fix_relative_uri(Document. | 
| 76 | 
            +
                              UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
         | 
| 77 77 | 
             
                assert_equal('http://test.host/dir/../another.html',
         | 
| 78 | 
            -
                              UrlFilters.fix_relative_uri(Document. | 
| 78 | 
            +
                              UrlFilters.fix_relative_uri(Document.create('../another.html', doc.uri)).uri.to_s)
         | 
| 79 79 | 
             
                assert_equal('http://test.host/dir/another.html',
         | 
| 80 | 
            -
                              UrlFilters.fix_relative_uri(Document. | 
| 80 | 
            +
                              UrlFilters.fix_relative_uri(Document.create('/dir/another.html', doc.uri)).uri.to_s)
         | 
| 81 81 | 
             
                assert_equal('http://test.host/dir/another.html',
         | 
| 82 | 
            -
                              UrlFilters.fix_relative_uri(Document. | 
| 82 | 
            +
                              UrlFilters.fix_relative_uri(Document.create('http://test.host/dir/another.html', doc.uri)).uri.to_s)
         | 
| 83 83 | 
             
                assert_equal('HTTP://test.host/dir/another.html',
         | 
| 84 | 
            -
                              UrlFilters.fix_relative_uri(Document. | 
| 85 | 
            -
                doc = Document. | 
| 84 | 
            +
                              UrlFilters.fix_relative_uri(Document.create('HTTP://test.host/dir/another.html', doc.uri)).uri.to_s)
         | 
| 85 | 
            +
                doc = Document.create('https://test.host/dir/')
         | 
| 86 86 | 
             
                assert_equal('https://test.host/dir/another.html',
         | 
| 87 | 
            -
                              UrlFilters.fix_relative_uri(Document. | 
| 88 | 
            -
                doc = Document. | 
| 87 | 
            +
                              UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
         | 
| 88 | 
            +
                doc = Document.create('https://test.host/')
         | 
| 89 89 | 
             
                assert_equal('https://test.host/another.html',
         | 
| 90 | 
            -
                              UrlFilters.fix_relative_uri(Document. | 
| 91 | 
            -
                doc = Document. | 
| 90 | 
            +
                              UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
         | 
| 91 | 
            +
                doc = Document.create('https://test.host')
         | 
| 92 92 | 
             
                assert_equal('https://test.host/another.html',
         | 
| 93 | 
            -
                              UrlFilters.fix_relative_uri(Document. | 
| 93 | 
            +
                              UrlFilters.fix_relative_uri(Document.create('another.html', doc.uri)).uri.to_s)
         | 
| 94 94 | 
             
              end
         | 
| 95 95 | 
             
            end
         | 
| 96 96 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,10 +1,10 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 | 
            -
            rubygems_version: 0.8.11
         | 
| 2 | 
            +
            rubygems_version: 0.8.11.15
         | 
| 3 3 | 
             
            specification_version: 1
         | 
| 4 4 | 
             
            name: rdig
         | 
| 5 5 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 6 | 
            -
              version: 0. | 
| 7 | 
            -
            date: 2006-04- | 
| 6 | 
            +
              version: 0.3.0
         | 
| 7 | 
            +
            date: 2006-04-26 00:00:00 +02:00
         | 
| 8 8 | 
             
            summary: Ruby based web site indexing and searching library.
         | 
| 9 9 | 
             
            require_paths: 
         | 
| 10 10 | 
             
            - lib
         | 
| @@ -25,6 +25,7 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement | |
| 25 25 | 
             
            platform: ruby
         | 
| 26 26 | 
             
            signing_key: 
         | 
| 27 27 | 
             
            cert_chain: 
         | 
| 28 | 
            +
            post_install_message: 
         | 
| 28 29 | 
             
            authors: 
         | 
| 29 30 | 
             
            - Jens Kraemer
         | 
| 30 31 | 
             
            files: 
         | 
| @@ -32,13 +33,14 @@ files: | |
| 32 33 | 
             
            - lib/rdig
         | 
| 33 34 | 
             
            - lib/htmlentities
         | 
| 34 35 | 
             
            - lib/rdig.rb
         | 
| 35 | 
            -
            - lib/rdig/http_client.rb
         | 
| 36 36 | 
             
            - lib/rdig/crawler.rb
         | 
| 37 37 | 
             
            - lib/rdig/search.rb
         | 
| 38 38 | 
             
            - lib/rdig/highlight.rb
         | 
| 39 39 | 
             
            - lib/rdig/index.rb
         | 
| 40 40 | 
             
            - lib/rdig/url_filters.rb
         | 
| 41 41 | 
             
            - lib/rdig/content_extractors.rb
         | 
| 42 | 
            +
            - lib/rdig/documents.rb
         | 
| 43 | 
            +
            - lib/rdig/file.rb
         | 
| 42 44 | 
             
            - lib/htmlentities/CHANGES
         | 
| 43 45 | 
             
            - lib/htmlentities/COPYING
         | 
| 44 46 | 
             
            - lib/htmlentities/README
         | 
| @@ -51,6 +53,8 @@ files: | |
| 51 53 | 
             
            - test/unit/html_content_extractor_test.rb
         | 
| 52 54 | 
             
            - test/unit/pdf_content_extractor_test.rb
         | 
| 53 55 | 
             
            - test/unit/word_content_extractor_test.rb
         | 
| 56 | 
            +
            - test/unit/file_document_test.rb
         | 
| 57 | 
            +
            - test/unit/crawler_fs_test.rb
         | 
| 54 58 | 
             
            - test/fixtures/html
         | 
| 55 59 | 
             
            - test/fixtures/pdf
         | 
| 56 60 | 
             
            - test/fixtures/word
         | 
    
        data/lib/rdig/http_client.rb
    DELETED
    
    | @@ -1,22 +0,0 @@ | |
| 1 | 
            -
            module RDig
         | 
| 2 | 
            -
              
         | 
| 3 | 
            -
              module HttpClient
         | 
| 4 | 
            -
                def do_get(uri, user_agent='RDig crawler')
         | 
| 5 | 
            -
                  # Set up the appropriate http headers
         | 
| 6 | 
            -
                  headers = { "User-Agent" => user_agent }
         | 
| 7 | 
            -
                  result = {}
         | 
| 8 | 
            -
              
         | 
| 9 | 
            -
                  begin
         | 
| 10 | 
            -
                    Net::HTTP.start(uri.host, (uri.port or 80)) { |http|
         | 
| 11 | 
            -
                      final_uri = uri.path 
         | 
| 12 | 
            -
                      final_uri += ('?' + uri.query) if uri.query
         | 
| 13 | 
            -
                      return http.get(final_uri, headers)
         | 
| 14 | 
            -
                    }
         | 
| 15 | 
            -
                  rescue => error
         | 
| 16 | 
            -
                    puts error
         | 
| 17 | 
            -
                  end
         | 
| 18 | 
            -
                end
         | 
| 19 | 
            -
              end
         | 
| 20 | 
            -
             | 
| 21 | 
            -
            end
         | 
| 22 | 
            -
             |