rdig 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +8 -0
- data/doc/examples/config.rb +25 -25
- data/lib/rdig.rb +42 -17
- data/lib/rdig/content_extractors.rb +3 -3
- data/lib/rdig/content_extractors/hpricot.rb +7 -7
- data/lib/rdig/crawler.rb +31 -19
- data/lib/rdig/documents.rb +32 -25
- data/lib/rdig/index.rb +1 -1
- data/lib/rdig/search.rb +1 -1
- data/lib/rdig/url_filters.rb +14 -2
- data/rakefile +7 -9
- data/test/unit/http_document_test.rb +17 -0
- data/test/unit/rdig_test.rb +38 -0
- data/test/unit/searcher_test.rb +2 -0
- data/test/unit/url_filters_test.rb +9 -9
- metadata +88 -80
- data/lib/rdig/content_extractors/rubyful_soup.rb +0 -151
- data/test/unit/rubyful_soup_content_extractor_test.rb +0 -83
data/CHANGES
CHANGED

@@ -1,3 +1,11 @@
+0.3.5
+- Add max_depth option to crawler configuration for limiting the crawl to a
+  specific depth
+- add support for http proxies including basic authentication
+- remove rubyfoul_soup support
+
+0.3.4
+
 0.3.2
 - make RDig compatible with Ferret 0.10.x
 - won't work any more with Ferret 0.9.x and before

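Both new crawler features from this release are plain configuration options. A minimal sketch of enabling them together (the proxy host and credentials are placeholders, not values taken from this diff):

    RDig.configuration do |cfg|
      # follow links at most two hops away from any start url
      cfg.crawler.max_depth = 2
      # route all fetches through an authenticating http proxy
      cfg.crawler.http_proxy      = 'http://proxy.example.com:8080'
      cfg.crawler.http_proxy_user = 'scott'
      cfg.crawler.http_proxy_pass = 'tiger'
    end
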
data/doc/examples/config.rb
CHANGED

@@ -2,6 +2,12 @@ RDig.configuration do |cfg|
 
   ##################################################################
   # options you really should set
+
+  # log file location
+  cfg.log_file = '/tmp/rdig.log'
+
+  # log level, set to :debug, :info, :warn or :error
+  cfg.log_level = :info
 
   # provide one or more URLs for the crawler to start from
   cfg.crawler.start_urls = [ 'http://www.example.com/' ]

@@ -29,10 +35,11 @@ RDig.configuration do |cfg|
   # content extraction options
   cfg.content_extraction = OpenStruct.new(
 
-
-
-
-
+    # HPRICOT configuration
+    # hpricot is the html parsing lib used by RDig. See
+    # http://code.whytheluckystiff.net/hpricot for usage information.
+    # Any code blocks given for content selection will receive an Hpricot instance
+    # containing the full page content when called.
     :hpricot => OpenStruct.new(
       # css selector for the element containing the page title
       :title_tag_selector => 'title',

@@ -42,26 +49,6 @@ RDig.configuration do |cfg|
       # might also be a proc returning either an element or a string:
       # :content_tag_selector => lambda { |hpricot_doc| ... }
     )
-
-    # RUBYFUL SOUP
-    # This is a powerful, but somewhat slow, ruby-only html parsing lib which was
-    # RDig's default html parser up to version 0.3.2. To use it, comment the
-    # hpricot config above, and uncomment the following:
-    #
-    # :rubyful_soup => OpenStruct.new(
-    #   # provide a method that returns the title of an html document
-    #   # this method may either return a tag to extract the title from,
-    #   # or a ready-to-index string.
-    #   :content_tag_selector => lambda { |tagsoup|
-    #     tagsoup.html.body
-    #   },
-    #   # provide a method that selects the tag containing the page content you
-    #   # want to index. Useful to avoid indexing common elements like navigation
-    #   # and page footers for every page.
-    #   :title_tag_selector => lambda { |tagsoup|
-    #     tagsoup.html.head.title
-    #   }
-    # )
   )
 
   # crawler options

@@ -95,12 +82,25 @@ RDig.configuration do |cfg|
   # crawls on slow sites. Don't set to 0, even when crawling a local fs.
   # cfg.crawler.wait_before_leave = 10
 
+  # limit the crawling depth. Default: nil (unlimited)
+  # Set to 0 to only index the start_urls.
+  # cfg.crawler.max_depth = nil
+
+  # http proxy configuration
+  # proxy url
+  # cfg.crawler.http_proxy = nil
+  #
+  # proxy username
+  # cfg.crawler.http_proxy_user = nil
+  # proxy password
+  # cfg.crawler.http_proxy_pass = nil
+
   # indexer options
 
   # create a new index on each run. Will append to the index if false. Use when
   # building a single index from multiple runs, e.g. one across a website and the
   # other a tree in a local file system
-  #
+  # cfg.index.create = true
 
   # rewrite document uris before indexing them. This is useful if you're
   # indexing on disk, but the documents should be accessible via http, e.g. from

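As the comment above notes, :content_tag_selector may also be a proc that receives the parsed Hpricot document. A sketch of selecting only a main content column (the div id is hypothetical; Hpricot's at method takes a CSS selector):

    cfg.content_extraction.hpricot.content_tag_selector = lambda do |hpricot_doc|
      # index only the main column; fall back to the whole body
      hpricot_doc.at('div#content') || hpricot_doc.at('body')
    end
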
data/lib/rdig.rb
CHANGED

@@ -24,7 +24,7 @@
 #++
 #
 
-RDIGVERSION = '0.3.4'
+RDIGVERSION = '0.3.5'
 
 
 require 'thread'

@@ -39,6 +39,8 @@ require 'net/http'
 require 'getoptlong'
 require 'tempfile'
 require 'open-uri'
+require 'logger'
+require 'base64'
 
 begin
   require 'ferret'

@@ -69,10 +71,11 @@ module RDig
       :scheme_filter_http,
       :fix_relative_uri,
       :normalize_uri,
+      { RDig::UrlFilters::DepthFilter => :max_depth },
      { :hostname_filter => :include_hosts },
       { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
       { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
-      RDig::UrlFilters::VisitedUrlFilter
+      RDig::UrlFilters::VisitedUrlFilter
     ],
     # filter chain for file system crawling
     :file => [

@@ -103,6 +106,8 @@ module RDig
       yield configuration
     else
       @config ||= OpenStruct.new(
+        :log_file => '/tmp/rdig.log',
+        :log_level => :warn,
         :crawler => OpenStruct.new(
           :start_urls => [ "http://localhost:3000/" ],
           :include_hosts => [ "localhost" ],

@@ -111,7 +116,11 @@ module RDig
           :index_document => nil,
           :num_threads => 2,
           :max_redirects => 5,
-          :
+          :max_depth => nil,
+          :wait_before_leave => 10,
+          :http_proxy => nil,
+          :http_proxy_user => nil,
+          :http_proxy_pass => nil
         ),
         :content_extraction => OpenStruct.new(
           # settings for html content extraction (hpricot)

@@ -124,19 +133,6 @@ module RDig
           # might also be a proc returning either an element or a string:
           # :content_tag_selector => lambda { |hpricot_doc| ... }
         )
-        #,
-        # # settings for html content extraction (RubyfulSoup)
-        # :rubyful_soup => OpenStruct.new(
-        #   # select the html element that contains the content to index
-        #   # by default, we index all inside the body tag:
-        #   :content_tag_selector => lambda { |tagsoup|
-        #     tagsoup.html.body
-        #   },
-        #   # select the html element containing the title
-        #   :title_tag_selector => lambda { |tagsoup|
-        #     tagsoup.html.head.title
-        #   }
-        # )
       ),
       :index => OpenStruct.new(
         :path => "index/",

@@ -151,6 +147,36 @@ module RDig
     end
     alias config configuration
 
+    def logger
+      @logger ||= create_logger
+    end
+
+    def logger=(log)
+      @logger = log
+    end
+
+    def create_logger
+      l = Logger.new(RDig.config.log_file)
+      l.level = Logger.const_get RDig.config.log_level.to_s.upcase rescue Logger::WARN
+      return l
+    end
+
+    # returns http options for open_uri if configured
+    def open_uri_http_options
+      unless RDig::configuration.crawler.open_uri_http_options
+        opts = {}
+        if RDig::configuration.crawler.http_proxy
+          opts[:proxy] = RDig::configuration.crawler.http_proxy
+          if user = RDig::configuration.crawler.http_proxy_user
+            pass = RDig::configuration.crawler.http_proxy_pass
+            opts['Authorization'] = "Basic " + Base64.encode64("#{user}:#{pass}")
+          end
+        end
+        RDig::configuration.crawler.open_uri_http_options = opts
+      end
+      return RDig::configuration.crawler.open_uri_http_options
+    end
+
   end
 
   class Application

@@ -210,7 +236,6 @@ module RDig
         when '--query'
           options.query = value
         when '--version'
-          puts "rdig, version #{RDIGVERSION}"
           exit
         else
           fail "Unknown option: #{opt}"

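open_uri_http_options builds the options hash once, caches it in the crawler configuration, and HttpDocument#fetch passes it straight to open-uri. A sketch of the equivalent direct call (URL and proxy are placeholders):

    require 'open-uri'
    require 'base64'

    opts = {
      :proxy => 'http://proxy.example.com:8080',
      'Authorization' => 'Basic ' + Base64.encode64('user:secret'),
    }
    # open-uri accepts the :proxy option plus arbitrary header fields
    open('http://www.example.com/', opts) { |page| p page.status }

Note that the hash is memoized in the configuration, so proxy settings changed after the first fetch are not picked up within the same run.
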
data/lib/rdig/content_extractors.rb
CHANGED

@@ -22,7 +22,7 @@ module RDig
     def self.extractors; @@extractors ||= [] end
     def self.extractor_instances
       @@extractor_instances ||= extractors.map { |ex_class|
-
+        RDig.logger.info "initializing content extractor: #{ex_class}"
         ex_class.new(RDig.configuration.content_extraction) rescue nil
       }.compact
     end

@@ -77,8 +77,8 @@ end
 # load content extractors
 Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
   begin
-    require f
+    require f
   rescue LoadError
-
+    RDig::logger.error "could not load #{f}: #{$!}"
   end
 end

data/lib/rdig/content_extractors/hpricot.rb
CHANGED

@@ -40,12 +40,11 @@ module RDig
     # all textual content contained in the root element and all it's
     # children.
     def extract_content(doc)
-
-
-
-      #
-
-      return content.strip
+      if ce = content_element(doc)
+        return strip_tags(strip_comments(ce.inner_html))
+      end
+      # return (ce.inner_text || '').gsub(Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' ').strip
+      return ''
     end
 
     # extracts the href attributes of all a tags, except

@@ -91,7 +90,8 @@ module RDig
                   Regexp::MULTILINE, 'u'), ''
       string.gsub! Regexp.new('<.+?>',
                   Regexp::MULTILINE, 'u'), ''
-      string.gsub Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+      string.gsub! Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' '
+      string.strip
     end
 
   end

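The rewritten extract_content feeds the selected element's inner_html through strip_comments and strip_tags, so the result is whitespace-normalized plain text. A rough sketch of what that regex pipeline does to a fragment (mirroring the gsub calls shown above; the comment-stripping regex appears in the rubyful_soup file removed later in this diff):

    html = "<div><!-- nav --><p>Some  sample\ntext</p></div>"
    text = html.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '') # comments first
    text.gsub!(Regexp.new('<.+?>', Regexp::MULTILINE, 'u'), '')            # then tags
    text.gsub!(Regexp.new('\s+', Regexp::MULTILINE, 'u'), ' ')             # collapse whitespace
    text.strip   # => "Some sample text"
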
data/lib/rdig/crawler.rb
CHANGED

@@ -3,23 +3,30 @@ module RDig
 
   class Crawler
 
-    def initialize
+    def initialize(config = RDig.config, logger = RDig.logger)
       @documents = Queue.new
       @etag_filter = ETagFilter.new
+      @logger = logger
+      @config = config
     end
 
     def run
-
-
-
+      @indexer = Index::Indexer.new(@config.index)
+      crawl
+    ensure
+      @indexer.close if @indexer
+    end
+
+    def crawl
+      raise 'no start urls given!' if @config.crawler.start_urls.empty?
       # check whether we are indexing on-disk or via http
-      url_type =
+      url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
       chain_config = RDig.filter_chain[url_type]
 
       filterchain = UrlFilters::FilterChain.new(chain_config)
-
-
-      num_threads =
+      @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
+
+      num_threads = @config.crawler.num_threads
       group = ThreadsWait.new
       num_threads.times { |i|
         group.join_nowait Thread.new("fetcher #{i}") {

@@ -31,20 +38,19 @@ module RDig
       }
 
       # check for an empty queue every now and then
-      sleep_interval =
+      sleep_interval = @config.crawler.wait_before_leave
       begin
         sleep sleep_interval
       end until @documents.empty?
       # nothing to do any more, tell the threads to exit
       num_threads.times { @documents << :exit }
 
-
+      @logger.info "waiting for threads to finish..."
       group.all_waits
-    ensure
-      @indexer.close if @indexer
     end
 
     def process_document(doc, filterchain)
+      @logger.debug "processing document #{doc}"
       doc.fetch
       # add links from this document to the queue
       doc.content[:links].each { |url|

@@ -52,10 +58,14 @@ module RDig
       } unless doc.content[:links].nil?
 
       return unless @etag_filter.apply(doc)
-
+      add_to_index doc
     rescue
-
-
+      @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
+      @logger.debug "Trace: #{$!.backtrace.join("\n")}"
+    end
+
+    def add_to_index(doc)
+      @indexer << doc if doc.needs_indexing?
     end
 
 
@@ -64,17 +74,19 @@ module RDig
     # processing
     def add_url(url, filterchain, referring_document = nil)
       return if url.nil? || url.empty?
-
-
+
+      @logger.debug "add_url #{url}"
+      doc = if referring_document
+        referring_document.create_child(url)
       else
-
+        Document.create(url)
       end
 
       doc = filterchain.apply(doc)
 
       if doc
         @documents << doc
-
+        @logger.debug "url #{url} survived filterchain"
       end
     rescue
       nil

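Since the constructor now takes the config and logger as defaulted parameters, a crawler can be driven without touching the global state. A minimal sketch (my_config stands for any object shaped like RDig.config):

    crawler = RDig::Crawler.new    # uses RDig.config and RDig.logger
    crawler.run                    # builds the indexer, crawls, closes the index

    # explicit injection, e.g. in tests:
    crawler = RDig::Crawler.new(my_config, Logger.new(STDOUT))
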
data/lib/rdig/documents.rb
CHANGED

@@ -9,30 +9,23 @@ module RDig
     attr_reader :content
     attr_reader :content_type
 
-    def self.create(url
-
-
-
-
-
-      case url
-      when /^https?:\/\//i
-        HttpDocument.new(:url => url, :referrer => referrer_uri) if referrer_uri.nil?
-      when /^file:\/\//i
-        # files don't have referrers - the check for nil prevents us from being
-        # tricked into indexing local files by file:// links in the web site
-        # we index.
-        FileDocument.new(:url => url) if referrer_uri.nil?
+    def self.create(url)
+      return case url
+      when /^https?:\/\//i
+        HttpDocument.new(:uri => url)
+      when /^file:\/\//i
+        FileDocument.new(:uri => url)
       end
     end
 
     # url: url of this document, may be relative to the referring doc or host.
     # referrer: uri of the document we retrieved this link from
     def initialize(args)
+      RDig.logger.debug "initialize: #{args.inspect}"
       begin
-        @uri = URI.parse(args[:
+        @uri = URI.parse(args[:uri])
       rescue URI::InvalidURIError
-        raise "Cannot create document using invalid URL: #{args[:
+        raise "Cannot create document using invalid URL: #{args[:uri]}"
       end
     end

@@ -48,6 +41,10 @@ module RDig
       !self.content.nil?
     end
 
+    def to_s
+      "#{self.class.name}, uri=#{uri}, title=#{has_content? ? title : 'not loaded yet'}"
+    end
+
   end
 

@@ -59,14 +56,17 @@ module RDig
       super(args)
     end
 
+    def create_child(uri)
+      FileDocument.new(:uri => uri)
+    end
+
     def self.find_files(path)
       links = []
+      pattern = /.+\.(#{File::FILE_EXTENSION_MIME_TYPES.keys.join('|')})$/i
       Dir.glob(File.expand_path(File.join(path, '*'))) do |filename|
+        RDig.logger.debug "checking file #{filename}"
         # Skip files not matching known mime types
-
-        if File.directory?(filename) || filename =~ pattern
-          links << "file://#{filename}"
-        end
+        links << "file://#{filename}" if File.directory?(filename) || filename =~ pattern
       end
       links
     end

@@ -97,20 +97,27 @@ module RDig
   #
   class HttpDocument < Document
 
+    # counts how far this document is away from one of the start urls. Used to limit crawling by depth.
+    attr_reader :depth
     attr_reader :referring_uri
     attr_reader :status
     attr_reader :etag
+
+    def create_child(uri)
+      HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
+    end
 
     # url: url of this document, may be relative to the referring doc or host.
     # referrer: uri of the document we retrieved this link from
     def initialize(args={})
       super(args)
       @referring_uri = args[:referrer]
+      @depth = args[:depth] || 0
     end
 
     def fetch
-
-      open(@uri.to_s) do |doc|
+      RDig.logger.debug "fetching #{@uri.to_s}"
+      open(@uri.to_s, RDig::open_uri_http_options) do |doc|
         case doc.status.first.to_i
         when 200
           @etag = doc.meta['etag']

@@ -118,13 +125,13 @@ module RDig
           @content = ContentExtractors.process(doc.read, doc.content_type)
           @status = :success
         when 404
-
+          RDig.logger.info "got 404 for #{@uri}"
         else
-
+          RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
         end
       end
     rescue
-
+      RDig.logger.warn "error fetching #{@uri.to_s}: #{$!}"
     ensure
       @content ||= {}
     end

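Depth bookkeeping lives entirely in create_child: every HttpDocument stamps its children with depth + 1, and Document.create starts at 0, so the DepthFilter added in url_filters.rb (below) only ever compares one integer. A sketch with absolute URLs:

    root  = RDig::Document.create('http://www.example.com/')     # depth 0
    child = root.create_child('http://www.example.com/a.html')   # depth 1
    child.create_child('http://www.example.com/b.html').depth    # => 2
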
data/lib/rdig/index.rb
CHANGED

@@ -15,7 +15,7 @@ module RDig
     end
 
     def add_to_index(document)
-
+      RDig.logger.debug "add to index: #{document.uri.to_s}"
       @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
       # all stored and tokenized, should be ferret defaults
       doc = {

data/lib/rdig/search.rb
CHANGED

@@ -43,7 +43,7 @@ module RDig
     def search(query, options={})
       result = {}
       query = query_parser.parse(query) if query.is_a?(String)
-
+      RDig.logger.info "Query: #{query}"
       results = []
       searcher = ferret_searcher
       result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|

data/lib/rdig/url_filters.rb
CHANGED

@@ -80,6 +80,15 @@ module RDig
       end
     end
 
+    class DepthFilter
+      def initialize(max_depth = nil)
+        @max_depth = max_depth
+      end
+      def apply(document)
+        return document if @max_depth.nil? || document.depth <= @max_depth
+      end
+    end
+
 
     # base class for url inclusion / exclusion filters
     class PatternFilter

@@ -98,6 +107,7 @@ module RDig
       end
     end
   end
+
   class UrlExclusionFilter < PatternFilter
     # returns nil if any of the patterns matches it's URI,
     # the document itself otherwise

@@ -176,9 +186,11 @@ module RDig
       p document.uri
     end
 
+    # filter uris by hostname list. With a nil or empty list all documents may
+    # pass this filter.
     def UrlFilters.hostname_filter(document, include_hosts)
-
-      return nil
+      #RDig.logger.debug "hostname_filter: #{include_hosts}"
+      return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
     end
 
     def UrlFilters.normalize_uri(document)

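The filter chain entry { RDig::UrlFilters::DepthFilter => :max_depth } in rdig.rb hands the configured limit to the constructor; with a nil limit every document passes. A sketch of the filter in isolation:

    filter = RDig::UrlFilters::DepthFilter.new(1)
    doc  = RDig::Document.create('http://www.example.com/')   # depth 0
    deep = doc.create_child('/a').create_child('/b')           # depth 2
    filter.apply(doc)    # => doc (depth 0 <= 1)
    filter.apply(deep)   # => nil (dropped, so its links are never followed)
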
data/rakefile
CHANGED

@@ -21,7 +21,7 @@ end
 PKG_NAME = 'rdig'
 
 # Determine the current version of the software
-if `ruby -Ilib ./bin/rdig --version` =~ /
+if `ruby -Ilib ./bin/rdig --version` =~ /RDig version ([0-9.]+)$/
   CURRENT_VERSION = $1
 else
   CURRENT_VERSION = "0.0.0"

@@ -131,10 +131,7 @@ else
   #### Dependencies and requirements.
 
   s.add_dependency('ferret', '>= 0.10.0')
-
-  # ORed dependencies...
-  #s.add_dependency('rubyful_soup', '>= 1.0.4')
-  s.add_dependency('hpricot', '>= 0.4')
+  s.add_dependency('hpricot', '>= 0.6')
   #s.requirements << ""
 
   #### Which files are to be included in this gem? Everything! (Except CVS directories.)

@@ -282,9 +279,9 @@ task :prerelease do
     announce "Release Task Testing, skipping checked-in file test"
   else
     announce "Checking for unchecked-in files..."
-    data = `
-    unless data =~
-      fail "
+    data = `git status`
+    unless data =~ /working directory clean/
+      fail "GIT status is not clean ... do you have unchecked-in files?"
     end
     announce "No outstanding checkins found ... OK"
   end

@@ -310,7 +307,8 @@ task :update_version => [:prerelease] do
   if ENV['RELTEST']
     announce "Release Task Testing, skipping commiting of new version"
   else
-    sh %{
+    sh %{git commit -a -m "Updated to version #{PKG_VERSION}" lib/rdig.rb}
+    sh %{git svn dcommit}
   end
 end

data/test/unit/http_document_test.rb
ADDED

@@ -0,0 +1,17 @@
+require 'test_helper'
+class HttpDocumentTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    @fixture_path = File.join(File.expand_path(File.dirname(__FILE__)), '../fixtures/')
+  end
+
+  def test_initialize
+    d = Document.create 'http://1stlineleewes.com'
+    assert_equal '1stlineleewes.com', d.uri.host
+    assert_equal '', d.uri.path
+  end
+
+end
+
+

data/test/unit/rdig_test.rb
ADDED

@@ -0,0 +1,38 @@
+require 'test_helper'
+class RDigTest < Test::Unit::TestCase
+  include TestHelper
+
+  def setup
+    RDig.configuration do |cfg|
+      @old_crawler_cfg = cfg.crawler.clone
+      cfg.log_level = :debug
+      cfg.log_file = 'tmp/test.log'
+    end
+  end
+
+  def teardown
+    RDig.configuration do |cfg|
+      cfg.crawler = @old_crawler_cfg
+    end
+  end
+
+  def test_proxy_config
+    RDig.configuration do |cfg|
+      cfg.crawler.http_proxy = 'http://proxy.com:8080'
+    end
+    assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+    assert_nil RDig.open_uri_http_options['Authorization']
+  end
+
+  def test_proxy_auth
+    RDig.configuration do |cfg|
+      cfg.crawler.http_proxy = 'http://proxy.com:8080'
+      cfg.crawler.http_proxy_user = 'username'
+      cfg.crawler.http_proxy_pass = 'password'
+    end
+    assert_equal 'http://proxy.com:8080', RDig.open_uri_http_options[:proxy]
+    assert_equal "Basic dXNlcm5hbWU6cGFzc3dvcmQ=\n", RDig.open_uri_http_options['Authorization']
+  end
+end
+
+

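The trailing \n in the expected Authorization value above is real: Base64.encode64 line-wraps its output and always appends a newline, and open_uri_http_options uses the encoded value as-is. The expected string is simply username:password encoded:

    require 'base64'
    Base64.encode64('username:password')   # => "dXNlcm5hbWU6cGFzc3dvcmQ=\n"
    # encode64 breaks its output into 60-character lines; for short input
    # that just means one trailing newline. (Ruby 1.9's strict_encode64
    # avoids it, but these tests pin the encode64 behaviour.)
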
data/test/unit/searcher_test.rb
CHANGED

data/test/unit/url_filters_test.rb
CHANGED

@@ -1,5 +1,5 @@
 require 'test_helper'
-class
+class UrlFiltersTest < Test::Unit::TestCase
   include TestHelper, RDig
 
   def setup

@@ -73,24 +73,24 @@ class UrlFilterTest < Test::Unit::TestCase
   def test_fix_relative_uri
     doc = Document.create('http://test.host/dir/file.html')
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
     assert_equal('http://test.host/dir/../another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('http://test.host/dir/another.html')).uri.to_s)
     assert_equal('HTTP://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('HTTP://test.host/dir/another.html')).uri.to_s)
     doc = Document.create('https://test.host/dir/')
     assert_equal('https://test.host/dir/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
     doc = Document.create('https://test.host/')
     assert_equal('https://test.host/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
     doc = Document.create('https://test.host')
     assert_equal('https://test.host/another.html',
-      UrlFilters.fix_relative_uri(
+      UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
   end
 end
 

metadata
CHANGED

@@ -1,74 +1,88 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.8.11
-specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.3.
-date: 2006-12-31 00:00:00 +01:00
-summary: Ruby based web site indexing and searching library.
-require_paths:
-- lib
-email: jk@jkraemer.net
-homepage: http://rdig.rubyforge.org/
-rubyforge_project: rdig
-description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
-autorequire:
-default_executable: rdig
-bindir: bin
-has_rdoc: true
-required_ruby_version: !ruby/object:Gem::Version::Requirement
-  requirements:
-  - - ">"
-    - !ruby/object:Gem::Version
-      version: 0.0.0
-  version:
+  version: 0.3.5
 platform: ruby
-signing_key:
-cert_chain:
 authors:
 - Jens Kraemer
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2008-02-26 00:00:00 +01:00
+default_executable: rdig
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: ferret
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.10.0
+    version:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0.6"
+    version:
+description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
+email: jk@jkraemer.net
+executables:
+- rdig
+extensions: []
+
+extra_rdoc_files:
+- README
+- CHANGES
+- LICENSE
+- TODO
 files:
 - bin/rdig
-- lib/rdig
-- lib/htmlentities
 - lib/rdig.rb
-- lib/rdig
-- lib/rdig/crawler.rb
-- lib/rdig/search.rb
-- lib/rdig/highlight.rb
-- lib/rdig/index.rb
+- lib/rdig
 - lib/rdig/url_filters.rb
+- lib/rdig/index.rb
+- lib/rdig/crawler.rb
 - lib/rdig/content_extractors.rb
-- lib/rdig/documents.rb
 - lib/rdig/file.rb
-- lib/rdig/
+- lib/rdig/highlight.rb
+- lib/rdig/documents.rb
+- lib/rdig/search.rb
+- lib/rdig/content_extractors
 - lib/rdig/content_extractors/doc.rb
 - lib/rdig/content_extractors/hpricot.rb
 - lib/rdig/content_extractors/pdf.rb
-- lib/htmlentities
+- lib/htmlentities
+- lib/htmlentities/htmlentities.rb
 - lib/htmlentities/COPYING
+- lib/htmlentities/CHANGES
 - lib/htmlentities/README
-- lib/htmlentities/htmlentities.rb
-- test/unit
 - test/fixtures
-- test/test_helper.rb
-- test/unit/etag_filter_test.rb
-- test/unit/url_filters_test.rb
-- test/unit/searcher_test.rb
-- test/unit/rubyful_soup_content_extractor_test.rb
-- test/unit/pdf_content_extractor_test.rb
-- test/unit/hpricot_content_extractor_test.rb
-- test/unit/word_content_extractor_test.rb
-- test/unit/file_document_test.rb
-- test/unit/crawler_fs_test.rb
-- test/fixtures/html
-- test/fixtures/pdf
 - test/fixtures/word
-- test/fixtures/
-- test/fixtures/html
+- test/fixtures/word/simple.doc
+- test/fixtures/html
 - test/fixtures/html/custom_tag_selectors.html
+- test/fixtures/html/simple.html
+- test/fixtures/html/entities.html
+- test/fixtures/pdf
 - test/fixtures/pdf/simple.pdf
-- test/
+- test/unit
+- test/unit/crawler_fs_test.rb
+- test/unit/pdf_content_extractor_test.rb
+- test/unit/word_content_extractor_test.rb
+- test/unit/rdig_test.rb
+- test/unit/http_document_test.rb
+- test/unit/searcher_test.rb
+- test/unit/file_document_test.rb
+- test/unit/url_filters_test.rb
+- test/unit/hpricot_content_extractor_test.rb
+- test/unit/etag_filter_test.rb
+- test/test_helper.rb
 - doc/examples
 - doc/examples/config.rb
 - LICENSE

@@ -77,41 +91,35 @@ files:
 - README
 - install.rb
 - rakefile
-
-
+has_rdoc: true
+homepage: http://rdig.rubyforge.org/
+post_install_message:
 rdoc_options:
 - --title
 - Rake -- Ruby Make
 - --main
 - README
 - --line-numbers
-
--
-
-
--
-
-
-
-
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
 requirements: []
 
-
-
-
-
-
-
-
-    - !ruby/object:Gem::Version
-      version: 0.10.0
-  version:
-- !ruby/object:Gem::Dependency
-  name: hpricot
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Version::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: "0.4"
-  version:
+rubyforge_project: rdig
+rubygems_version: 1.0.1
+signing_key:
+specification_version: 2
+summary: Ruby based web site indexing and searching library.
+test_files: []
+

data/lib/rdig/content_extractors/rubyful_soup.rb
REMOVED

@@ -1,151 +0,0 @@
-begin
-  require 'rubyful_soup'
-rescue LoadError
-  require 'rubygems'
-  require 'rubyful_soup' rescue nil
-end
-
-if defined?(BeautifulSoup)
-
-# override some methods concered with entity resolving
-# to convert them to strings
-class BeautifulStoneSoup
-  # resolve unknown html entities using the htmlentities lib
-  alias :orig_unknown_entityref :unknown_entityref
-  def unknown_entityref(ref)
-    if HTMLEntities::MAP.has_key?(ref)
-      handle_data [HTMLEntities::MAP[ref]].pack('U')
-    else
-      orig_unknown_entityref ref
-    end
-  end
-
-  # resolve numeric entities to utf8
-  def handle_charref(ref)
-    handle_data( ref.gsub(/([0-9]{1,7})/) {
-      [$1.to_i].pack('U')
-    }.gsub(/x([0-9a-f]{1,6})/i) {
-      [$1.to_i(16)].pack('U')
-    } )
-  end
-end
-
-module RDig
-  module ContentExtractors
-
-    # extracts title, content and links from html documents
-    class RubyfulSoupContentExtractor < ContentExtractor
-
-      def initialize(config)
-        super(config.rubyful_soup)
-        # if not configured, refuse to handle any content:
-        @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/ if config.rubyful_soup
-      end
-
-      # returns:
-      # { :content => 'extracted clear text',
-      #   :meta => { :title => 'Title' },
-      #   :links => [array of urls] }
-      def process(content)
-        result = { }
-        tag_soup = BeautifulSoup.new(content)
-        result[:title] = extract_title(tag_soup)
-        result[:links] = extract_links(tag_soup)
-        result[:content] = extract_content(tag_soup)
-        return result
-      end
-
-      # Extracts textual content from the HTML tree.
-      #
-      # - First, the root element to use is determined using the
-      #   +content_element+ method, which itself uses the content_tag_selector
-      #   from RDig.configuration.
-      # - Then, this element is processed by +extract_text+, which will give
-      #   all textual content contained in the root element and all it's
-      #   children.
-      def extract_content(tag_soup)
-        content = ''
-        ce = content_element(tag_soup)
-        ce.children { |child|
-          extract_text(child, content)
-        } unless ce.nil?
-        return content.strip
-      end
-
-      # extracts the href attributes of all a tags, except
-      # internal links like <a href="#top">
-      def extract_links(tagsoup)
-        tagsoup.find_all('a').map { |link|
-          CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
-        }.compact
-      end
-
-      # Extracts the title from the given html tree
-      def extract_title(tagsoup)
-        the_title_tag = title_tag(tagsoup)
-        if the_title_tag.is_a? String
-          the_title_tag
-        else
-          title = ''
-          extract_text(the_title_tag, title)
-          title.strip
-        end
-      end
-
-      # Recursively extracts all text contained in the given element,
-      # and appends it to content.
-      def extract_text(element, content='')
-        return nil if element.nil?
-        if element.is_a? NavigableString
-          value = strip_comments(element)
-          value.strip!
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        elsif element.string # it's a Tag, and it has some content string
-          # skip inline scripts and styles
-          return nil if element.name =~ /^(script|style)$/i
-          value = element.string.strip
-          unless value.empty?
-            content << value
-            content << ' '
-          end
-        else
-          element.children { |child|
-            extract_text(child, content)
-          }
-        end
-      end
-
-      # Returns the element to extract the title from.
-      #
-      # This may return a string, e.g. an attribute value selected from a meta
-      # tag, too.
-      def title_tag(tagsoup)
-        if @config.title_tag_selector
-          @config.title_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.head.title
-        end
-      end
-
-      # Retrieve the root element to extract document content from
-      def content_element(tagsoup)
-        if @config.content_tag_selector
-          @config.content_tag_selector.call(tagsoup)
-        else
-          tagsoup.html.body
-        end
-      end
-
-      # Return the given string minus all html comments
-      def strip_comments(string)
-        string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
-      end
-    end
-
-  end
-end
-
-end

data/test/unit/rubyful_soup_content_extractor_test.rb
REMOVED

@@ -1,83 +0,0 @@
-require 'test_helper'
-class RubyfulSoupContentExtractorTest < Test::Unit::TestCase
-  include TestHelper
-
-  def setup
-    @config = OpenStruct.new(
-      :content_tag_selector => lambda { |tagsoup|
-        tagsoup.html.body
-      },
-      :title_tag_selector => lambda { |tagsoup|
-        tagsoup.html.head.title
-      })
-    @extractor = ContentExtractors::RubyfulSoupContentExtractor.new(OpenStruct.new(:rubyful_soup => @config))
-    @nbsp = [160].pack('U') # non breaking space
-  end
-
-  def test_can_do
-    assert !@extractor.can_do('application/pdf')
-    assert !@extractor.can_do('application/msword')
-    assert @extractor.can_do('text/html')
-    assert @extractor.can_do('text/xml')
-    assert @extractor.can_do('application/xml')
-    assert @extractor.can_do('application/xhtml+xml')
-  end
-
-  def test_simple
-    result = ContentExtractors.process(html_doc('simple'), 'text/html')
-    assert_not_nil result
-    assert_equal 'Sample Title', result[:title]
-    assert_not_nil result[:content]
-    assert_not_nil result[:links]
-    assert_equal 1, result[:links].size
-    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
-    assert_equal 'http://test.host/affe.html', result[:links].first
-  end
-
-  def test_entities
-    result = @extractor.process(html_doc('entities'))
-    assert_equal 'Sample & Title', result[:title]
-    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
-    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
-    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
-  end
-
-  def test_custom_content_element
-    @config.title_tag_selector = lambda do |tagsoup|
-      tagsoup.find('h1', :attrs => { 'class', 'title' })
-    end
-    @config.content_tag_selector = lambda do |tagsoup|
-      tagsoup.find('div', :attrs => { 'id', 'content' })
-    end
-    result = @extractor.process(html_doc('custom_tag_selectors'))
-    assert_equal 'Sample Title in h1', result[:title]
-    assert_equal 'Affe Real content is here.', result[:content]
-    # check if links are collected outside the content tag, too:
-    assert_equal 3, result[:links].size
-    assert_equal 'http://test.host/outside.html', result[:links].first
-    assert_equal '/inside.html', result[:links][1]
-    assert_equal '/footer.html', result[:links][2]
-  end
-
-
-  def test_title_from_dcmeta
-    @config.title_tag_selector = lambda do |tagsoup|
-      tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
-    end
-    result = @extractor.process(html_doc('custom_tag_selectors'))
-    assert_equal 'Title from DC meta data', result[:title]
-  end
-
-  def test_preprocessed_title
-    @config.title_tag_selector = lambda do |tagsoup|
-      title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
-      # use only a portion of the title tag's contents if it matches our
-      # regexp:
-      title =~ /^(.*)meta data$/ ? $1.strip : title.strip
-    end
-    result = @extractor.process(html_doc('custom_tag_selectors'))
-    assert_equal 'Title from DC', result[:title]
-  end
-
-end
-