RubyGems - rdig - Versions diffs - 0.3.5 → 0.3.8 - Mend

rdig 0.3.5 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

data/CHANGES +6 -1
data/History.txt +11 -0
data/Manifest.txt +39 -0
data/doc/examples/config.rb +10 -0
data/lib/rdig.rb +12 -8
data/lib/rdig/content_extractors.rb +7 -1
data/lib/rdig/content_extractors/doc.rb +5 -19
data/lib/rdig/content_extractors/hpricot.rb +13 -9
data/lib/rdig/crawler.rb +18 -10
data/lib/rdig/documents.rb +13 -9
data/lib/rdig/url_filters.rb +15 -8
data/rakefile +2 -1
data/test/fixtures/html/frameset.html +13 -0
data/test/fixtures/html/imagemap.html +13 -0
data/test/unit/hpricot_content_extractor_test.rb +12 -0
data/test/unit/searcher_test.rb +2 -2
data/test/unit/url_filters_test.rb +1 -1
metadata +108 -62
data/TODO +0 -0
data/lib/htmlentities/CHANGES +0 -21
data/lib/htmlentities/COPYING +0 -7
data/lib/htmlentities/README +0 -15
data/lib/htmlentities/htmlentities.rb +0 -281

data/CHANGES CHANGED

@@ -1,8 +1,13 @@
+0.3.6
+- remove bundled htmlentities in favor of a gem dependency
+- also extract links from area and frame tags
+- fix etagfilter bug
 0.3.5
 - Add max_depth option to crawler configuration for limiting the crawl to a
   specific depth
 - add support for http proxies including basic authentication
-- remove rubyfoul_soup support
+- remove rubyful_soup support
 0.3.4

data/History.txt ADDED

@@ -0,0 +1,11 @@
+== 0.3.8 2009-04-26
+* bump up version
+== 0.3.7 2009-04-26
+* Gem spec for automatic gem building on github
+* doc enhancements
+* better uri-normalization, re-add result uri of redirection
+  into the queue instead of directly indexing the resulting
+  page

data/Manifest.txt ADDED

@@ -0,0 +1,39 @@
+CHANGES
+History.txt
+install.rb
+LICENSE
+Manifest.txt
+rakefile
+README
+bin/rdig
+doc/examples/config.rb
+lib/rdig/content_extractors/doc.rb
+lib/rdig/content_extractors/hpricot.rb
+lib/rdig/content_extractors/pdf.rb
+lib/rdig/content_extractors.rb
+lib/rdig/crawler.rb
+lib/rdig/documents.rb
+lib/rdig/file.rb
+lib/rdig/highlight.rb
+lib/rdig/index.rb
+lib/rdig/search.rb
+lib/rdig/url_filters.rb
+lib/rdig.rb
+test/fixtures/html/custom_tag_selectors.html
+test/fixtures/html/entities.html
+test/fixtures/html/frameset.html
+test/fixtures/html/imagemap.html
+test/fixtures/html/simple.html
+test/fixtures/pdf/simple.pdf
+test/fixtures/word/simple.doc
+test/test_helper.rb
+test/unit/crawler_fs_test.rb
+test/unit/etag_filter_test.rb
+test/unit/file_document_test.rb
+test/unit/hpricot_content_extractor_test.rb
+test/unit/http_document_test.rb
+test/unit/pdf_content_extractor_test.rb
+test/unit/rdig_test.rb
+test/unit/searcher_test.rb
+test/unit/url_filters_test.rb
+test/unit/word_content_extractor_test.rb

data/doc/examples/config.rb CHANGED

@@ -86,6 +86,13 @@ RDig.configuration do |cfg|
   # Set to 0 to only index the start_urls.
   # cfg.crawler.max_depth = nil
+  # default index document to be appended to URIs ending with a trailing '/'
+  # cfg.crawler.normalize_uri.index_document = nil
+  # strip trailing '/' from URIs to avoid double indexing of pages referred to both with and without a trailing '/'.
+  # Ignored if index_document is set.
+  # Not necessary when the server issues proper etags since the default etag filter will remove these duplicates.
+  # cfg.crawler.normalize_uri.remove_trailing_slash = nil
   # http proxy configuration
   # proxy url
   # cfg.crawler.http_proxy = nil
@@ -94,6 +101,9 @@ RDig.configuration do |cfg|
   # cfg.crawler.http_proxy_user = nil
   # proxy password
   # cfg.crawler.http_proxy_pass = nil
+  #
+  # to use basic auth without a proxy, use this syntax:
+  # cfg.crawler.open_uri_http_options = { :http_basic_authentication => [user, password] }
   # indexer options

data/lib/rdig.rb CHANGED

@@ -24,7 +24,7 @@
 #++
 #
-RDIGVERSION = '0.3.5'
+RDIGVERSION = '0.3.8'
 require 'thread'
@@ -49,7 +49,8 @@ rescue LoadError
   require 'ferret'
 end
-require 'htmlentities/htmlentities'
+#require 'htmlentities/htmlentities'
 $KCODE = 'u'
@@ -60,17 +61,16 @@ module RDig
   class << self
-    # the filter chains are for limiting the set of indexed documents.
-    # there are two chain types - one for http, and one for file system
-    # crawling.
-    # a document has to survive all filters in the chain to get indexed.
+    # Filter chains are used by the crawler to limit the set of documents being indexed.
+    # There are two chains - one for http, and one for file system crawling.
+    # Each document has to survive all filters in the relevant chain to get indexed.
     def filter_chain
       @filter_chain ||= {
         # filter chain for http crawling
         :http => [
           :scheme_filter_http,
           :fix_relative_uri,
-          :normalize_uri,
+          { :normalize_uri => :normalize_uri },
           { RDig::UrlFilters::DepthFilter => :max_depth },
           { :hostname_filter => :include_hosts },
           { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
@@ -120,7 +120,11 @@ module RDig
             :wait_before_leave => 10,
             :http_proxy        => nil,
             :http_proxy_user   => nil,
-            :http_proxy_pass   => nil
+            :http_proxy_pass   => nil,
+            :normalize_uri => OpenStruct.new(
+              :index_document => nil,
+              :remove_trailing_slash => nil
+            )
           ),
           :content_extraction  => OpenStruct.new(
             # settings for html content extraction (hpricot)

data/lib/rdig/content_extractors.rb CHANGED

@@ -23,7 +23,13 @@ module RDig
       def self.extractor_instances
         @@extractor_instances ||= extractors.map { |ex_class|
           RDig.logger.info "initializing content extractor: #{ex_class}"
-          ex_class.new(RDig.configuration.content_extraction) rescue nil
+          ex = nil
+          begin
+            ex = ex_class.new(RDig.configuration.content_extraction)
+          rescue Exception
+            RDig.logger.error "error: #{$!.message}\n#{$!.backtrace.join("\n")}"
+          end
+          ex
         }.compact
       end

data/lib/rdig/content_extractors/doc.rb CHANGED

@@ -13,25 +13,11 @@ module RDig
         @wvhtml = 'wvHtml'
         @pattern = /^application\/msword/
         # html extractor for parsing wvHtml output
-        if defined?(HpricotContentExtractor)
-          @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
-              :hpricot => OpenStruct.new(
-                :content_tag_selector => 'body',
-                :title_tag_selector   => 'title'
-              )))
-         elsif defined?(RubyfulSoupContentExtractor)
-          @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
-              :rubyful_soup => OpenStruct.new(
-                :content_tag_selector => lambda { |tagsoup|
-                  tagsoup.html.body
-                },
-                :title_tag_selector         => lambda { |tagsoup|
-                  tagsoup.html.head.title
-                }
-              )))
-        else
-          raise "need at least one html content extractor - please install hpricot or rubyful_soup"
-        end
+        @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
+            :hpricot => OpenStruct.new(
+              :content_tag_selector => 'body',
+              :title_tag_selector   => 'title'
+            )))
         # TODO: better: if $?.exitstatus == 127 (not found)
         @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
       end

data/lib/rdig/content_extractors/hpricot.rb CHANGED

@@ -1,11 +1,12 @@
 begin
   require 'hpricot'
+  require 'htmlentities'
 rescue LoadError
   require 'rubygems'
   require 'hpricot'
+  require 'htmlentities'
 end
-if defined?(Hpricot)
 module RDig
   module ContentExtractors
@@ -23,11 +24,12 @@ module RDig
       #   :title => 'Title',
       #   :links => [array of urls] }
       def process(content)
+        entities = HTMLEntities.new
         doc = Hpricot(content)
         {
-          :title => extract_title(doc).decode_entities.strip,
+          :title => entities.decode(extract_title(doc)).strip,
           :links => extract_links(doc),
-          :content => extract_content(doc).decode_entities
+          :content => entities.decode(extract_content(doc))
         }
       end
@@ -50,12 +52,14 @@ module RDig
       # extracts the href attributes of all a tags, except
       # internal links like <a href="#top">
       def extract_links(doc)
-        (doc/'a').map { |link|
-          href = link['href']
-          CGI.unescapeHTML(href) if href && href !~ /^#/
-        }.compact
+        {'a' => 'href', 'area' => 'href', 'frame' => 'src'}.map do |tag, attr|
+          (doc/tag).map do |tag|
+            value = tag[attr]
+            CGI.unescapeHTML(value) if value && value !~ /^#/
+          end
+        end.flatten.compact
       end
       # Extracts the title from the given html tree
       def extract_title(doc)
         the_title_tag = title_tag(doc)
@@ -85,6 +89,7 @@ module RDig
       def strip_comments(string)
         string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
       end
       def strip_tags(string)
         string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
                                Regexp::MULTILINE, 'u'), ''
@@ -98,4 +103,3 @@ module RDig
   end
 end
-end

data/lib/rdig/crawler.rb CHANGED

@@ -5,7 +5,6 @@ module RDig
     def initialize(config = RDig.config, logger = RDig.logger)
       @documents = Queue.new
-      @etag_filter = ETagFilter.new
       @logger = logger
       @config = config
     end
@@ -22,7 +21,8 @@ module RDig
       # check whether we are indexing on-disk or via http
       url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
       chain_config = RDig.filter_chain[url_type]
+      @etag_filter = ETagFilter.new
       filterchain = UrlFilters::FilterChain.new(chain_config)
       @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
@@ -52,13 +52,21 @@ module RDig
     def process_document(doc, filterchain)
       @logger.debug "processing document #{doc}"
       doc.fetch
-      # add links from this document to the queue
-      doc.content[:links].each { |url|
-        add_url(url, filterchain, doc)
-      } unless doc.content[:links].nil?
-      return unless @etag_filter.apply(doc)
-      add_to_index doc
+      case doc.status
+      when :success
+        if @etag_filter.apply(doc)
+          # add links from this document to the queue
+          doc.content[:links].each { |url|
+            add_url(url, filterchain, doc)
+          } unless doc.content[:links].nil?
+          add_to_index doc
+        end
+      when :redirect
+        @logger.debug "redirect to #{doc.content}"
+        add_url(doc.content, filterchain, doc)
+      else
+        @logger.error "unknown doc status #{doc.status}: #{doc}"
+      end
     rescue
       @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
       @logger.debug "Trace: #{$!.backtrace.join("\n")}"
@@ -110,7 +118,7 @@ module RDig
     end
     def apply(document)
-      return document unless (document.respond_to?(:etag) && document.etag)
+      return document unless (document.respond_to?(:etag) && document.etag && !document.etag.empty?)
       synchronize do
         @etags.add?(document.etag) ? document : nil
       end

data/lib/rdig/documents.rb CHANGED

@@ -118,16 +118,20 @@ module RDig
     def fetch
       RDig.logger.debug "fetching #{@uri.to_s}"
       open(@uri.to_s, RDig::open_uri_http_options) do |doc|
-        case doc.status.first.to_i
-        when 200
-          @etag = doc.meta['etag']
-          # puts "etag: #{@etag}"
-          @content = ContentExtractors.process(doc.read, doc.content_type)
-          @status = :success
-        when 404
-          RDig.logger.info "got 404 for #{@uri}"
+        if @uri.to_s != doc.base_uri.to_s
+          @status = :redirect
+          @content = doc.base_uri
         else
-          RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
+          case doc.status.first.to_i
+          when 200
+            @etag = doc.meta['etag']
+            @content = ContentExtractors.process(doc.read, doc.content_type)
+            @status = :success
+          when 404
+            RDig.logger.info "got 404 for #{@uri}"
+          else
+            RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
+          end
         end
       end
     rescue

data/lib/rdig/url_filters.rb CHANGED

@@ -22,7 +22,7 @@ module RDig
       end
       # add a filter and it's args to the chain
-      # when args is a symbol, it is treated as a configuration key
+      # if args is a symbol, it is treated as a configuration key
       def add(filter, args=nil)
         args = RDig.config.crawler.send(args) if args.is_a? Symbol
         case filter
@@ -163,7 +163,7 @@ module RDig
       return document
     end
-    # expands both href="/path/xyz.html" and href="affe.html"
+    # expands href="/path/xyz.html", href="affe.html" and href="../lala.html"
     # to full urls
     def UrlFilters.fix_relative_uri(document)
       #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
@@ -175,11 +175,13 @@ module RDig
       uri.port = ref.port unless uri.port || ref.port==ref.default_port
       uri.path = ref.path unless uri.path
-      if uri.path !~ /^\//
+      old_uri_path = uri.path
+      if uri.path !~ /^\// || uri.path =~ /^\.\./
         ref_path = ref.path || '/'
         ref_path << '/' if ref_path.empty?
         uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
-      end
+      end
+      uri.path = uri.path.sub( /\/[^\/]*\/\.\./, "" ) if old_uri_path =~ /^\.\./
       return document
     rescue
       p document
@@ -193,12 +195,17 @@ module RDig
       return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
     end
-    def UrlFilters.normalize_uri(document)
+    def UrlFilters.normalize_uri(document, cfg)
       document.uri.fragment = nil
       # document.uri.query = nil
-      # append index document if configured and path ends with a slash
-      if RDig.config.index_document && document.uri.path =~ /\/$/
-        document.uri.path << RDig.config.index_document
+      # trailing slash handling
+      if document.uri.path =~ /\/$/
+        # append index document if configured
+        if cfg.index_document
+          document.uri.path << RDig.config.index_document
+        elsif cfg.remove_trailing_slash
+         document.uri.path.gsub! /\/$/, ''
+        end
       end
       return document
     end

data/rakefile CHANGED

@@ -132,6 +132,7 @@ else
     s.add_dependency('ferret', '>= 0.10.0')
     s.add_dependency('hpricot', '>= 0.6')
+    s.add_dependency('htmlentities', '>= 4.0.0')
     #s.requirements << ""
     #### Which files are to be included in this gem?  Everything!  (Except CVS directories.)
@@ -321,7 +322,7 @@ task :tag => [:prerelease] do
   if ENV['RELTEST']
     announce "Release Task Testing, skipping tagging"
   else
-    sh %{cd ..; svn copy trunk tags/#{reltag}}
+    sh %{svn copy svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/trunk svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/tags/#{reltag}}
   end
 end

data/test/fixtures/html/frameset.html ADDED

@@ -0,0 +1,13 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+	<head>
+    <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
+    <title>Sample &amp; Title</title>
+  </head>
+  <body>
+		<frameset>
+			<frame src="http://test.host/first.html" />
+			<frame src="/second.html" />
+		</frameset>
+  </body>
+</html>

data/test/fixtures/html/imagemap.html ADDED

@@ -0,0 +1,13 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+	<head>
+    <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
+    <title>Sample &amp; Title</title>
+  </head>
+  <body>
+		<map>
+			<area href="http://test.host/first.html" />
+			<area href="/second.html" />
+		</map>
+  </body>
+</html>

data/test/unit/hpricot_content_extractor_test.rb CHANGED

@@ -52,6 +52,18 @@ class HpricotContentExtractorTest < Test::Unit::TestCase
     assert_equal '/inside.html', result[:links][1]
     assert_equal '/footer.html', result[:links][2]
   end
+  def test_extracts_links_from_frameset
+    result = @extractor.process(html_doc('frameset'))
+    assert_equal 'http://test.host/first.html', result[:links].first
+    assert_equal '/second.html', result[:links].last
+  end
+  def test_extracts_links_from_imagemap
+    result = @extractor.process(html_doc('imagemap'))
+    assert_equal 'http://test.host/first.html', result[:links].first
+    assert_equal '/second.html', result[:links].last
+  end
   def test_title_from_dcmeta

data/test/unit/searcher_test.rb CHANGED

@@ -28,8 +28,8 @@ class SearcherTest < Test::Unit::TestCase
   def test_search
     result = RDig.searcher.search 'some sample text'
-    assert_equal 3, result[:hitcount]
-    assert_equal 3, result[:list].size
+    assert_equal 5, result[:hitcount]
+    assert_equal 5, result[:list].size
   end
 end

data/test/unit/url_filters_test.rb CHANGED

@@ -74,7 +74,7 @@ class UrlFiltersTest < Test::Unit::TestCase
     doc = Document.create('http://test.host/dir/file.html')
     assert_equal('http://test.host/dir/another.html',
                   UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
-    assert_equal('http://test.host/dir/../another.html',
+    assert_equal('http://test.host/another.html',
                   UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
     assert_equal('http://test.host/dir/another.html',
                   UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)

metadata CHANGED

@@ -1,7 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.3.5
+  hash: 3
+  prerelease:
+  segments:
+  - 0
+  - 3
+  - 8
+  version: 0.3.8
 platform: ruby
 authors:
 - Jens Kraemer
@@ -9,117 +15,157 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-02-26 00:00:00 +01:00
-default_executable: rdig
+date: 2009-04-26 00:00:00 +02:00
+default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ferret
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.10.0
-    version:
+        hash: 63
+        segments:
+        - 0
+        - 11
+        - 6
+        version: 0.11.6
+  type: :runtime
+  version_requirements: *id001
 - !ruby/object:Gem::Dependency
   name: hpricot
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 7
+        segments:
+        - 0
+        - 6
         version: "0.6"
-    version:
-description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file  for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
+  type: :runtime
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: htmlentities
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 63
+        segments:
+        - 4
+        - 0
+        - 0
+        version: 4.0.0
+  type: :runtime
+  version_requirements: *id003
+description: Website crawler and fulltext indexer.
 email: jk@jkraemer.net
-executables:
-- rdig
+executables: []
 extensions: []
 extra_rdoc_files:
+- History.txt
+- Manifest.txt
 - README
+files:
 - CHANGES
+- History.txt
+- install.rb
 - LICENSE
-- TODO
-files:
+- Manifest.txt
+- rakefile
+- README
 - bin/rdig
-- lib/rdig.rb
-- lib/rdig
-- lib/rdig/url_filters.rb
-- lib/rdig/index.rb
-- lib/rdig/crawler.rb
+- doc/examples/config.rb
+- lib/rdig/content_extractors/doc.rb
+- lib/rdig/content_extractors/hpricot.rb
+- lib/rdig/content_extractors/pdf.rb
 - lib/rdig/content_extractors.rb
+- lib/rdig/crawler.rb
+- lib/rdig/documents.rb
 - lib/rdig/file.rb
 - lib/rdig/highlight.rb
-- lib/rdig/documents.rb
+- lib/rdig/index.rb
 - lib/rdig/search.rb
-- lib/rdig/content_extractors
-- lib/rdig/content_extractors/doc.rb
-- lib/rdig/content_extractors/hpricot.rb
-- lib/rdig/content_extractors/pdf.rb
-- lib/htmlentities
-- lib/htmlentities/htmlentities.rb
-- lib/htmlentities/COPYING
-- lib/htmlentities/CHANGES
-- lib/htmlentities/README
-- test/fixtures
-- test/fixtures/word
-- test/fixtures/word/simple.doc
-- test/fixtures/html
+- lib/rdig/url_filters.rb
+- lib/rdig.rb
 - test/fixtures/html/custom_tag_selectors.html
-- test/fixtures/html/simple.html
 - test/fixtures/html/entities.html
-- test/fixtures/pdf
+- test/fixtures/html/frameset.html
+- test/fixtures/html/imagemap.html
+- test/fixtures/html/simple.html
 - test/fixtures/pdf/simple.pdf
-- test/unit
+- test/fixtures/word/simple.doc
+- test/test_helper.rb
 - test/unit/crawler_fs_test.rb
+- test/unit/etag_filter_test.rb
+- test/unit/file_document_test.rb
+- test/unit/hpricot_content_extractor_test.rb
+- test/unit/http_document_test.rb
 - test/unit/pdf_content_extractor_test.rb
-- test/unit/word_content_extractor_test.rb
 - test/unit/rdig_test.rb
-- test/unit/http_document_test.rb
 - test/unit/searcher_test.rb
-- test/unit/file_document_test.rb
 - test/unit/url_filters_test.rb
-- test/unit/hpricot_content_extractor_test.rb
-- test/unit/etag_filter_test.rb
-- test/test_helper.rb
-- doc/examples
-- doc/examples/config.rb
-- LICENSE
-- TODO
-- CHANGES
-- README
-- install.rb
-- rakefile
+- test/unit/word_content_extractor_test.rb
 has_rdoc: true
-homepage: http://rdig.rubyforge.org/
+homepage: http://github.com/jkraemer/rdig/
+licenses: []
 post_install_message:
 rdoc_options:
-- --title
-- Rake -- Ruby Make
 - --main
 - README
-- --line-numbers
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 requirements: []
 rubyforge_project: rdig
-rubygems_version: 1.0.1
+rubygems_version: 1.5.3
 signing_key:
-specification_version: 2
-summary: Ruby based web site indexing and searching library.
-test_files: []
+specification_version: 3
+summary: Crawler and content extractor for building a full text index of a website's contents. Uses Ferret for indexing.
+test_files:
+- test/fixtures/html/custom_tag_selectors.html
+- test/fixtures/html/entities.html
+- test/fixtures/html/frameset.html
+- test/fixtures/html/imagemap.html
+- test/fixtures/html/simple.html
+- test/fixtures/pdf/simple.pdf
+- test/fixtures/word/simple.doc
+- test/test_helper.rb
+- test/unit/crawler_fs_test.rb
+- test/unit/etag_filter_test.rb
+- test/unit/file_document_test.rb
+- test/unit/hpricot_content_extractor_test.rb
+- test/unit/http_document_test.rb
+- test/unit/pdf_content_extractor_test.rb
+- test/unit/rdig_test.rb
+- test/unit/searcher_test.rb
+- test/unit/url_filters_test.rb
+- test/unit/word_content_extractor_test.rb

data/TODO DELETED

File without changes

data/lib/htmlentities/CHANGES DELETED

@@ -1,21 +0,0 @@
-== 2.2 (2005-11-07)
-* Important bug fixes -- thanks to Moonwolf
-* Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
-* Decimal decoding edge cases addressed.
-* Test cases added.
-== 2.1 (2005-10-31)
-* Removed some unnecessary code in basic entity encoding.
-* Improved handling of encoding: commands are now automatically sorted, so the
-  user doesn't have to worry about their order.
-* Now using setup.rb.
-* Tests moved to separate file.
-== 2.0 (2005-08-23)
-* Added encoding to entities.
-* Decoding interface unchanged.
-* Fixed a bug with handling high codepoints.
-== 1.0 (2005-08-03)
-* Initial release.
-* Decoding only.

data/lib/htmlentities/COPYING DELETED

@@ -1,7 +0,0 @@
-Copyright (c) 2005 Paul Battley
-Usage of the works is permitted provided that this instrument is retained
-with the works, so that any entity that uses the works is notified of this
-instrument.
-DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.

data/lib/htmlentities/README DELETED

@@ -1,15 +0,0 @@
-HTML entity encoding and decoding for Ruby
-This library extends the String class to allow encoding and decoding of
-HTML/XML entities from/to their corresponding UTF-8 codepoints.
-To install (requires root/admin privileges):
-# ruby setup.rb
-To test:
-$ ruby setup.rb test
-Comments are welcome.  Send an email to pbattley @ gmail.com.

data/lib/htmlentities/htmlentities.rb DELETED

@@ -1,281 +0,0 @@
-#
-# HTML entity encoding and decoding for Ruby
-#
-# Author::  Paul BATTLEY (pbattley @ gmail.com)
-# Version:: 2.2
-# Date::    2005-11-07
-#
-# == About
-#
-# This library extends the String class to allow encoding and decoding of
-# HTML/XML entities from/to their corresponding UTF-8 codepoints.
-#
-# == Licence
-#
-# Copyright (c) 2005 Paul Battley
-#
-# Usage of the works is permitted provided that this instrument is retained
-# with the works, so that any entity that uses the works is notified of this
-# instrument.
-#
-# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
-#
-module HTMLEntities
-    VERSION = '2.2'
-    #
-    # MAP is a hash of all the HTML entities I could discover, as taken
-    # from the w3schools page on the subject:
-    # http://www.w3schools.com/html/html_entitiesref.asp
-    # The format is 'entity name' => codepoint where entity name is given
-    # without the surrounding ampersand and semicolon.
-    #
-    MAP = {
-        'quot'      => 34,
-        'apos'      => 39,
-        'amp'       => 38,
-        'lt'        => 60,
-        'gt'        => 62,
-        'nbsp'      => 160,
-        'iexcl'     => 161,
-        'curren'    => 164,
-        'cent'      => 162,
-        'pound'     => 163,
-        'yen'       => 165,
-        'brvbar'    => 166,
-        'sect'      => 167,
-        'uml'       => 168,
-        'copy'      => 169,
-        'ordf'      => 170,
-        'laquo'     => 171,
-        'not'       => 172,
-        'shy'       => 173,
-        'reg'       => 174,
-        'trade'     => 8482,
-        'macr'      => 175,
-        'deg'       => 176,
-        'plusmn'    => 177,
-        'sup2'      => 178,
-        'sup3'      => 179,
-        'acute'     => 180,
-        'micro'     => 181,
-        'para'      => 182,
-        'middot'    => 183,
-        'cedil'     => 184,
-        'sup1'      => 185,
-        'ordm'      => 186,
-        'raquo'     => 187,
-        'frac14'    => 188,
-        'frac12'    => 189,
-        'frac34'    => 190,
-        'iquest'    => 191,
-        'times'     => 215,
-        'divide'    => 247,
-        'Agrave'    => 192,
-        'Aacute'    => 193,
-        'Acirc'     => 194,
-        'Atilde'    => 195,
-        'Auml'      => 196,
-        'Aring'     => 197,
-        'AElig'     => 198,
-        'Ccedil'    => 199,
-        'Egrave'    => 200,
-        'Eacute'    => 201,
-        'Ecirc'     => 202,
-        'Euml'      => 203,
-        'Igrave'    => 204,
-        'Iacute'    => 205,
-        'Icirc'     => 206,
-        'Iuml'      => 207,
-        'ETH'       => 208,
-        'Ntilde'    => 209,
-        'Ograve'    => 210,
-        'Oacute'    => 211,
-        'Ocirc'     => 212,
-        'Otilde'    => 213,
-        'Ouml'      => 214,
-        'Oslash'    => 216,
-        'Ugrave'    => 217,
-        'Uacute'    => 218,
-        'Ucirc'     => 219,
-        'Uuml'      => 220,
-        'Yacute'    => 221,
-        'THORN'     => 222,
-        'szlig'     => 223,
-        'agrave'    => 224,
-        'aacute'    => 225,
-        'acirc'     => 226,
-        'atilde'    => 227,
-        'auml'      => 228,
-        'aring'     => 229,
-        'aelig'     => 230,
-        'ccedil'    => 231,
-        'egrave'    => 232,
-        'eacute'    => 233,
-        'ecirc'     => 234,
-        'euml'      => 235,
-        'igrave'    => 236,
-        'iacute'    => 237,
-        'icirc'     => 238,
-        'iuml'      => 239,
-        'eth'       => 240,
-        'ntilde'    => 241,
-        'ograve'    => 242,
-        'oacute'    => 243,
-        'ocirc'     => 244,
-        'otilde'    => 245,
-        'ouml'      => 246,
-        'oslash'    => 248,
-        'ugrave'    => 249,
-        'uacute'    => 250,
-        'ucirc'     => 251,
-        'uuml'      => 252,
-        'yacute'    => 253,
-        'thorn'     => 254,
-        'yuml'      => 255,
-        'OElig'     => 338,
-        'oelig'     => 339,
-        'Scaron'    => 352,
-        'scaron'    => 353,
-        'Yuml'      => 376,
-        'circ'      => 710,
-        'tilde'     => 732,
-        'ensp'      => 8194,
-        'emsp'      => 8195,
-        'thinsp'    => 8201,
-        'zwnj'      => 8204,
-        'zwj'       => 8205,
-        'lrm'       => 8206,
-        'rlm'       => 8207,
-        'ndash'     => 8211,
-        'mdash'     => 8212,
-        'lsquo'     => 8216,
-        'rsquo'     => 8217,
-        'sbquo'     => 8218,
-        'ldquo'     => 8220,
-        'rdquo'     => 8221,
-        'bdquo'     => 8222,
-        'dagger'    => 8224,
-        'Dagger'    => 8225,
-        'hellip'    => 8230,
-        'permil'    => 8240,
-        'lsaquo'    => 8249,
-        'rsaquo'    => 8250,
-        'euro'      => 8364
-    }
-    MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
-    MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
-    # Precompile the regexp
-    NAMED_ENTITY_REGEXP =
-        /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
-    # Reverse map for converting characters to named entities
-    REVERSE_MAP = MAP.invert
-    BASIC_ENTITY_REGEXP = /[<>'"&]/
-    UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
-end
-class String
-    # Because there's no need to make the user worry about the order here,
-    # let's handle it.
-    ENCODE_ENTITIES_COMMAND_ORDER = {
-        :basic => 0,
-        :named => 1,
-        :decimal => 2,
-        :hexadecimal => 3
-    }
-    #
-    # Decode XML and HTML 4.01 entities in a string into their UTF-8
-    # equivalents.  Obviously, if your string is not already in UTF-8, you'd
-    # better convert it before using this method, or the output will be mixed
-    # up.
-    # Unknown named entities are not converted
-    #
-    def decode_entities
-        return gsub(HTMLEntities::NAMED_ENTITY_REGEXP) {
-            HTMLEntities::MAP.has_key?($1) ? [HTMLEntities::MAP[$1]].pack('U') : $&
-        }.gsub(/&#([0-9]{1,7});/) {
-            [$1.to_i].pack('U')
-        }.gsub(/&#x([0-9a-f]{1,6});/i) {
-            [$1.to_i(16)].pack('U')
-        }
-    end
-    #
-    # Encode codepoints into their corresponding entities.  Various operations
-    # are possible, and may be specified in order:
-    #
-    # :basic :: Convert the five XML entities ('"<>&)
-    # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
-    # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
-    # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. # &#x12ab;)
-    #
-    # You can specify the commands in any order, but they will be executed in
-    # the order listed above to ensure that entity ampersands are not
-    # clobbered and that named entities are replaced before numeric ones.
-    #
-    # If no instructions are specified, :basic will be used.
-    #
-    # Examples:
-    #   str.encode_entities - XML-safe
-    #   str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
-    #   str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
-    #   non-ASCII characters replaced with their named entity where possible, and
-    #   decimal equivalents otherwise.
-    #
-    # Note: It is the program's responsibility to ensure that the string
-    # contains valid UTF-8 before calling this method.
-    #
-    def encode_entities(*instructions)
-        str = nil
-        if (instructions.empty?)
-            instructions = [:basic]
-        else
-            instructions.each do |instr|
-                unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
-                    raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
-                end
-            end
-            instructions.sort! { |a,b|
-                ENCODE_ENTITIES_COMMAND_ORDER[a] <=>
-                ENCODE_ENTITIES_COMMAND_ORDER[b]
-            }
-        end
-        instructions.each do |instruction|
-            case instruction
-            when :basic
-                # Handled as basic ASCII
-                str = (str || self).gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
-                    # It's safe to use the simpler [0] here because we know
-                    # that the basic entities are ASCII.
-                    '&' << HTMLEntities::REVERSE_MAP[$&[0]] << ';'
-                }
-            when :named
-                # Test everything except printable ASCII
-                str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
-                    cp = $&.unpack('U')[0]
-                    (e = HTMLEntities::REVERSE_MAP[cp]) ?  "&#{e};" : $&
-                }
-            when :decimal
-                str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
-                    "&##{$&.unpack('U')[0]};"
-                }
-            when :hexadecimal
-                str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
-                    "&#x#{$&.unpack('U')[0].to_s(16)};"
-                }
-            end
-        end
-        return str
-    end
-end