rdig 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +6 -0
- data/bin/rdig +0 -17
- data/lib/rdig.rb +5 -1
- data/lib/rdig/content_extractors.rb +133 -18
- data/rakefile +6 -1
- data/test/fixtures/pdf/simple.pdf +0 -0
- data/test/fixtures/word/simple.doc +0 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/html_content_extractor_test.rb +11 -2
- data/test/unit/pdf_content_extractor_test.rb +33 -0
- data/test/unit/word_content_extractor_test.rb +34 -0
- metadata +81 -76
    
        data/CHANGES
    CHANGED
    
    
    
        data/bin/rdig
    CHANGED
    
    | @@ -13,20 +13,3 @@ end | |
| 13 13 | 
             
            RDig.application.run
         | 
| 14 14 |  | 
| 15 15 |  | 
| 16 | 
            -
            #$LOAD_PATH << File.expand_path(File.dirname(__FILE__) + "/lib")
         | 
| 17 | 
            -
            #$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
         | 
| 18 | 
            -
            #require 'init'
         | 
| 19 | 
            -
             | 
| 20 | 
            -
            #if ARGV[0]
         | 
| 21 | 
            -
            #  require ARGV[0]
         | 
| 22 | 
            -
            #else
         | 
| 23 | 
            -
            #  require 'config'
         | 
| 24 | 
            -
            #end
         | 
| 25 | 
            -
             | 
| 26 | 
            -
            #include SiteSearch
         | 
| 27 | 
            -
             | 
| 28 | 
            -
             | 
| 29 | 
            -
            #puts "creating new index in #{SiteSearch.settings[:index_dir]}"
         | 
| 30 | 
            -
             | 
| 31 | 
            -
            #crawler = Crawler.new
         | 
| 32 | 
            -
            #crawler.run
         | 
    
        data/lib/rdig.rb
    CHANGED
    
    | @@ -24,7 +24,7 @@ | |
| 24 24 | 
             
            #++
         | 
| 25 25 | 
             
            #
         | 
| 26 26 |  | 
| 27 | 
            -
            RDIGVERSION = '0. | 
| 27 | 
            +
            RDIGVERSION = '0.2.0'
         | 
| 28 28 |  | 
| 29 29 |  | 
| 30 30 | 
             
            require 'thread'
         | 
| @@ -37,6 +37,10 @@ require 'cgi' | |
| 37 37 | 
             
            require 'set'
         | 
| 38 38 | 
             
            require 'net/http'
         | 
| 39 39 | 
             
            require 'getoptlong'
         | 
| 40 | 
            +
            require 'tempfile'
         | 
| 41 | 
            +
            # mkmf gives us the handy find_executable method used to check for helper
         | 
| 42 | 
            +
            # programs:
         | 
| 43 | 
            +
            require 'mkmf'      
         | 
| 40 44 |  | 
| 41 45 | 
             
            begin
         | 
| 42 46 | 
             
              require 'rubyful_soup'
         | 
| @@ -23,31 +23,146 @@ end | |
| 23 23 |  | 
| 24 24 | 
             
            module RDig
         | 
| 25 25 |  | 
| 26 | 
            -
              # Contains  | 
| 26 | 
            +
              # Contains classes which are used for extracting content and meta data from
         | 
| 27 27 | 
             
              # various content types.
         | 
| 28 | 
            -
              #
         | 
| 29 | 
            -
              # TODO: support at least pdf, too.
         | 
| 30 28 | 
             
              module ContentExtractors
         | 
| 31 29 |  | 
| 32 30 | 
             
                # process the given +content+ depending on its +content_type+.
         | 
| 33 | 
            -
                def  | 
| 34 | 
            -
                   | 
| 35 | 
            -
                   | 
| 36 | 
            -
             | 
| 37 | 
            -
                   | 
| 31 | 
            +
                def self.process(content, content_type)
         | 
| 32 | 
            +
                  ContentExtractor.process(content, content_type)
         | 
| 33 | 
            +
                  #      case content_type
         | 
| 34 | 
            +
                  #when /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
         | 
| 35 | 
            +
                  #  return HtmlContentExtractor.process(content)
         | 
| 36 | 
            +
                  #when /^application\/.+pdf/
         | 
| 37 | 
            +
                  #  return PdfContentExtractor.process(content) unless RDig.config.content_extraction.pdf.disabled
         | 
| 38 | 
            +
                  #else
         | 
| 39 | 
            +
                  #  puts "unable to handle content type #{content_type}"
         | 
| 40 | 
            +
                  #end
         | 
| 41 | 
            +
                  #return nil
         | 
| 42 | 
            +
                end
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                # Base class for Content Extractors.
         | 
| 45 | 
            +
                # Extractors inheriting from this class will be auto-discovered and used
         | 
| 46 | 
            +
                # when can_do returns true
         | 
| 47 | 
            +
                class ContentExtractor
         | 
| 48 | 
            +
                  
         | 
| 49 | 
            +
                  def self.inherited(extractor)
         | 
| 50 | 
            +
                    super(extractor)
         | 
| 51 | 
            +
                    puts("discovered content extractor class: #{extractor}")
         | 
| 52 | 
            +
                    self.extractors << extractor
         | 
| 53 | 
            +
                  end
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                  def self.extractors; @@extractors ||= [] end
         | 
| 56 | 
            +
                  def self.extractor_instances
         | 
| 57 | 
            +
                    @@extractor_instances ||= extractors.map { |ex_class| ex_class.new }
         | 
| 58 | 
            +
                  end
         | 
| 59 | 
            +
                  
         | 
| 60 | 
            +
                  def self.process(content, content_type)
         | 
| 61 | 
            +
                    self.extractor_instances.each { |extractor|
         | 
| 62 | 
            +
                      return extractor.process(content) if extractor.can_do(content_type)
         | 
| 63 | 
            +
                    }
         | 
| 38 64 | 
             
                    puts "unable to handle content type #{content_type}"
         | 
| 65 | 
            +
                    nil
         | 
| 66 | 
            +
                  end
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                  def can_do(content_type)
         | 
| 69 | 
            +
                    content_type =~ @pattern
         | 
| 39 70 | 
             
                  end
         | 
| 40 | 
            -
             | 
| 71 | 
            +
                end
         | 
| 72 | 
            +
             | 
| 73 | 
            +
             | 
| 74 | 
            +
                # to be used by concrete implementations having a get_content instance method
         | 
| 75 | 
            +
                # that takes a path to a file and returns the textual content extracted from
         | 
| 76 | 
            +
                # that file.
         | 
| 77 | 
            +
                module ExternalAppHelper
         | 
| 78 | 
            +
                  def process(content)
         | 
| 79 | 
            +
                    result = {}
         | 
| 80 | 
            +
                    as_file(content) do |file|
         | 
| 81 | 
            +
                      result[:content] = get_content(file.path).strip
         | 
| 82 | 
            +
                    end
         | 
| 83 | 
            +
                    result
         | 
| 84 | 
            +
                  end
         | 
| 85 | 
            +
                  
         | 
| 86 | 
            +
                  def as_file(content)
         | 
| 87 | 
            +
                    file = Tempfile.new('rdig')
         | 
| 88 | 
            +
                    file << content
         | 
| 89 | 
            +
                    file.close
         | 
| 90 | 
            +
                    yield file
         | 
| 91 | 
            +
                    file.delete
         | 
| 92 | 
            +
                  end
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                  def available
         | 
| 95 | 
            +
                    if @available.nil?
         | 
| 96 | 
            +
                      @available = !find_executable(@executable).nil?
         | 
| 97 | 
            +
                    end
         | 
| 98 | 
            +
                    @available
         | 
| 99 | 
            +
                  end
         | 
| 100 | 
            +
             | 
| 101 | 
            +
                  def can_do(content_type)
         | 
| 102 | 
            +
                    available and super(content_type)
         | 
| 103 | 
            +
                  end
         | 
| 104 | 
            +
                end
         | 
| 105 | 
            +
             | 
| 106 | 
            +
                # Extract text from pdf content.
         | 
| 107 | 
            +
                #
         | 
| 108 | 
            +
                # Requires the pdftotext utility from the xpdf-utils package
         | 
| 109 | 
            +
                # (on debian and friends do 'apt-get install xpdf-utils')
         | 
| 110 | 
            +
                #
         | 
| 111 | 
            +
                # TODO: use pdfinfo to get title from document
         | 
| 112 | 
            +
                class PdfContentExtractor < ContentExtractor
         | 
| 113 | 
            +
                  include ExternalAppHelper
         | 
| 114 | 
            +
                  
         | 
| 115 | 
            +
                  def initialize
         | 
| 116 | 
            +
                    @executable = 'pdftotext'
         | 
| 117 | 
            +
                    @pattern = /^application\/pdf/
         | 
| 118 | 
            +
                  end
         | 
| 119 | 
            +
                  
         | 
| 120 | 
            +
                  def get_content(path_to_tempfile)
         | 
| 121 | 
            +
                    %x{#{@executable} '#{path_to_tempfile}' -}
         | 
| 122 | 
            +
                  end
         | 
| 123 | 
            +
                end
         | 
| 124 | 
            +
             | 
| 125 | 
            +
                # Extract text from word documents
         | 
| 126 | 
            +
                #
         | 
| 127 | 
            +
                # Requires the wvHtml utility from the wv package
         | 
| 128 | 
            +
                # (on debian and friends do 'apt-get install wv')
         | 
| 129 | 
            +
                class WordContentExtractor < ContentExtractor
         | 
| 130 | 
            +
                  include ExternalAppHelper
         | 
| 131 | 
            +
                  
         | 
| 132 | 
            +
                  def initialize
         | 
| 133 | 
            +
                    @executable = 'wvHtml'
         | 
| 134 | 
            +
                    @pattern = /^application\/msword/
         | 
| 135 | 
            +
                    @html_extractor = HtmlContentExtractor.new
         | 
| 136 | 
            +
                  end
         | 
| 137 | 
            +
                  
         | 
| 138 | 
            +
                  def process(content)
         | 
| 139 | 
            +
                    result = {}
         | 
| 140 | 
            +
                    as_file(content) do |infile|  
         | 
| 141 | 
            +
                      outfile = Tempfile.new('rdig')
         | 
| 142 | 
            +
                      outfile.close
         | 
| 143 | 
            +
                      %x{#{@executable} --targetdir='#{File.dirname(outfile.path)}' '#{infile.path}' '#{File.basename(outfile.path)}'}
         | 
| 144 | 
            +
                      File.open(outfile.path) do |html|
         | 
| 145 | 
            +
                        result = @html_extractor.process(html.read)
         | 
| 146 | 
            +
                      end
         | 
| 147 | 
            +
                      outfile.delete
         | 
| 148 | 
            +
                    end
         | 
| 149 | 
            +
                    return result || {}
         | 
| 150 | 
            +
                  end
         | 
| 151 | 
            +
                  
         | 
| 41 152 | 
             
                end
         | 
| 42 153 |  | 
| 43 154 | 
             
                # extracts title, content and links from html documents
         | 
| 44 | 
            -
                class HtmlContentExtractor
         | 
| 155 | 
            +
                class HtmlContentExtractor < ContentExtractor
         | 
| 156 | 
            +
             | 
| 157 | 
            +
                  def initialize
         | 
| 158 | 
            +
                    @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
         | 
| 159 | 
            +
                  end
         | 
| 45 160 |  | 
| 46 161 | 
             
                  # returns: 
         | 
| 47 162 | 
             
                  # { :content => 'extracted clear text',
         | 
| 48 163 | 
             
                  #   :meta => { :title => 'Title' },
         | 
| 49 164 | 
             
                  #   :links => [array of urls] }
         | 
| 50 | 
            -
                  def  | 
| 165 | 
            +
                  def process(content)
         | 
| 51 166 | 
             
                    result = { }
         | 
| 52 167 | 
             
                    tag_soup = BeautifulSoup.new(content)
         | 
| 53 168 | 
             
                    result[:title] = extract_title(tag_soup)
         | 
| @@ -64,7 +179,7 @@ module RDig | |
| 64 179 | 
             
                  # - Then, this element is processed by +extract_text+, which will give
         | 
| 65 180 | 
             
                  # all textual content contained in the root element and all its
         | 
| 66 181 | 
             
                  # children.
         | 
| 67 | 
            -
                  def  | 
| 182 | 
            +
                  def extract_content(tag_soup)
         | 
| 68 183 | 
             
                    content = ''
         | 
| 69 184 | 
             
                    content_element(tag_soup).children { |child| 
         | 
| 70 185 | 
             
                      extract_text(child, content)
         | 
| @@ -74,14 +189,14 @@ module RDig | |
| 74 189 |  | 
| 75 190 | 
             
                  # extracts the href attributes of all a tags, except 
         | 
| 76 191 | 
             
                  # internal links like <a href="#top">
         | 
| 77 | 
            -
                  def  | 
| 192 | 
            +
                  def extract_links(tagsoup)
         | 
| 78 193 | 
             
                    tagsoup.find_all('a').map { |link|
         | 
| 79 194 | 
             
                      CGI.unescapeHTML(link['href']) if (link['href'] && link['href'] !~ /^#/)
         | 
| 80 195 | 
             
                    }.compact
         | 
| 81 196 | 
             
                  end
         | 
| 82 197 |  | 
| 83 198 | 
             
                  # Extracts the title from the given html tree
         | 
| 84 | 
            -
                  def  | 
| 199 | 
            +
                  def extract_title(tagsoup)
         | 
| 85 200 | 
             
                    title = ''
         | 
| 86 201 | 
             
                    the_title_tag = title_tag(tagsoup)
         | 
| 87 202 | 
             
                    if the_title_tag.is_a? String
         | 
| @@ -93,7 +208,7 @@ module RDig | |
| 93 208 |  | 
| 94 209 | 
             
                  # Recursively extracts all text contained in the given element, 
         | 
| 95 210 | 
             
                  # and appends it to content.
         | 
| 96 | 
            -
                  def  | 
| 211 | 
            +
                  def extract_text(element, content='')
         | 
| 97 212 | 
             
                    if element.is_a? NavigableString
         | 
| 98 213 | 
             
                      value = strip_comments(element)
         | 
| 99 214 | 
             
                      value.strip!
         | 
| @@ -118,7 +233,7 @@ module RDig | |
| 118 233 | 
             
                  #
         | 
| 119 234 | 
             
                  # This may return a string, e.g. an attribute value selected from a meta
         | 
| 120 235 | 
             
                  # tag, too.
         | 
| 121 | 
            -
                  def  | 
| 236 | 
            +
                  def title_tag(tagsoup)
         | 
| 122 237 | 
             
                    if RDig.config.content_extraction.html.title_tag_selector
         | 
| 123 238 | 
             
                      RDig.config.content_extraction.html.title_tag_selector.call(tagsoup)
         | 
| 124 239 | 
             
                    else 
         | 
| @@ -127,7 +242,7 @@ module RDig | |
| 127 242 | 
             
                  end
         | 
| 128 243 |  | 
| 129 244 | 
             
                  # Retrieve the root element to extract document content from
         | 
| 130 | 
            -
                  def  | 
| 245 | 
            +
                  def content_element(tagsoup)
         | 
| 131 246 | 
             
                    if RDig.config.content_extraction.html.content_tag_selector
         | 
| 132 247 | 
             
                      RDig.config.content_extraction.html.content_tag_selector.call(tagsoup)
         | 
| 133 248 | 
             
                    else
         | 
| @@ -136,7 +251,7 @@ module RDig | |
| 136 251 | 
             
                  end
         | 
| 137 252 |  | 
| 138 253 | 
             
                  # Return the given string minus all html comments
         | 
| 139 | 
            -
                  def  | 
| 254 | 
            +
                  def strip_comments(string)
         | 
| 140 255 | 
             
                    string.gsub(Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), '')
         | 
| 141 256 | 
             
                  end
         | 
| 142 257 | 
             
                end
         | 
    
        data/rakefile
    CHANGED
    
    | @@ -94,7 +94,7 @@ Rake::TestTask.new("test_functional") { |t| | |
| 94 94 | 
             
            # Generate the RDoc documentation ----------------------------------------
         | 
| 95 95 |  | 
| 96 96 | 
             
            rd = Rake::RDocTask.new { |rdoc|
         | 
| 97 | 
            -
              rdoc.rdoc_dir = ' | 
| 97 | 
            +
              rdoc.rdoc_dir = 'html'
         | 
| 98 98 | 
             
              rdoc.title    = "RDig - Ferret based full text search for web sites"
         | 
| 99 99 | 
             
              rdoc.options << '--line-numbers' << '--inline-source'
         | 
| 100 100 | 
             
              rdoc.options << '--main' << 'README'
         | 
| @@ -323,3 +323,8 @@ task :tag => [:prerelease] do | |
| 323 323 | 
             
              end
         | 
| 324 324 | 
             
            end
         | 
| 325 325 |  | 
| 326 | 
            +
            # Publish RDocs ------------------------------------------------------
         | 
| 327 | 
            +
            desc "Publish the API documentation"
         | 
| 328 | 
            +
            task :pdoc => [:rdoc] do
         | 
| 329 | 
            +
              Rake::RubyForgePublisher.new(RUBY_FORGE_PROJECT, RUBY_FORGE_USER).upload
         | 
| 330 | 
            +
            end
         | 
| Binary file | 
| Binary file | 
    
        data/test/test_helper.rb
    CHANGED
    
    
| @@ -3,7 +3,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase | |
| 3 3 | 
             
              include TestHelper
         | 
| 4 4 |  | 
| 5 5 | 
             
              def setup
         | 
| 6 | 
            -
                @extractor = ContentExtractors::HtmlContentExtractor
         | 
| 6 | 
            +
                @extractor = ContentExtractors::HtmlContentExtractor.new
         | 
| 7 7 | 
             
                @nbsp = [160].pack('U') # non breaking space
         | 
| 8 8 | 
             
                @config_backup = RDig.config.content_extraction.html.clone
         | 
| 9 9 | 
             
              end
         | 
| @@ -12,8 +12,17 @@ class HtmlContentExtractorTest < Test::Unit::TestCase | |
| 12 12 | 
             
                RDig.config.content_extraction.html = @config_backup
         | 
| 13 13 | 
             
              end
         | 
| 14 14 |  | 
| 15 | 
            +
              def test_can_do
         | 
| 16 | 
            +
                assert !@extractor.can_do('application/pdf')
         | 
| 17 | 
            +
                assert !@extractor.can_do('application/msword')
         | 
| 18 | 
            +
                assert @extractor.can_do('text/html')
         | 
| 19 | 
            +
                assert @extractor.can_do('text/xml')
         | 
| 20 | 
            +
                assert @extractor.can_do('application/xml')
         | 
| 21 | 
            +
                assert @extractor.can_do('application/xhtml+xml')
         | 
| 22 | 
            +
              end
         | 
| 23 | 
            +
             
         | 
| 15 24 | 
             
              def test_simple
         | 
| 16 | 
            -
                result =  | 
| 25 | 
            +
                result = ContentExtractors.process(html_doc('simple'), 'text/html')
         | 
| 17 26 | 
             
                assert_not_nil result
         | 
| 18 27 | 
             
                assert_equal 'Sample Title', result[:title]
         | 
| 19 28 | 
             
                assert_not_nil result[:content]
         | 
| @@ -0,0 +1,33 @@ | |
| 1 | 
            +
            require 'test_helper'
         | 
| 2 | 
            +
            class PdfContentExtractorTest < Test::Unit::TestCase
         | 
| 3 | 
            +
              include TestHelper
         | 
| 4 | 
            +
             | 
| 5 | 
            +
              def setup
         | 
| 6 | 
            +
                @ce = ContentExtractors::PdfContentExtractor.new
         | 
| 7 | 
            +
              end
         | 
| 8 | 
            +
             | 
| 9 | 
            +
              def test_can_do
         | 
| 10 | 
            +
                assert @ce.can_do('application/pdf')
         | 
| 11 | 
            +
                assert !@ce.can_do('application/msword')
         | 
| 12 | 
            +
              end
         | 
| 13 | 
            +
              def test_simple_with_ctype
         | 
| 14 | 
            +
                result = ContentExtractors.process(pdf_doc('simple'), 'application/pdf')
         | 
| 15 | 
            +
                check_content(result)
         | 
| 16 | 
            +
              end
         | 
| 17 | 
            +
              
         | 
| 18 | 
            +
              def test_simple
         | 
| 19 | 
            +
                result = @ce.process(pdf_doc('simple'))
         | 
| 20 | 
            +
                check_content(result)
         | 
| 21 | 
            +
              end
         | 
| 22 | 
            +
             | 
| 23 | 
            +
              private
         | 
| 24 | 
            +
              def check_content(result)
         | 
| 25 | 
            +
                assert_not_nil result
         | 
| 26 | 
            +
                assert_nil result[:title]
         | 
| 27 | 
            +
                assert_nil result[:links]
         | 
| 28 | 
            +
                assert_not_nil result[:content]
         | 
| 29 | 
            +
                assert_equal 'This is for testing PDF extraction. Another Paragraph.', result[:content]
         | 
| 30 | 
            +
              end
         | 
| 31 | 
            +
              
         | 
| 32 | 
            +
            end
         | 
| 33 | 
            +
             | 
| @@ -0,0 +1,34 @@ | |
| 1 | 
            +
            require 'test_helper'
         | 
| 2 | 
            +
            class WordContentExtractorTest < Test::Unit::TestCase
         | 
| 3 | 
            +
              include TestHelper
         | 
| 4 | 
            +
             | 
| 5 | 
            +
              def setup
         | 
| 6 | 
            +
                @ce = ContentExtractors::WordContentExtractor.new
         | 
| 7 | 
            +
              end
         | 
| 8 | 
            +
             | 
| 9 | 
            +
              def test_can_do
         | 
| 10 | 
            +
                assert !@ce.can_do('application/pdf')
         | 
| 11 | 
            +
                assert @ce.can_do('application/msword')
         | 
| 12 | 
            +
              end
         | 
| 13 | 
            +
              def test_simple_with_ctype
         | 
| 14 | 
            +
                result = ContentExtractors.process(word_doc('simple'), 'application/msword')
         | 
| 15 | 
            +
                check_content(result)
         | 
| 16 | 
            +
              end
         | 
| 17 | 
            +
              
         | 
| 18 | 
            +
              def test_simple
         | 
| 19 | 
            +
                result = @ce.process(word_doc('simple'))
         | 
| 20 | 
            +
                check_content(result)
         | 
| 21 | 
            +
              end
         | 
| 22 | 
            +
             | 
| 23 | 
            +
              private
         | 
| 24 | 
            +
              def check_content(result)
         | 
| 25 | 
            +
                assert_not_nil result
         | 
| 26 | 
            +
                assert_equal [], result[:links]
         | 
| 27 | 
            +
                assert_not_nil result[:title]
         | 
| 28 | 
            +
                assert_equal 'Untitled', result[:title]
         | 
| 29 | 
            +
                assert_not_nil result[:content]
         | 
| 30 | 
            +
                assert_equal 'Test content for Word content extraction. Another paragraph.', result[:content]
         | 
| 31 | 
            +
              end
         | 
| 32 | 
            +
              
         | 
| 33 | 
            +
            end
         | 
| 34 | 
            +
             | 
    
        metadata
    CHANGED
    
    | @@ -1,102 +1,107 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 | 
            -
            rubygems_version: 0.8. | 
| 2 | 
            +
            rubygems_version: 0.8.11
         | 
| 3 3 | 
             
            specification_version: 1
         | 
| 4 4 | 
             
            name: rdig
         | 
| 5 5 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 6 | 
            -
              version: 0. | 
| 7 | 
            -
            date: 2006- | 
| 6 | 
            +
              version: 0.2.0
         | 
| 7 | 
            +
            date: 2006-04-19 00:00:00 +02:00
         | 
| 8 8 | 
             
            summary: Ruby based web site indexing and searching library.
         | 
| 9 9 | 
             
            require_paths: 
         | 
| 10 | 
            -
             | 
| 10 | 
            +
            - lib
         | 
| 11 11 | 
             
            email: jk@jkraemer.net
         | 
| 12 12 | 
             
            homepage: http://rdig.rubyforge.org/
         | 
| 13 13 | 
             
            rubyforge_project: rdig
         | 
| 14 | 
            -
            description:  | 
| 15 | 
            -
              a site search for web sites or intranets. Internally, Ferret is used for the
         | 
| 16 | 
            -
              full text indexing. After creating a config file  for your site, the index can
         | 
| 17 | 
            -
              be built with a single call to rdig."
         | 
| 14 | 
            +
            description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file  for your site, the index can be built with a single call to rdig.
         | 
| 18 15 | 
             
            autorequire: 
         | 
| 19 16 | 
             
            default_executable: rdig
         | 
| 20 17 | 
             
            bindir: bin
         | 
| 21 18 | 
             
            has_rdoc: true
         | 
| 22 19 | 
             
            required_ruby_version: !ruby/object:Gem::Version::Requirement 
         | 
| 23 20 | 
             
              requirements: 
         | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
                   | 
| 27 | 
            -
                    version: 0.0.0
         | 
| 21 | 
            +
              - - ">"
         | 
| 22 | 
            +
                - !ruby/object:Gem::Version 
         | 
| 23 | 
            +
                  version: 0.0.0
         | 
| 28 24 | 
             
              version: 
         | 
| 29 25 | 
             
            platform: ruby
         | 
| 26 | 
            +
            signing_key: 
         | 
| 27 | 
            +
            cert_chain: 
         | 
| 30 28 | 
             
            authors: 
         | 
| 31 | 
            -
             | 
| 29 | 
            +
            - Jens Kraemer
         | 
| 32 30 | 
             
            files: 
         | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 31 | 
            +
            - bin/rdig
         | 
| 32 | 
            +
            - lib/rdig
         | 
| 33 | 
            +
            - lib/htmlentities
         | 
| 34 | 
            +
            - lib/rdig.rb
         | 
| 35 | 
            +
            - lib/rdig/http_client.rb
         | 
| 36 | 
            +
            - lib/rdig/crawler.rb
         | 
| 37 | 
            +
            - lib/rdig/search.rb
         | 
| 38 | 
            +
            - lib/rdig/highlight.rb
         | 
| 39 | 
            +
            - lib/rdig/index.rb
         | 
| 40 | 
            +
            - lib/rdig/url_filters.rb
         | 
| 41 | 
            +
            - lib/rdig/content_extractors.rb
         | 
| 42 | 
            +
            - lib/htmlentities/CHANGES
         | 
| 43 | 
            +
            - lib/htmlentities/COPYING
         | 
| 44 | 
            +
            - lib/htmlentities/README
         | 
| 45 | 
            +
            - lib/htmlentities/htmlentities.rb
         | 
| 46 | 
            +
            - test/unit
         | 
| 47 | 
            +
            - test/fixtures
         | 
| 48 | 
            +
            - test/test_helper.rb
         | 
| 49 | 
            +
            - test/unit/etag_filter_test.rb
         | 
| 50 | 
            +
            - test/unit/url_filters_test.rb
         | 
| 51 | 
            +
            - test/unit/html_content_extractor_test.rb
         | 
| 52 | 
            +
            - test/unit/pdf_content_extractor_test.rb
         | 
| 53 | 
            +
            - test/unit/word_content_extractor_test.rb
         | 
| 54 | 
            +
            - test/fixtures/html
         | 
| 55 | 
            +
            - test/fixtures/pdf
         | 
| 56 | 
            +
            - test/fixtures/word
         | 
| 57 | 
            +
            - test/fixtures/html/entities.html
         | 
| 58 | 
            +
            - test/fixtures/html/simple.html
         | 
| 59 | 
            +
            - test/fixtures/html/custom_tag_selectors.html
         | 
| 60 | 
            +
            - test/fixtures/pdf/simple.pdf
         | 
| 61 | 
            +
            - test/fixtures/word/simple.doc
         | 
| 62 | 
            +
            - doc/examples
         | 
| 63 | 
            +
            - doc/examples/config.rb
         | 
| 64 | 
            +
            - LICENSE
         | 
| 65 | 
            +
            - TODO
         | 
| 66 | 
            +
            - CHANGES
         | 
| 67 | 
            +
            - README
         | 
| 68 | 
            +
            - install.rb
         | 
| 69 | 
            +
            - rakefile
         | 
| 66 70 | 
             
            test_files: []
         | 
| 71 | 
            +
             | 
| 67 72 | 
             
            rdoc_options: 
         | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            +
            - --title
         | 
| 74 | 
            +
            - Rake -- Ruby Make
         | 
| 75 | 
            +
            - --main
         | 
| 76 | 
            +
            - README
         | 
| 77 | 
            +
            - --line-numbers
         | 
| 73 78 | 
             
            extra_rdoc_files: 
         | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 79 | 
            +
            - README
         | 
| 80 | 
            +
            - CHANGES
         | 
| 81 | 
            +
            - LICENSE
         | 
| 82 | 
            +
            - TODO
         | 
| 78 83 | 
             
            executables: 
         | 
| 79 | 
            -
             | 
| 84 | 
            +
            - rdig
         | 
| 80 85 | 
             
            extensions: []
         | 
| 86 | 
            +
             | 
| 81 87 | 
             
            requirements: []
         | 
| 88 | 
            +
             | 
| 82 89 | 
             
            dependencies: 
         | 
| 83 | 
            -
             | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
               | 
| 94 | 
            -
             | 
| 95 | 
            -
             | 
| 96 | 
            -
                 | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
| 101 | 
            -
                        version: 1.0.4
         | 
| 102 | 
            -
                  version: 
         | 
| 90 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 91 | 
            +
              name: ferret
         | 
| 92 | 
            +
              version_requirement: 
         | 
| 93 | 
            +
              version_requirements: !ruby/object:Gem::Version::Requirement 
         | 
| 94 | 
            +
                requirements: 
         | 
| 95 | 
            +
                - - ">="
         | 
| 96 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 97 | 
            +
                    version: 0.3.2
         | 
| 98 | 
            +
                version: 
         | 
| 99 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 100 | 
            +
              name: rubyful_soup
         | 
| 101 | 
            +
              version_requirement: 
         | 
| 102 | 
            +
              version_requirements: !ruby/object:Gem::Version::Requirement 
         | 
| 103 | 
            +
                requirements: 
         | 
| 104 | 
            +
                - - ">="
         | 
| 105 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 106 | 
            +
                    version: 1.0.4
         | 
| 107 | 
            +
                version: 
         |