RubyGems - rdig - Versions diffs - 0.3.3 → 0.3.4 - Mend

rdig 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

data/README +5 -1
data/doc/examples/config.rb +6 -13
data/lib/rdig.rb +1 -1
data/lib/rdig/content_extractors.rb +6 -7
data/lib/rdig/content_extractors/doc.rb +19 -10
data/lib/rdig/content_extractors/hpricot.rb +3 -1
data/lib/rdig/content_extractors/rubyful_soup.rb +5 -1
data/rakefile +1 -1
metadata +12 -4

data/README CHANGED

@@ -5,7 +5,11 @@ to help building a site search for web sites or intranets. Internally,
 Ferret is used for the full text indexing. After creating a config file
 for your site, the index can be built with a single call to rdig.
-RDig depends on Ferret (>= 0.10.0) and the RubyfulSoup library (>= 1.0.4).
+RDig depends on Ferret (>= 0.10.0) and, for parsing HTML, on either
+Hpricot (>= 0.4) or the RubyfulSoup library (>= 1.0.4). Since I know of no
+way to specify such an OR dependency in a gem specification, the gem depends
+on Hpricot. If this is a problem for you, install the gem with --force and
+then manually run +gem install rubyful_soup+.
 == basic usage

data/doc/examples/config.rb CHANGED

@@ -49,28 +49,21 @@ RDig.configuration do |cfg|
   # hpricot config above, and uncomment the following:
   #
   #  :rubyful_soup => OpenStruct.new(
-  #    # select the html element that contains the content to index
-  #    # by default, we index all inside the body tag:
+  #    # provide a method that selects the tag containing the page content you
+  #    # want to index. Useful to avoid indexing common elements like navigation
+  #    # and page footers for every page.
   #    :content_tag_selector => lambda { |tagsoup|
   #      tagsoup.html.body
   #    },
-  #    # select the html element containing the title
+  #    # provide a method that returns the title of an html document
+  #    # this method may either return a tag to extract the title from,
+  #    # or a ready-to-index string.
   #    :title_tag_selector         => lambda { |tagsoup|
   #      tagsoup.html.head.title
   #    }
   #  )
   )
-  # provide a method that returns the title of an html document
-  # this method may either return a tag to extract the title from,
-  # or a ready-to-index string.
-  # cfg.content_extraction.html.title_tag_selector = lambda { |tagsoup| tagsoup.html.head.title }
-  # provide a method that selects the tag containing the page content you
-  # want to index. Useful to avoid indexing common elements like navigation
-  # and page footers for every page.
-  # cfg.content_extraction.html.content_tag_selector = lambda { |tagsoup| tagsoup.html.body }
   # crawler options
   # Notice: for file system crawling the include/exclude_document patterns are

data/lib/rdig.rb CHANGED

@@ -24,7 +24,7 @@
 #++
 #
-RDIGVERSION = '0.3.3'
+RDIGVERSION = '0.3.4'
 require 'thread'

data/lib/rdig/content_extractors.rb CHANGED

@@ -16,15 +16,15 @@ module RDig
       def self.inherited(extractor)
         super(extractor)
-        puts("discovered content extractor class: #{extractor}") if RDig.config.verbose
         self.extractors << extractor
       end
       def self.extractors; @@extractors ||= [] end
       def self.extractor_instances
         @@extractor_instances ||= extractors.map { |ex_class|
-          ex_class.new(RDig.configuration.content_extraction)
-        }
+          puts "initializing content extractor: #{ex_class}" if RDig.configuration.verbose
+          ex_class.new(RDig.configuration.content_extraction) rescue nil
+        }.compact
       end
       def self.process(content, content_type)
@@ -32,7 +32,6 @@ module RDig
           return extractor.process(content) if extractor.can_do(content_type)
         }
         puts "unable to handle content type #{content_type}"
-        nil
       end
       def initialize(config)
@@ -78,8 +77,8 @@ end
 # load content extractors
 Dir["#{File.expand_path(File.dirname(__FILE__))}/content_extractors/**/*.rb"].each do |f|
   begin
-    require f
-  rescue
-    puts "error loading #{f}: #{$!}"
+    require f
+  rescue LoadError
+    puts "could not load #{f}: #{$!}"
   end
 end

data/lib/rdig/content_extractors/doc.rb CHANGED

@@ -13,16 +13,25 @@ module RDig
         @wvhtml = 'wvHtml'
         @pattern = /^application\/msword/
         # html extractor for parsing wvHtml output
-        @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
-            :rubyful_soup => OpenStruct.new(
-              :content_tag_selector => lambda { |tagsoup|
-                tagsoup.html.body
-              },
-              :title_tag_selector         => lambda { |tagsoup|
-                tagsoup.html.head.title
-              }
-            )))
+        if defined?(HpricotContentExtractor)
+          @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
+              :hpricot => OpenStruct.new(
+                :content_tag_selector => 'body',
+                :title_tag_selector   => 'title'
+              )))
+         elsif defined?(RubyfulSoupContentExtractor)
+          @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
+              :rubyful_soup => OpenStruct.new(
+                :content_tag_selector => lambda { |tagsoup|
+                  tagsoup.html.body
+                },
+                :title_tag_selector         => lambda { |tagsoup|
+                  tagsoup.html.head.title
+                }
+              )))
+        else
+          raise "need at least one html content extractor - please install hpricot or rubyful_soup"
+        end
         # TODO: better: if $?.exitstatus == 127 (not found)
         @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
       end

data/lib/rdig/content_extractors/hpricot.rb CHANGED

@@ -5,6 +5,7 @@ rescue LoadError
   require 'hpricot'
 end
+if defined?(Hpricot)
 module RDig
   module ContentExtractors
@@ -24,7 +25,7 @@ module RDig
       def process(content)
         doc = Hpricot(content)
         {
-          :title => extract_title(doc).decode_entities,
+          :title => extract_title(doc).decode_entities.strip,
           :links => extract_links(doc),
           :content => extract_content(doc).decode_entities
         }
@@ -97,3 +98,4 @@ module RDig
   end
 end
+end

data/lib/rdig/content_extractors/rubyful_soup.rb CHANGED

@@ -2,8 +2,10 @@ begin
   require 'rubyful_soup'
 rescue LoadError
   require 'rubygems'
-  require 'rubyful_soup'
+  require 'rubyful_soup' rescue nil
 end
+if defined?(BeautifulSoup)
 # override some methods concerned with entity resolving
 # to convert them to strings
@@ -145,3 +147,5 @@ module RDig
   end
 end
+end

data/rakefile CHANGED

@@ -134,7 +134,7 @@ else
     # TODO: check if there is anything like 'suggested' instead of required, or
     # ORed dependencies...
     #s.add_dependency('rubyful_soup', '>= 1.0.4')
-    #s.add_dependency('hpricot', '>= 0.4')
+    s.add_dependency('hpricot', '>= 0.4')
     #s.requirements << ""
     #### Which files are to be included in this gem?  Everything!  (Except CVS directories.)

metadata CHANGED

@@ -1,10 +1,10 @@
 --- !ruby/object:Gem::Specification
-rubygems_version: 0.9.0
+rubygems_version: 0.8.11
 specification_version: 1
 name: rdig
 version: !ruby/object:Gem::Version
-  version: 0.3.3
-date: 2006-10-23 00:00:00 +02:00
+  version: 0.3.4
+date: 2006-12-31 00:00:00 +01:00
 summary: Ruby based web site indexing and searching library.
 require_paths:
 - lib
@@ -25,7 +25,6 @@ required_ruby_version: !ruby/object:Gem::Version::Requirement
 platform: ruby
 signing_key:
 cert_chain:
-post_install_message:
 authors:
 - Jens Kraemer
 files:
@@ -107,3 +106,12 @@ dependencies:
       - !ruby/object:Gem::Version
         version: 0.10.0
     version:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0.4"
+    version: