RubyGems - rdig - Versions diffs - 0.3.9 → 0.3.10 - Mend

rdig 0.3.9 → 0.3.10

Files changed (12)

data/LICENSE +1 -1
data/{README → README.rdoc} +1 -1
data/bin/rdig +0 -0
data/lib/rdig.rb +6 -7
data/lib/rdig/content_extractors.rb +4 -4
data/lib/rdig/content_extractors/pdf.rb +3 -3
data/lib/rdig/crawler.rb +17 -14
data/lib/rdig/documents.rb +8 -8
data/lib/rdig/index.rb +9 -5
data/lib/rdig/search.rb +19 -9
data/lib/rdig/url_filters.rb +26 -7
metadata +4 -4

data/LICENSE CHANGED

@@ -1,4 +1,4 @@
-Copyright (c) 2006 Jens Kraemer
+Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/{README → README.rdoc} RENAMED

@@ -25,7 +25,7 @@ manually do a +gem install rubyful_soup+.
 === Handle search in your application:
   require 'rdig'
   require 'rdig_config'   # load your config file here
-  search_results = RDig.searcher.search(query, options={})
+  search_results = RDig.searcher.search(query)
 see RDig::Search::Searcher for more information.

data/bin/rdig CHANGED

File without changes

data/lib/rdig.rb CHANGED

@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 #--
-# Copyright (c) 2006 Jens Kraemer
+# Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -84,7 +84,7 @@ module RDig
           { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
         ]
       }
     end
     def application
@@ -150,7 +150,7 @@ module RDig
       end
     end
     alias config configuration
     # Returns the logger, building it with create_logger on first access and
     # caching it in @logger for subsequent calls.
     def logger
       @logger ||= create_logger
     end
@@ -200,12 +200,12 @@ module RDig
     # Returns the options holder, lazily initialised to an empty OpenStruct
     # on first access and cached in @options.
     def options
       @options ||= OpenStruct.new
     end
     # Display the program usage line.
     def usage
       puts "rdig -c configfile {options}"
     end
     # Display the rake command line help.
     def help
       usage
@@ -266,8 +266,7 @@ module RDig
       rescue
         puts $!.backtrace
         fail "No Configfile found!\n#{$!}"
-      end
+      end
       puts "using Ferret #{Ferret::VERSION}"

data/lib/rdig/content_extractors.rb CHANGED

@@ -1,5 +1,5 @@
 module RDig
   # Contains classes which are used for extracting content and meta data from
   # various content types.
   module ContentExtractors
@@ -13,7 +13,7 @@ module RDig
     # Extractors inheriting from this class will be auto-discovered and used
     # when can_do returns true
     class ContentExtractor
       def self.inherited(extractor)
         super(extractor)
         self.extractors << extractor
@@ -32,7 +32,7 @@ module RDig
           ex
         }.compact
       end
       def self.process(content, content_type)
         self.extractor_instances.each { |extractor|
           return extractor.process(content) if extractor.can_do(content_type)
@@ -61,7 +61,7 @@ module RDig
         end
         result
       end
       def as_file(content)
         file = Tempfile.new('rdig')
         file << content

data/lib/rdig/content_extractors/pdf.rb CHANGED

@@ -8,7 +8,7 @@ module RDig
     #
     class PdfContentExtractor < ContentExtractor
       include ExternalAppHelper
       def initialize(config)
         super(config)
         @pattern = /^application\/pdf/
@@ -22,7 +22,7 @@ module RDig
           end
         }
       end
       def process(content)
         result = {}
         as_file(content) do |file|
@@ -35,7 +35,7 @@ module RDig
       # Runs the command stored in @pdftotext on the given temp file path
       # through a shell and returns whatever the command printed.
       # The path is interpolated into the command line inside single quotes.
       # NOTE(review): the trailing '-' presumably tells pdftotext to write to
       # stdout -- confirm against the pdftotext manual.
       def get_content(path_to_tempfile)
         %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
       end
       # extracts the title from pdf meta data
       # needs pdfinfo
       # returns the title or nil if no title was found

data/lib/rdig/crawler.rb CHANGED

@@ -1,12 +1,11 @@
 module RDig
   class Crawler
     def initialize(config = RDig.config, logger = RDig.logger)
       @documents = Queue.new
       @logger = logger
       @config = config
+      @indexed_documents = 0
     end
     def run
@@ -22,6 +21,7 @@ module RDig
       url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
       chain_config = RDig.filter_chain[url_type]
+      # the etag filter operates on the fetched document, that's why we cannot put it into the filter chain right now.
       @etag_filter = ETagFilter.new
       filterchain = UrlFilters::FilterChain.new(chain_config)
       @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
@@ -31,9 +31,11 @@ module RDig
       num_threads.times { |i|
         group.join_nowait Thread.new("fetcher #{i}") {
           filterchain = UrlFilters::FilterChain.new(chain_config)
+          @logger.info "thread #{i} running..."
           while (doc = @documents.pop) != :exit
             process_document doc, filterchain
           end
+          @logger.info "thread #{i} is done."
         }
       }
@@ -47,20 +49,21 @@ module RDig
       @logger.info "waiting for threads to finish..."
       group.all_waits
+      @logger.info "indexed #{@indexer.indexed_documents} documents"
     end
     def process_document(doc, filterchain)
-      @logger.debug "processing document #{doc}"
+      @logger.info "processing document #{doc.uri}"
       doc.fetch
       case doc.status
       when :success
-        if @etag_filter.apply(doc)
+        if @etag_filter.apply(doc)
           # add links from this document to the queue
           doc.content[:links].each { |url|
             add_url(url, filterchain, doc)
           } unless doc.content[:links].nil?
           add_to_index doc
-        end
+        end
       when :redirect
         @logger.debug "redirect to #{doc.content}"
         add_url(doc.content, filterchain, doc)
@@ -69,14 +72,16 @@ module RDig
       end
     rescue
       @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
-      @logger.debug "Trace: #{$!.backtrace.join("\n")}"
+      @logger.info "Trace: #{$!.backtrace.join("\n")}"
     end
     def add_to_index(doc)
-      @indexer << doc if doc.needs_indexing?
+      if doc.needs_indexing?
+        @indexer << doc
+      end
     end
     # pipes a new document pointing to url through the filter chain,
     # if it survives that, it gets added to the documents queue for further
     # processing
@@ -90,19 +95,17 @@ module RDig
         Document.create(url)
       end
-      doc = filterchain.apply(doc)
-      if doc
+      if doc = filterchain.apply(doc)
         @documents << doc
         @logger.debug "url #{url} survived filterchain"
       end
     rescue
       nil
     end
   end
   # checks fetched documents' E-Tag headers against the list of E-Tags
   # of the documents already indexed.
   # This is supposed to help against double-indexing documents which can

data/lib/rdig/documents.rb CHANGED

@@ -1,14 +1,14 @@
 module RDig
   #
   # Document base class
   #
   class Document
     attr_reader :uri
     attr_reader :content
     attr_reader :content_type
     def self.create(url)
       return case url
         when /^https?:\/\//i
@@ -32,7 +32,7 @@ module RDig
     def title; @content[:title] end
     def body; @content[:content] end
     def links; @content[:links] end
     # Tells whether this document should go into the index: has_content? must
     # hold and the document needs a title or a body. The result is truthy/falsy
     # rather than a strict boolean (it may be the title or body value itself).
     def needs_indexing?
       has_content? && (title || body)
     end
@@ -47,7 +47,7 @@ module RDig
   end
   #
   # Document in a File system
   #
@@ -90,8 +90,8 @@ module RDig
     end
   end
   #
   # Remote Document to be retrieved by HTTP
   #
@@ -106,7 +106,7 @@ module RDig
     # Builds the HttpDocument for a link found in this document: the child
     # records this document's uri as its referrer and sits one depth level
     # deeper. Returns nil when uri starts with file:// (case-insensitive).
     def create_child(uri)
       HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
     end
     # url: url of this document, may be relative to the referring doc or host.
     # referrer: uri of the document we retrieved this link from
     def initialize(args={})

data/lib/rdig/index.rb CHANGED

@@ -1,11 +1,14 @@
 module RDig
   module Index
     # used by the crawler to build the ferret index
     class Indexer
       include MonitorMixin
+      attr_reader :indexed_documents
       def initialize(settings)
+        @indexed_documents = 0
         @config = settings
         @index_writer = Ferret::Index::IndexWriter.new(
                           :path     => settings.path,
@@ -13,7 +16,7 @@ module RDig
                           :analyzer => settings.analyzer)
         super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
       end
       def add_to_index(document)
         RDig.logger.debug "add to index: #{document.uri.to_s}"
         @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
@@ -25,16 +28,17 @@ module RDig
         }
         synchronize do
           @index_writer << doc
+          @indexed_documents += 1
         end
       end
       alias :<< :add_to_index
       # Calls optimize and then close on the index writer, and finally drops
       # the reference to it by setting @index_writer to nil.
       def close
         @index_writer.optimize
         @index_writer.close
         @index_writer = nil
       end
     end
   end
 end

data/lib/rdig/search.rb CHANGED

@@ -5,17 +5,17 @@ module RDig
     # Call RDig::searcher to retrieve an instance ready for use.
     class Searcher
       include Ferret::Search
       # the query parser used to parse query strings
       attr_reader :query_parser
       # takes the ferret section of the rdig configuration as a parameter.
       def initialize(settings)
         @ferret_config = settings
         # the query parser is configured from settings.marshal_dump
         # (presumably settings is an OpenStruct, so this is a plain hash -- confirm)
         @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
         # call ferret_searcher once up front
         # NOTE(review): presumably to open the index eagerly -- confirm
         ferret_searcher
       end
       # returns the Ferret::Search::IndexSearcher instance used internally.
       def ferret_searcher
         if @ferret_searcher and !@ferret_searcher.reader.latest?
@@ -29,7 +29,14 @@ module RDig
         end
         @ferret_searcher
       end
+      def get_maximum_score(query, options)
+        ferret_searcher.search_each(query, options.merge(:limit => 1, :offset => 0)) do |doc_id, score|
+          return score
+        end
+        0
+      end
       # run a search.
       # +query+ usually will be a user-entered string. See the Ferret query
       # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
@@ -46,23 +53,26 @@ module RDig
         RDig.logger.info "Query: #{query}"
         results = []
         searcher = ferret_searcher
+        maximum_score = get_maximum_score query, options
         result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
           doc = searcher[doc_id]
           results << { :score => score,
                        :title => doc[:title],
                        :url => doc[:url],
-                       :extract => build_extract(doc[:data]) }
+                       :extract => build_extract(doc[:data]),
+                       :relative_score => (score / maximum_score)
+                     }
         end
         result[:list] = results
         result
       end
       # Builds the extract stored with each search result.
       # Returns data unchanged when it is nil/false or at most 200 characters
       # long (assuming data is a String); otherwise returns data[0..200], an
       # inclusive range, i.e. the first 201 characters.
       def build_extract(data)
         (data && data.length > 200) ? data[0..200] : data
       end
     end
   #  class SearchResult < OpenStruct
   #    def initialize(doc, score)
   #      self.score = score
@@ -72,6 +82,6 @@ module RDig
   #    end
   #  end
   end
 end

data/lib/rdig/url_filters.rb CHANGED

@@ -1,9 +1,10 @@
 module RDig
   module UrlFilters
     class FilterChain
       def initialize(chain_config)
+        @logger = RDig.logger
         @filters = []
         chain_config.each { |filter|
           case filter
@@ -29,11 +30,23 @@ module RDig
         when Symbol
           if args.nil?
             @filters << lambda { |document|
-              UrlFilters.send(filter, document)
+              begin
+                UrlFilters.send(filter, document)
+              rescue Exception
+                @logger.error "error in URL filter #{filter}: #{$!}"
+                @logger.error $!.backtrace.join("\n")
+                nil
+              end
             }
           else
             @filters << lambda { |document|
-              UrlFilters.send(filter, document, args)
+              begin
+                UrlFilters.send(filter, document, args)
+              rescue Exception
+                @logger.error "error in URL filter #{filter}: #{$!}"
+                @logger.error $!.backtrace.join("\n")
+                nil
+              end
             }
           end
         when Class
@@ -54,7 +67,13 @@ module RDig
       def apply(document)
         @filters.each { |filter|
-          return nil unless filter.call(document)
+          @logger.debug "running filter #{filter.inspect} on doc #{document.uri}"
+          unless filter.call(document)
+            @logger.debug "fail"
+            return nil
+          else
+            @logger.debug 'OK'
+          end
         }
         return document
       end
@@ -75,7 +94,7 @@ module RDig
       # nil otherwise
       def apply(document)
         synchronize do
-          @visited_urls.add?(document.uri.to_s) ? document : nil
+          @visited_urls.add?(document.uri.to_s) ? document : nil
         end
       end
     end
@@ -174,7 +193,7 @@ module RDig
       uri.host = ref.host unless uri.host
       uri.port = ref.port unless uri.port || ref.port==ref.default_port
       uri.path = ref.path unless uri.path
       old_uri_path = uri.path
       if uri.path !~ /^\// || uri.path =~ /^\.\./
         ref_path = ref.path || '/'
@@ -202,7 +221,7 @@ module RDig
       if document.uri.path =~ /\/$/
         # append index document if configured
         if cfg.index_document
-          document.uri.path << RDig.config.index_document
+          document.uri.path << cfg.index_document
         elsif cfg.remove_trailing_slash
          document.uri.path.gsub! /\/$/, ''
         end

metadata CHANGED

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 3
-  - 9
-  version: 0.3.9
+  - 10
+  version: 0.3.10
 platform: ruby
 authors:
 - Jens Kraemer
@@ -67,7 +67,7 @@ extensions: []
 extra_rdoc_files:
 - History.txt
 - Manifest.txt
-- README
+- README.rdoc
 files:
 - CHANGES
 - History.txt
@@ -75,7 +75,7 @@ files:
 - LICENSE
 - Manifest.txt
 - rakefile
-- README
+- README.rdoc
 - bin/rdig
 - doc/examples/config.rb
 - lib/rdig/content_extractors/doc.rb