RubyGems - rdig - Versions diffs - 0.3.9 → 0.3.10 - Mend

rdig 0.3.9 → 0.3.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

data/LICENSE +1 -1
data/{README → README.rdoc} +1 -1
data/bin/rdig +0 -0
data/lib/rdig.rb +6 -7
data/lib/rdig/content_extractors.rb +4 -4
data/lib/rdig/content_extractors/pdf.rb +3 -3
data/lib/rdig/crawler.rb +17 -14
data/lib/rdig/documents.rb +8 -8
data/lib/rdig/index.rb +9 -5
data/lib/rdig/search.rb +19 -9
data/lib/rdig/url_filters.rb +26 -7
metadata +4 -4

data/LICENSE CHANGED

@@ -1,4 +1,4 @@
-Copyright (c) 2006 Jens Kraemer
+Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/{README → README.rdoc} RENAMED

@@ -25,7 +25,7 @@ manually do a +gem install rubyful_soup+.
 === Handle search in your application:
   require 'rdig'
   require 'rdig_config'   # load your config file here
-  search_results = RDig.searcher.search(query, options={})
+  search_results = RDig.searcher.search(query)
 see RDig::Search::Searcher for more information.

data/bin/rdig CHANGED

File without changes

data/lib/rdig.rb CHANGED

@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 #--
-# Copyright (c) 2006 Jens Kraemer
+# Copyright (c) 2006-2012 Jens Kraemer, jk@jkraemer.net
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -84,7 +84,7 @@ module RDig
           { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
         ]
       }
     end
     def application
@@ -150,7 +150,7 @@ module RDig
       end
     end
     alias config configuration
     def logger
       @logger ||= create_logger
     end
@@ -200,12 +200,12 @@ module RDig
     def options
       @options ||= OpenStruct.new
     end
     # Display the program usage line.
     def usage
       puts "rdig -c configfile {options}"
     end
     # Display the rake command line help.
     def help
       usage
@@ -266,8 +266,7 @@ module RDig
       rescue
         puts $!.backtrace
         fail "No Configfile found!\n#{$!}"
-      end
+      end
       puts "using Ferret #{Ferret::VERSION}"

data/lib/rdig/content_extractors.rb CHANGED

@@ -1,5 +1,5 @@
 module RDig
   # Contains classes which are used for extracting content and meta data from
   # various content types.
   module ContentExtractors
@@ -13,7 +13,7 @@ module RDig
     # Extractors inheriting from this class will be auto-discovered and used
     # when can_do returns true
     class ContentExtractor
       def self.inherited(extractor)
         super(extractor)
         self.extractors << extractor
@@ -32,7 +32,7 @@ module RDig
           ex
         }.compact
       end
       def self.process(content, content_type)
         self.extractor_instances.each { |extractor|
           return extractor.process(content) if extractor.can_do(content_type)
@@ -61,7 +61,7 @@ module RDig
         end
         result
       end
       def as_file(content)
         file = Tempfile.new('rdig')
         file << content

data/lib/rdig/content_extractors/pdf.rb CHANGED

@@ -8,7 +8,7 @@ module RDig
     #
     class PdfContentExtractor < ContentExtractor
       include ExternalAppHelper
       def initialize(config)
         super(config)
         @pattern = /^application\/pdf/
@@ -22,7 +22,7 @@ module RDig
           end
         }
       end
       def process(content)
         result = {}
         as_file(content) do |file|
@@ -35,7 +35,7 @@ module RDig
       def get_content(path_to_tempfile)
         %x{#{@pdftotext} -enc UTF-8 '#{path_to_tempfile}' -}
       end
       # extracts the title from pdf meta data
       # needs pdfinfo
       # returns the title or nil if no title was found

data/lib/rdig/crawler.rb CHANGED

@@ -1,12 +1,11 @@
 module RDig
   class Crawler
     def initialize(config = RDig.config, logger = RDig.logger)
       @documents = Queue.new
       @logger = logger
       @config = config
+      @indexed_documents = 0
     end
     def run
@@ -22,6 +21,7 @@ module RDig
       url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
       chain_config = RDig.filter_chain[url_type]
+      # the etag filter operates on the fetched document, thats why we cannot put it into the filter chain right now.
       @etag_filter = ETagFilter.new
       filterchain = UrlFilters::FilterChain.new(chain_config)
       @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
@@ -31,9 +31,11 @@ module RDig
       num_threads.times { |i|
         group.join_nowait Thread.new("fetcher #{i}") {
           filterchain = UrlFilters::FilterChain.new(chain_config)
+          @logger.info "thread #{i} running..."
           while (doc = @documents.pop) != :exit
             process_document doc, filterchain
           end
+          @logger.info "thread #{i} is done."
         }
       }
@@ -47,20 +49,21 @@ module RDig
       @logger.info "waiting for threads to finish..."
       group.all_waits
+      @logger.info "indexed #{@indexer.indexed_documents} documents"
     end
     def process_document(doc, filterchain)
-      @logger.debug "processing document #{doc}"
+      @logger.info "processing document #{doc.uri}"
       doc.fetch
       case doc.status
       when :success
-        if @etag_filter.apply(doc)
+        if @etag_filter.apply(doc)
           # add links from this document to the queue
           doc.content[:links].each { |url|
             add_url(url, filterchain, doc)
           } unless doc.content[:links].nil?
           add_to_index doc
-        end
+        end
       when :redirect
         @logger.debug "redirect to #{doc.content}"
         add_url(doc.content, filterchain, doc)
@@ -69,14 +72,16 @@ module RDig
       end
     rescue
       @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
-      @logger.debug "Trace: #{$!.backtrace.join("\n")}"
+      @logger.info "Trace: #{$!.backtrace.join("\n")}"
     end
     def add_to_index(doc)
-      @indexer << doc if doc.needs_indexing?
+      if doc.needs_indexing?
+        @indexer << doc
+      end
     end
     # pipes a new document pointing to url through the filter chain,
     # if it survives that, it gets added to the documents queue for further
     # processing
@@ -90,19 +95,17 @@ module RDig
         Document.create(url)
       end
-      doc = filterchain.apply(doc)
-      if doc
+      if doc = filterchain.apply(doc)
         @documents << doc
         @logger.debug "url #{url} survived filterchain"
       end
     rescue
       nil
     end
   end
   # checks fetched documents' E-Tag headers against the list of E-Tags
   # of the documents already indexed.
   # This is supposed to help against double-indexing documents which can

data/lib/rdig/documents.rb CHANGED

@@ -1,14 +1,14 @@
 module RDig
   #
   # Document base class
   #
   class Document
     attr_reader :uri
     attr_reader :content
     attr_reader :content_type
     def self.create(url)
       return case url
         when /^https?:\/\//i
@@ -32,7 +32,7 @@ module RDig
     def title; @content[:title] end
     def body; @content[:content] end
     def links; @content[:links] end
     def needs_indexing?
       has_content? && (title || body)
     end
@@ -47,7 +47,7 @@ module RDig
   end
   #
   # Document in a File system
   #
@@ -90,8 +90,8 @@ module RDig
     end
   end
   #
   # Remote Document to be retrieved by HTTP
   #
@@ -106,7 +106,7 @@ module RDig
     def create_child(uri)
       HttpDocument.new(:uri => uri, :referrer => self.uri, :depth => self.depth+1) unless uri =~ /^file:\/\//i
     end
     # url: url of this document, may be relative to the referring doc or host.
     # referrer: uri of the document we retrieved this link from
     def initialize(args={})

data/lib/rdig/index.rb CHANGED

@@ -1,11 +1,14 @@
 module RDig
   module Index
     # used by the crawler to build the ferret index
     class Indexer
       include MonitorMixin
+      attr_reader :indexed_documents
       def initialize(settings)
+        @indexed_documents = 0
         @config = settings
         @index_writer = Ferret::Index::IndexWriter.new(
                           :path     => settings.path,
@@ -13,7 +16,7 @@ module RDig
                           :analyzer => settings.analyzer)
         super() # scary, MonitorMixin won't initialize if we don't call super() here (parens matter)
       end
       def add_to_index(document)
         RDig.logger.debug "add to index: #{document.uri.to_s}"
         @config.rewrite_uri.call(document.uri) if @config.rewrite_uri
@@ -25,16 +28,17 @@ module RDig
         }
         synchronize do
           @index_writer << doc
+          @indexed_documents += 1
         end
       end
       alias :<< :add_to_index
       def close
         @index_writer.optimize
         @index_writer.close
         @index_writer = nil
       end
     end
   end
 end

data/lib/rdig/search.rb CHANGED

@@ -5,17 +5,17 @@ module RDig
     # Call RDig::searcher to retrieve an instance ready for use.
     class Searcher
       include Ferret::Search
       # the query parser used to parse query strings
       attr_reader :query_parser
       # takes the ferret section of the rdig configuration as a parameter.
       def initialize(settings)
         @ferret_config = settings
         @query_parser = Ferret::QueryParser.new(settings.marshal_dump)
         ferret_searcher
       end
       # returns the Ferret::Search::IndexSearcher instance used internally.
       def ferret_searcher
         if @ferret_searcher and !@ferret_searcher.reader.latest?
@@ -29,7 +29,14 @@ module RDig
         end
         @ferret_searcher
       end
+      def get_maximum_score(query, options)
+        ferret_searcher.search_each(query, options.merge(:limit => 1, :offset => 0)) do |doc_id, score|
+          return score
+        end
+        0
+      end
       # run a search.
       # +query+ usually will be a user-entered string. See the Ferret query
       # language[http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html]
@@ -46,23 +53,26 @@ module RDig
         RDig.logger.info "Query: #{query}"
         results = []
         searcher = ferret_searcher
+        maximum_score = get_maximum_score query, options
         result[:hitcount] = searcher.search_each(query, options) do |doc_id, score|
           doc = searcher[doc_id]
           results << { :score => score,
                        :title => doc[:title],
                        :url => doc[:url],
-                       :extract => build_extract(doc[:data]) }
+                       :extract => build_extract(doc[:data]),
+                       :relative_score => (score / maximum_score)
+                     }
         end
         result[:list] = results
         result
       end
       def build_extract(data)
         (data && data.length > 200) ? data[0..200] : data
       end
     end
   #  class SearchResult < OpenStruct
   #    def initialize(doc, score)
   #      self.score = score
@@ -72,6 +82,6 @@ module RDig
   #    end
   #  end
   end
 end

data/lib/rdig/url_filters.rb CHANGED

@@ -1,9 +1,10 @@
 module RDig
   module UrlFilters
     class FilterChain
       def initialize(chain_config)
+        @logger = RDig.logger
         @filters = []
         chain_config.each { |filter|
           case filter
@@ -29,11 +30,23 @@ module RDig
         when Symbol
           if args.nil?
             @filters << lambda { |document|
-              UrlFilters.send(filter, document)
+              begin
+                UrlFilters.send(filter, document)
+              rescue Exception
+                @logger.error "error in URL filter #{filter}: #{$!}"
+                @logger.error $!.backtrace.join("\n")
+                nil
+              end
             }
           else
             @filters << lambda { |document|
-              UrlFilters.send(filter, document, args)
+              begin
+                UrlFilters.send(filter, document, args)
+              rescue Exception
+                @logger.error "error in URL filter #{filter}: #{$!}"
+                @logger.error $!.backtrace.join("\n")
+                nil
+              end
             }
           end
         when Class
@@ -54,7 +67,13 @@ module RDig
       def apply(document)
         @filters.each { |filter|
-          return nil unless filter.call(document)
+          @logger.debug "running filter #{filter.inspect} on doc #{document.uri}"
+          unless filter.call(document)
+            @logger.debug "fail"
+            return nil
+          else
+            @logger.debug 'OK'
+          end
         }
         return document
       end
@@ -75,7 +94,7 @@ module RDig
       # nil otherwise
       def apply(document)
         synchronize do
-          @visited_urls.add?(document.uri.to_s) ? document : nil
+          @visited_urls.add?(document.uri.to_s) ? document : nil
         end
       end
     end
@@ -174,7 +193,7 @@ module RDig
       uri.host = ref.host unless uri.host
       uri.port = ref.port unless uri.port || ref.port==ref.default_port
       uri.path = ref.path unless uri.path
       old_uri_path = uri.path
       if uri.path !~ /^\// || uri.path =~ /^\.\./
         ref_path = ref.path || '/'
@@ -202,7 +221,7 @@ module RDig
       if document.uri.path =~ /\/$/
         # append index document if configured
         if cfg.index_document
-          document.uri.path << RDig.config.index_document
+          document.uri.path << cfg.index_document
         elsif cfg.remove_trailing_slash
          document.uri.path.gsub! /\/$/, ''
         end

metadata CHANGED

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 3
-  - 9
-  version: 0.3.9
+  - 10
+  version: 0.3.10
 platform: ruby
 authors:
 - Jens Kraemer
@@ -67,7 +67,7 @@ extensions: []
 extra_rdoc_files:
 - History.txt
 - Manifest.txt
-- README
+- README.rdoc
 files:
 - CHANGES
 - History.txt
@@ -75,7 +75,7 @@ files:
 - LICENSE
 - Manifest.txt
 - rakefile
-- README
+- README.rdoc
 - bin/rdig
 - doc/examples/config.rb
 - lib/rdig/content_extractors/doc.rb