RubyGems - stellr - Versions diffs - 0.1.0 - Mend

stellr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

data/History.txt +4 -0
data/Manifest.txt +36 -0
data/README.txt +109 -0
data/Rakefile +28 -0
data/bin/stellr +64 -0
data/bin/stellr-search +50 -0
data/config/stellr.yml +8 -0
data/lib/stellr.rb +37 -0
data/lib/stellr/client.rb +78 -0
data/lib/stellr/collections.rb +6 -0
data/lib/stellr/collections/base.rb +79 -0
data/lib/stellr/collections/multi_collection.rb +32 -0
data/lib/stellr/collections/rsync.rb +38 -0
data/lib/stellr/collections/searchable_collection.rb +166 -0
data/lib/stellr/collections/static.rb +97 -0
data/lib/stellr/collections/writeable_collection.rb +119 -0
data/lib/stellr/config.rb +88 -0
data/lib/stellr/search.rb +2 -0
data/lib/stellr/search/search_result.rb +21 -0
data/lib/stellr/search/search_results.rb +50 -0
data/lib/stellr/server.rb +166 -0
data/lib/stellr/strategies.rb +4 -0
data/lib/stellr/strategies/base.rb +16 -0
data/lib/stellr/strategies/blocking.rb +13 -0
data/lib/stellr/strategies/queueing.rb +78 -0
data/lib/stellr/utils.rb +24 -0
data/lib/stellr/utils/observable.rb +20 -0
data/lib/stellr/utils/shutdown.rb +30 -0
data/test/fixtures/movies.yml +4 -0
data/test/stellr_test.rb +38 -0
data/test/test_client.rb +27 -0
data/test/test_collections_base.rb +25 -0
data/test/test_helper.rb +1 -0
data/test/test_rsync_collection.rb +72 -0
data/test/test_server.rb +94 -0
data/test/test_static_collection.rb +40 -0
data/test/test_stellr.rb +11 -0
metadata +110 -0

data/lib/stellr/collections/multi_collection.rb ADDED Viewed

@@ -0,0 +1,32 @@
+module Stellr
+  module Collections
+    #
+    class MultiCollection < SearchableCollection
+      def initialize( name, collections, options = {} )
+        super name, options
+        @collections = {}
+        collections.each do |collection|
+          @collections[collection.name] = collection
+          collection.add_listener do |event|
+            handle_event(collection.name, event)
+          end
+        end
+      end
+    protected
+      def open_reader
+        IndexReader.new @collections.values.map{|c| c.reader}
+      end
+      def handle_event(collection_name, event)
+        @logger.debug "handle_event: #{event.inspect}"
+        close_reader if event == :closing_reader
+      end
+    end
+  end
+end

data/lib/stellr/collections/rsync.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module Stellr
+  module Collections
+    # RSync collection implementation.
+    #
+    # Keeps two indexes around - one for searching and one for indexing. The
+    # difference when compared with the Static collection is that every
+    # now and then the changes are synced from the latter to the former using RSync,
+    # so no complete rebuild is necessary after a switch.
+    class RSync < Static
+      def batch_finished
+        switch if dirty?
+      end
+      # we want the indexes to be in sync after close, so do a last switch
+      alias :close :switch
+      protected
+      def writer_options
+        super.merge :create => @options[:recreate]
+      end
+      # overridden to sync the indexes after re-linking
+      def relink_indexes
+        super
+        sync_indexes
+      end
+      def sync_indexes
+        system("rsync -r --delete #{searching_directory}/ #{indexing_directory}")
+      end
+    end
+  end
+end

data/lib/stellr/collections/searchable_collection.rb ADDED Viewed

@@ -0,0 +1,166 @@
+module Stellr
+  module Collections
+    # Base class for searchable collection implementations
+    class SearchableCollection < Base
+      def initialize( name, options )
+        super name, options
+        @reader_monitor = Monitor.new
+        @query_parser_monitor = Monitor.new
+        @reader = @searcher = @query_parser = nil
+      end
+      # Search this collection.
+      # Options is a hash taking the usual Ferret::Search::Searcher options,
+      # plus:
+      # [+page+]          Page of results to show, starting with 1
+      # [+per_page+]      Number of records per page, default 10.
+      # [+fields+]        Array of fields to search in
+      # [+get_fields+]    Array of fields to retrieve in addition to the
+      #                   :id field
+      #
+      # The page and per_page options take precedence over any given limit and
+      # offset values.
+      def search(query, options = {})
+        results = Stellr::Search::SearchResults.new
+        if options[:page]
+          results.current_page = options.delete(:page).to_i
+          options[:limit] = results.per_page = options.delete(:per_page) || 10
+          options[:offset] = (p = results.current_page - 1) <= 0 ? 0 : p * results.per_page
+        end
+        get_fields = options.delete :get_fields
+        # TODO replace synchronization with some kind of shared read/exclusive
+        # write locking mechanism allowing parallel searches but guarding
+        # against the reader instance being shut down while we're inside
+        # retrieve_field_data
+        @reader_monitor.synchronize do
+          q = process_query query, options
+          @logger.debug "options: #{options.inspect}"
+          results.total_hits = searcher.search_each q, options do |id, score|
+            field_data = retrieve_field_data(id, get_fields)
+            results << Stellr::Search::SearchResult.new( id, score, field_data )
+          end
+          @logger.info "query #{query} : #{results.total_hits} results"
+        end
+        return results
+      end
+      def highlight( doc_id, query, options = {})
+        return searcher.highlight(process_query(query, options), doc_id, options[:field], options)
+      rescue
+        @logger.error "error in highlight: #{$!}. Document #{doc_id}, Query: #{query}, options: #{options.inspect}"
+        ''
+      end
+      def size
+        reader.num_docs
+      end
+      def on_shutdown( mode )
+        close
+      end
+      # close this collection
+      def close
+        close_reader
+      end
+      protected
+      # should open a reader and return it
+      def open_reader
+        raise 'not implemented'
+      end
+      def reader
+        @reader_monitor.synchronize do
+          @reader ||= open_reader
+        end
+      end
+      def searcher
+        @reader_monitor.synchronize do
+          @searcher ||= Ferret::Search::Searcher.new reader
+        end
+      end
+      def query_parser
+        @query_parser_monitor.synchronize do
+          @query_parser ||= create_query_parser
+        end
+      end
+      def create_query_parser(options = {})
+        Ferret::QueryParser.new( { :analyzer => create_analyzer, :or_default => false }.merge( options ) )
+      end
+      # reads field data for +:id+ and any other given fields from
+      # the document given by +id+
+      # unsynchronized reader access occurs, so only use from within blocks
+      # synchronizing on @reader_monitor.
+      def retrieve_field_data(id, fields = nil)
+        doc = reader[id]
+        field_data = { :id => doc[:id] }
+        fields.each do |f|
+          field_data[f] = doc[f]
+        end if fields
+        return field_data
+      end
+      # Turn a query string into a Ferret Query object.
+      # unsynchronized reader access occurs, so only use from
+      # within blocks synchronizing on @reader_monitor.
+      def process_query(query, options)
+        @logger.debug "process_query: #{query.inspect}"
+        q = query.dup
+        if String === q
+          @query_parser_monitor.synchronize do
+            qp = query_parser
+            tokenized_fields = reader.tokenized_fields
+            qp.fields = options[:fields] || tokenized_fields # reader.fields
+            qp.tokenized_fields = tokenized_fields
+            @logger.debug "tokenized_fields: #{tokenized_fields}"
+            q = qp.parse q
+          end
+        end
+        @logger.debug "processed query: #{q.inspect}"
+        return q
+      rescue
+        @logger.error "error processing query: #{$!}"
+      end
+      def collection_directory
+        @options[:path]
+      end
+      def close_reader
+        @reader_monitor.synchronize do
+          notify_listeners( :closing_reader )
+          return unless @reader
+          @query_parser = nil
+          @searcher.close if @searcher
+          @searcher = nil
+          @reader.close
+          @reader = nil
+        end
+      end
+      # TODO allow declarative analyzer specification in options
+      def create_analyzer
+        if class_name = @options[:analyzer]
+          @logger.debug "instantiating analyzer #{class_name}"
+          return class_name.constantize.new
+        end
+        return Ferret::Analysis::StandardAnalyzer.new
+      end
+    end
+  end
+end

data/lib/stellr/collections/static.rb ADDED Viewed

@@ -0,0 +1,97 @@
+module Stellr
+  module Collections
+    # Static collection implementation.
+    #
+    # This kind of collection is for situations where your index usually doesn't
+    # change but instead is rebuilt from scratch once in a while.
+    #
+    # This collection keeps two indexes, one for searching, and one where all index
+    # modifications take place. Once you are finished building the new index,
+    # call the switch method to put it live. The old index is then dropped and
+    # the new one is used for searching from now on.
+    class Static < WriteableCollection
+      def initialize( name, options )
+        super( name, options )
+        reader
+        writer
+      end
+      def switch
+        @logger.info "switching indexes"
+        @writer_monitor.synchronize do
+          flush
+          optimize
+          close_writer
+          @reader_monitor.synchronize do
+            close_reader
+            relink_indexes
+            clear!
+          end
+        end
+      end
+      protected
+      def open_writer
+        create_directories unless File.exists? indexing_directory # and File.exists? searching_directory
+        IndexWriter.new writer_options
+      end
+      def writer_options
+        {
+          :path        => indexing_directory,
+          :create      => true,
+          :field_infos => create_field_infos,
+          :analyzer    => create_analyzer
+        }
+      end
+      def open_reader
+        already_retried = false
+        begin
+          switch unless File.symlink? searching_directory
+          IndexReader.new searching_directory
+        rescue Ferret::FileNotFoundError
+          switch
+          unless already_retried
+            already_retried = true
+            retry
+          end
+        end
+      end
+      def create_directories
+        FileUtils.mkdir_p index_storage_directory( '0' )
+        FileUtils.mkdir_p index_storage_directory( '1' )
+        FileUtils.ln_s index_storage_directory( '0' ), indexing_directory, :force => true
+        FileUtils.ln_s index_storage_directory( '1' ), searching_directory, :force => true
+      end
+      def indexing_directory
+        File.join( collection_directory, "indexing" )
+      end
+      def searching_directory
+        File.join( collection_directory, "searching" )
+      end
+      def index_storage_directory( suffix )
+        File.join( collection_directory, suffix )
+      end
+      def relink_indexes
+        searching = File.readlink( searching_directory ).untaint
+        indexing  = File.readlink( indexing_directory ).untaint
+        @logger.info "relink_indexes: #{searching} will now be used for indexing"
+        File.delete indexing_directory
+        File.delete searching_directory
+        FileUtils.ln_s indexing, searching_directory, :force => true
+        FileUtils.ln_s searching,  indexing_directory, :force => true
+      end
+    end
+  end
+end

data/lib/stellr/collections/writeable_collection.rb ADDED Viewed

@@ -0,0 +1,119 @@
+module Stellr
+  module Collections
+    # Base class for collection implementations that allow index updates
+    class WriteableCollection < SearchableCollection
+      def initialize( name, options )
+        super
+        @writer_monitor = Monitor.new
+        @processed_records = 0
+        @writer = nil
+      end
+      # Adds the given record to the index.
+      #
+      # Record may be a hash, or a Ferret::Document instance
+      def add_record( record, boost = nil )
+        raise ArgumentError.new("record must contain :id field") if record[:id].nil?
+        if boost
+          if Ferret::Document === record
+            record.boost = boost
+          else
+            hash, record = record, Ferret::Document.new( boost )
+            hash.each_pair do |k,v|
+              record[k] = v
+            end
+          end
+        end
+        @writer_monitor.synchronize do
+          @processed_records += 1
+          w = writer
+          w.delete :id, record[:id].to_s # ensure uniqueness by :id field
+          w << record
+        end
+        true
+      end
+      alias :<< :add_record
+      def delete_record( record )
+        raise ArgumentError.new("record must contain :id field") if record[:id].nil?
+        @writer_monitor.synchronize do
+          @processed_records += 1
+          writer.delete :id, record[:id].to_s
+        end
+        true
+      end
+      # true if records have been processed since the last call to clear!
+      def dirty?
+        @processed_records > 0
+      end
+      def clear!
+        @processed_records = 0
+      end
+      # called whenever the strategy thinks it's a good time do do something
+      # timeconsuming (like switching indexes, optimizing, flushing, ...)
+      def batch_finished
+      end
+      # close this collection
+      def close
+        close_writer
+        super
+      end
+      # flush any unwritten changes to the index
+      def flush
+        @writer_monitor.synchronize do
+          writer.commit
+        end
+      end
+      # optimize the index
+      def optimize
+        @writer_monitor.synchronize do
+          writer.optimize
+        end
+      end
+    protected
+      # should open a writer and return it
+      def open_writer
+        raise 'not implemented'
+      end
+      def writer
+        @writer_monitor.synchronize do
+          @writer ||= open_writer
+        end
+      end
+      def close_writer
+        @writer_monitor.synchronize do
+          notify_listeners( :closing_writer )
+          return unless @writer
+          @writer.close
+          @writer = nil
+        end
+      end
+      def create_field_infos
+        field_infos = FieldInfos.new @options[:field_defaults] || {}
+        @options[:fields].each do |name, definition|
+          field_infos.add_field( name, definition )
+        end if @options[:fields]
+        # provide default settings for :id field
+        field_infos.add_field :id, :store => :yes, :index => :untokenized unless field_infos[:id]
+        field_infos
+      end
+    end
+  end
+end