stellr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ module Stellr
2
+ module Collections
3
+
4
+ #
5
+ class MultiCollection < SearchableCollection
6
+
7
# Sets up a collection that is backed by several other collections.
#
# name        - name of this collection, handed to the superclass
# collections - list of collection objects; each one must respond to
#               +name+ and +add_listener+
# options     - options hash handed to the superclass
#
# Every member collection is stored under its name, and a listener is
# registered on it that forwards its events to +handle_event+.
def initialize(name, collections, options = {})
  super(name, options)
  @collections = {}
  collections.each do |member|
    @collections[member.name] = member
    member.add_listener { |evt| handle_event(member.name, evt) }
  end
end
17
+
18
+ protected
19
+
20
# Builds an IndexReader on top of the readers of all member collections.
def open_reader
  # explicit block on purpose: the members' +reader+ method is protected
  member_readers = @collections.values.map { |member| member.reader }
  IndexReader.new(member_readers)
end
23
+
24
# Listener callback for events emitted by a member collection.
#
# collection_name - name of the collection that emitted the event
#                   (currently not used)
# event           - the event; +:closing_reader+ makes this collection
#                   close its own reader as well
def handle_event(collection_name, event)
  @logger.debug("handle_event: #{event.inspect}")
  return unless event == :closing_reader
  close_reader
end
28
+
29
+ end
30
+
31
+ end
32
+ end
@@ -0,0 +1,38 @@
1
+ module Stellr
2
+ module Collections
3
+
4
+ # RSync collection implementation.
5
+ #
6
+ # Keeps two indexes around - one for searching and one for indexing. The
7
+ # difference when compared with the Static collection is that every
8
+ # now and then the changes are synced from the latter to the former using RSync,
9
+ # so no complete rebuild is necessary after a switch.
10
+ class RSync < Static
11
+
12
# Called at the end of a batch: switches the indexes, but only if
# records have been processed since the last switch.
def batch_finished
  return unless dirty?
  switch
end
15
+
16
+ # we want the indexes to be in sync after close, so do a last switch
17
+ alias :close :switch
18
+
19
+ protected
20
+
21
# Writer options of the superclass, with +:create+ replaced by the
# value of the +:recreate+ collection option.
def writer_options
  inherited = super
  inherited.merge(:create => @options[:recreate])
end
24
+
25
# Re-links the indexes via the superclass implementation and then
# syncs the two index directories.
def relink_indexes
  super()
  sync_indexes
end
30
+
31
# Copies the contents of the searching directory into the indexing
# directory using +rsync -r --delete+.
#
# The command is passed to Kernel#system as an argument list, so no
# shell is involved and directory names containing spaces or shell
# metacharacters are handed to rsync unchanged.
#
# Returns whatever Kernel#system returns (true on success, false on a
# non-zero exit status, nil if the command could not be executed).
def sync_indexes
  # trailing slash: sync the directory's contents, not the directory itself
  source = "#{searching_directory}/"
  system('rsync', '-r', '--delete', source, indexing_directory)
end
34
+
35
+ end
36
+
37
+ end
38
+ end
@@ -0,0 +1,166 @@
1
+ module Stellr
2
+ module Collections
3
+
4
+ # Base class for searchable collection implementations
5
+ class SearchableCollection < Base
6
+
7
# Creates the monitors guarding the reader and the query parser; the
# reader, searcher and query parser themselves start out unset.
def initialize(name, options)
  super(name, options)
  @reader       = nil
  @searcher     = nil
  @query_parser = nil
  @reader_monitor       = Monitor.new
  @query_parser_monitor = Monitor.new
end
13
+
14
+
15
# Search this collection.
#
# +options+ is a hash taking the usual Ferret::Search::Searcher options,
# plus:
# [+page+]       Page of results to show, starting with 1
# [+per_page+]   Number of records per page, default 10. Only evaluated
#                when +page+ is given.
# [+fields+]     Array of fields to search in
# [+get_fields+] Array of fields to retrieve in addition to the
#                :id field
#
# The page and per_page options take precedence over any given limit and
# offset values.
#
# The hash passed in by the caller is not modified; the method works on
# a copy.
#
# Returns a Stellr::Search::SearchResults filled with one
# Stellr::Search::SearchResult per hit.
def search(query, options = {})
  # work on a copy so the deletes below do not leak to the caller
  options = options.dup
  results = Stellr::Search::SearchResults.new

  if options[:page]
    results.current_page = options.delete(:page).to_i
    per_page = options.delete(:per_page) || 10
    # coerce "20" and friends like :page is coerced; values without
    # to_i (e.g. a symbol) are passed through untouched
    per_page = per_page.to_i if per_page.respond_to?(:to_i)
    results.per_page = options[:limit] = per_page
    previous_pages = results.current_page - 1
    options[:offset] = previous_pages <= 0 ? 0 : previous_pages * per_page
  end

  get_fields = options.delete(:get_fields)
  # TODO replace synchronization with some kind of shared read/exclusive
  # write locking mechanism allowing parallel searches but guarding
  # against the reader instance being shut down while we're inside
  # retrieve_field_data
  @reader_monitor.synchronize do
    q = process_query(query, options)
    @logger.debug "options: #{options.inspect}"
    results.total_hits = searcher.search_each(q, options) do |id, score|
      field_data = retrieve_field_data(id, get_fields)
      results << Stellr::Search::SearchResult.new(id, score, field_data)
    end
    @logger.info "query #{query} : #{results.total_hits} results"
  end
  results
end
51
+
52
# Returns highlighted excerpts for a document and a query.
#
# doc_id  - id of the document to highlight
# query   - query string or query object, run through +process_query+
# options - passed on to the searcher's +highlight+ method; +:field+
#           selects the field to highlight
#
# The work is done while holding @reader_monitor, as +process_query+
# requires, and in the same lock order that +search+ uses (reader
# monitor first, query parser monitor second).
#
# This is a best-effort operation: any error is logged and an empty
# string is returned instead.
def highlight(doc_id, query, options = {})
  @reader_monitor.synchronize do
    searcher.highlight(process_query(query, options), doc_id, options[:field], options)
  end
rescue
  @logger.error "error in highlight: #{$!}. Document #{doc_id}, Query: #{query}, options: #{options.inspect}"
  ''
end
58
+
59
# Number of documents as reported by the reader's +num_docs+.
def size
  current_reader = reader
  current_reader.num_docs
end
62
+
63
# Shutdown hook: closes the collection. The +mode+ argument is accepted
# but not evaluated here.
def on_shutdown(mode)
  close
end
66
+
67
# Closes this collection by closing its reader.
def close
  close_reader
end
71
+
72
+ protected
73
+
74
# Template method: implementations should open a reader and return it.
# This base version always raises.
def open_reader
  raise RuntimeError, 'not implemented'
end
78
+
79
# Returns the current reader, opening one via +open_reader+ first if
# none is set. Runs while holding @reader_monitor.
def reader
  @reader_monitor.synchronize do
    @reader = open_reader unless @reader
    @reader
  end
end
84
+
85
# Returns the current Ferret searcher, creating one on top of +reader+
# first if none is set. Runs while holding @reader_monitor.
def searcher
  @reader_monitor.synchronize do
    @searcher = Ferret::Search::Searcher.new(reader) unless @searcher
    @searcher
  end
end
90
+
91
# Returns the current query parser, creating one via
# +create_query_parser+ first if none is set. Runs while holding
# @query_parser_monitor.
def query_parser
  @query_parser_monitor.synchronize do
    @query_parser = create_query_parser unless @query_parser
    @query_parser
  end
end
96
+
97
# Builds a new Ferret::QueryParser.
#
# Defaults are the analyzer from +create_analyzer+ and
# +:or_default => false+; entries in +options+ override them.
def create_query_parser(options = {})
  defaults = { :analyzer => create_analyzer, :or_default => false }
  Ferret::QueryParser.new(defaults.merge(options))
end
100
+
101
# Reads the +:id+ field plus any other requested fields from the
# document with the given +id+ and returns them as a hash.
#
# id     - document id used to look the document up in the reader
# fields - optional list of additional field names
#
# Per the original author's note: only call this from within blocks
# synchronizing on @reader_monitor.
def retrieve_field_data(id, fields = nil)
  document = reader[id]
  data = { :id => document[:id] }
  return data unless fields
  fields.each { |name| data[name] = document[name] }
  data
end
113
+
114
# Turn a query string into a Ferret Query object.
#
# query   - a query string or an already built query object. Strings
#           are parsed with the shared query parser; anything else is
#           returned as a copy (+dup+) without parsing.
# options - +:fields+ restricts the fields to search in; defaults to the
#           reader's tokenized fields
#
# Per the original author's note: only call this from within blocks
# synchronizing on @reader_monitor.
#
# Errors are logged and then re-raised, so callers never receive the
# logger's return value in place of a query.
def process_query(query, options)
  @logger.debug "process_query: #{query.inspect}"
  q = query.dup
  if String === q
    # the parser is shared and reconfigured per call, so guard it
    @query_parser_monitor.synchronize do
      qp = query_parser
      tokenized_fields = reader.tokenized_fields
      qp.fields = options[:fields] || tokenized_fields
      qp.tokenized_fields = tokenized_fields
      @logger.debug "tokenized_fields: #{tokenized_fields}"
      q = qp.parse(q)
    end
  end
  @logger.debug "processed query: #{q.inspect}"
  q
rescue
  @logger.error "error processing query: #{$!}"
  raise
end
135
+
136
# Directory of this collection, taken from the +:path+ option.
def collection_directory
  configured_path = @options[:path]
  configured_path
end
139
+
140
# Closes the reader and everything built on top of it.
#
# Listeners are notified with +:closing_reader+ on every call, even when
# no reader is open. If a reader is open, the query parser is dropped,
# the searcher (if any) is closed and dropped, and finally the reader is
# closed and dropped. Runs while holding @reader_monitor.
def close_reader
  @reader_monitor.synchronize do
    notify_listeners(:closing_reader)
    if @reader
      @query_parser = nil
      @searcher.close if @searcher
      @searcher = nil
      @reader.close
      @reader = nil
    end
  end
end
151
+
152
# Creates the analyzer for this collection.
#
# If the +:analyzer+ option is set, it is resolved with +constantize+
# and a new instance of the resulting class is returned; otherwise a
# Ferret::Analysis::StandardAnalyzer is used.
#
# TODO allow declarative analyzer specification in options
def create_analyzer
  class_name = @options[:analyzer]
  return Ferret::Analysis::StandardAnalyzer.new unless class_name
  @logger.debug "instantiating analyzer #{class_name}"
  class_name.constantize.new
end
160
+
161
+ end
162
+
163
+ end
164
+
165
+ end
166
+
@@ -0,0 +1,97 @@
1
+ module Stellr
2
+ module Collections
3
+
4
+ # Static collection implementation.
5
+ #
6
+ # This kind of collection is for situations where your index usually doesn't
7
+ # change but instead is rebuilt from scratch once in a while.
8
+ #
9
+ # This collection keeps two indexes, one for searching, and one where all index
10
+ # modifications take place. Once you are finished building the new index,
11
+ # call the switch method to put it live. The old index is then dropped and
12
+ # the new one is used for searching from now on.
13
+ class Static < WriteableCollection
14
+
15
# Sets the collection up via the superclass and then touches +reader+
# and +writer+ once, in that order, so both get opened right away.
def initialize(name, options)
  super
  reader
  writer
end
20
+
21
# Puts the index that has been written to live.
#
# While holding @writer_monitor the pending changes are flushed, the
# index is optimized and the writer is closed. Then, additionally
# holding @reader_monitor, the reader is closed, the index links are
# swapped by +relink_indexes+ and the processed-records counter is reset.
def switch
  @logger.info("switching indexes")
  @writer_monitor.synchronize do
    # finish all writing first ...
    flush
    optimize
    close_writer
    # ... then swap the indexes while no search can be running
    @reader_monitor.synchronize do
      close_reader
      relink_indexes
      clear!
    end
  end
end
34
+
35
+ protected
36
+
37
# Opens a new IndexWriter configured by +writer_options+.
#
# If the indexing directory does not exist yet, the directory layout is
# created first via +create_directories+.
def open_writer
  # File.exist? instead of File.exists?: the latter was removed in Ruby 3.2
  create_directories unless File.exist?(indexing_directory)
  IndexWriter.new(writer_options)
end
41
+
42
# Options handed to the IndexWriter: it writes to the indexing
# directory, always with +:create => true+, using the field infos from
# +create_field_infos+ and the analyzer from +create_analyzer+.
def writer_options
  opts = {}
  opts[:path]        = indexing_directory
  opts[:create]      = true
  opts[:field_infos] = create_field_infos
  opts[:analyzer]    = create_analyzer
  opts
end
50
+
51
# Opens an IndexReader on the searching directory.
#
# If the searching directory is not a symlink, +switch+ is called
# before opening. When opening fails with Ferret::FileNotFoundError,
# +switch+ is called and the whole attempt is repeated once. After a
# second failure +switch+ is called once more and nil is returned.
def open_reader
  retried = false
  begin
    switch unless File.symlink?(searching_directory)
    IndexReader.new(searching_directory)
  rescue Ferret::FileNotFoundError
    switch
    if retried
      nil
    else
      retried = true
      retry
    end
  end
end
64
+
65
# Creates the two index storage directories ('0' and '1') and links
# them: the indexing directory points to '0', the searching directory
# to '1'.
def create_directories
  storage = %w[0 1].map { |suffix| index_storage_directory(suffix) }
  storage.each { |dir| FileUtils.mkdir_p(dir) }
  FileUtils.ln_s(storage[0], indexing_directory, :force => true)
  FileUtils.ln_s(storage[1], searching_directory, :force => true)
end
71
+
72
# Path of the "indexing" entry inside the collection directory.
def indexing_directory
  File.join(collection_directory, 'indexing')
end
75
+
76
# Path of the "searching" entry inside the collection directory.
def searching_directory
  File.join(collection_directory, 'searching')
end
79
+
80
# Path of the storage directory with the given +suffix+ inside the
# collection directory.
def index_storage_directory(suffix)
  base = collection_directory
  File.join(base, suffix)
end
83
+
84
# Swaps the targets of the searching and indexing symlinks: the
# directory that was used for indexing becomes the searching directory
# and vice versa.
def relink_indexes
  searching = File.readlink(searching_directory)
  indexing  = File.readlink(indexing_directory)
  # String#untaint was removed in Ruby 3.2; only call it where it still
  # exists so that older interpreters keep their previous behaviour.
  searching = searching.untaint if searching.respond_to?(:untaint)
  indexing  = indexing.untaint  if indexing.respond_to?(:untaint)
  @logger.info "relink_indexes: #{searching} will now be used for indexing"
  # remove both links first, then re-create them crosswise
  File.delete indexing_directory
  File.delete searching_directory
  FileUtils.ln_s indexing, searching_directory, :force => true
  FileUtils.ln_s searching, indexing_directory, :force => true
end
93
+
94
+ end
95
+
96
+ end
97
+ end
@@ -0,0 +1,119 @@
1
+ module Stellr
2
+ module Collections
3
+
4
+ # Base class for collection implementations that allow index updates
5
+ class WriteableCollection < SearchableCollection
6
+
7
# Adds the writer-related state on top of the superclass setup: no
# writer yet, no processed records, and a monitor guarding the writer.
def initialize(name, options)
  super(name, options)
  @writer = nil
  @processed_records = 0
  @writer_monitor = Monitor.new
end
13
+
14
# Adds the given record to the index.
#
# record - a hash or a Ferret::Document instance; must contain an
#          +:id+ field
# boost  - optional boost. A Ferret::Document gets it assigned; any
#          other record is copied field by field into a new
#          Ferret::Document created with that boost.
#
# Any record already indexed under the same +:id+ is deleted first.
# Raises ArgumentError if the record has no +:id+. Returns true.
def add_record(record, boost = nil)
  raise ArgumentError, "record must contain :id field" if record[:id].nil?

  if boost
    case record
    when Ferret::Document
      record.boost = boost
    else
      document = Ferret::Document.new(boost)
      record.each_pair { |field, value| document[field] = value }
      record = document
    end
  end

  @writer_monitor.synchronize do
    @processed_records += 1
    index_writer = writer
    # ensure uniqueness by :id field
    index_writer.delete(:id, record[:id].to_s)
    index_writer << record
  end
  true
end
37
+ alias :<< :add_record
38
+
39
# Removes the record with the given +:id+ from the index.
#
# record - anything answering to +[:id]+; only the id is used
#
# Raises ArgumentError if the record has no +:id+. Returns true.
def delete_record(record)
  raise ArgumentError, "record must contain :id field" if record[:id].nil?

  @writer_monitor.synchronize do
    @processed_records += 1
    writer.delete(:id, record[:id].to_s)
  end
  true
end
47
+
48
# True if at least one record has been processed (added or deleted)
# since the counter was last reset by +clear!+.
def dirty?
  processed = @processed_records
  processed > 0
end
52
+
53
# Resets the processed-records counter, so +dirty?+ is false again.
def clear!
  @processed_records = 0
end
56
+
57
# Called whenever the strategy thinks it's a good time to do something
# time-consuming (like switching indexes, optimizing, flushing, ...).
# Does nothing in this class.
def batch_finished
  nil
end
61
+
62
# Closes this collection: the writer first, then whatever the
# superclass closes.
def close
  close_writer
  super()
end
67
+
68
# Flushes any unwritten changes to the index by committing the writer.
# Runs while holding @writer_monitor.
def flush
  @writer_monitor.synchronize { writer.commit }
end
74
+
75
# Optimizes the index through the writer. Runs while holding
# @writer_monitor.
def optimize
  @writer_monitor.synchronize { writer.optimize }
end
81
+
82
+ protected
83
+
84
# Template method: implementations should open a writer and return it.
# This base version always raises.
def open_writer
  raise RuntimeError, 'not implemented'
end
88
+
89
# Returns the current writer, opening one via +open_writer+ first if
# none is set. Runs while holding @writer_monitor.
def writer
  @writer_monitor.synchronize do
    @writer = open_writer unless @writer
    @writer
  end
end
94
+
95
+
96
# Closes and drops the writer.
#
# Listeners are notified with +:closing_writer+ on every call, even when
# no writer is open. Runs while holding @writer_monitor.
def close_writer
  @writer_monitor.synchronize do
    notify_listeners(:closing_writer)
    if @writer
      @writer.close
      @writer = nil
    end
  end
end
104
+
105
# Builds the FieldInfos for this collection's index.
#
# The +:field_defaults+ option (or an empty hash) is passed to the
# FieldInfos constructor, then every entry of the +:fields+ option is
# added as a field. If no +:id+ field has been defined that way, it is
# added as a stored, untokenized field.
def create_field_infos
  infos = FieldInfos.new(@options[:field_defaults] || {})
  configured = @options[:fields]
  configured.each { |field, definition| infos.add_field(field, definition) } if configured
  # provide default settings for :id field
  infos.add_field(:id, :store => :yes, :index => :untokenized) unless infos[:id]
  infos
end
114
+
115
+ end
116
+
117
+ end
118
+
119
+ end