stellr 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,32 @@
1
module Stellr
  module Collections

    # Read-only view spanning several collections: searching happens against
    # a single IndexReader built from every member collection's reader.
    class MultiCollection < SearchableCollection

      # name        - name of this aggregate collection
      # collections - member collections to search across
      # options     - passed through to SearchableCollection
      #
      # Registers a listener on each member so this aggregate can react when
      # a member is about to close its reader.
      def initialize( name, collections, options = {} )
        super name, options
        @collections = {}
        collections.each do |member|
          @collections[member.name] = member
          member.add_listener { |event| handle_event(member.name, event) }
        end
      end

      protected

      # Builds a multi-reader over all member collections' readers.
      def open_reader
        IndexReader.new @collections.values.map { |member| member.reader }
      end

      # Drops our aggregated reader whenever any member announces that it is
      # closing its own reader, so we never hold a stale sub-reader.
      def handle_event(collection_name, event)
        @logger.debug "handle_event: #{event.inspect}"
        close_reader if event == :closing_reader
      end

    end

  end
end
@@ -0,0 +1,38 @@
1
module Stellr
  module Collections

    # RSync collection implementation.
    #
    # Keeps two indexes around - one for searching and one for indexing. The
    # difference when compared with the Static collection is that every
    # now and then the changes are synced from the latter to the former using RSync,
    # so no complete rebuild is necessary after a switch.
    class RSync < Static

      # Called by the strategy after a batch of records has been processed;
      # only switches indexes when something actually changed.
      def batch_finished
        switch if dirty?
      end

      # we want the indexes to be in sync after close, so do a last switch
      alias :close :switch

      # Unlike Static (which always recreates), only recreate the index when
      # explicitly requested via the :recreate option, so incremental syncing
      # keeps working across restarts.
      def writer_options
        super.merge :create => @options[:recreate]
      end

      # overridden to sync the indexes after re-linking
      def relink_indexes
        super
        sync_indexes
      end

      # Mirrors the (now live) searching index into the indexing index so the
      # next build starts from the current state.
      #
      # Uses the multi-argument form of Kernel#system so the directory paths
      # are passed directly to rsync without shell interpretation (a path
      # containing spaces or metacharacters would previously have broken the
      # command), and logs an error when rsync fails instead of silently
      # ignoring a false return value.
      def sync_indexes
        unless system('rsync', '-r', '--delete', "#{searching_directory}/", indexing_directory)
          @logger.error "rsync from #{searching_directory} to #{indexing_directory} failed"
        end
      end

    end

  end
end
@@ -0,0 +1,166 @@
1
module Stellr
  module Collections

    # Base class for searchable collection implementations.
    #
    # Manages a lazily-created Ferret reader / searcher / query parser triple.
    # Reader and searcher access is serialized via @reader_monitor, query
    # parser access via @query_parser_monitor. Subclasses must implement
    # +open_reader+.
    class SearchableCollection < Base

      def initialize( name, options )
        super name, options
        # Monitors are reentrant, so helpers below may nest synchronize calls
        # on the same monitor within one thread.
        @reader_monitor = Monitor.new
        @query_parser_monitor = Monitor.new
        @reader = @searcher = @query_parser = nil
      end


      # Search this collection.
      # Options is a hash taking the usual Ferret::Search::Searcher options,
      # plus:
      # [+page+] Page of results to show, starting with 1
      # [+per_page+] Number of records per page, default 10.
      # [+fields+] Array of fields to search in
      # [+get_fields+] Array of fields to retrieve in addition to the
      #                :id field
      #
      # The page and per_page options take precedence over any given limit and
      # offset values.
      #
      # Returns a Stellr::Search::SearchResults instance.
      def search(query, options = {})
        results = Stellr::Search::SearchResults.new

        if options[:page]
          # translate page/per_page into Ferret's limit/offset
          results.current_page = options.delete(:page).to_i
          options[:limit] = results.per_page = options.delete(:per_page) || 10
          # clamp offset to 0 for page values <= 1
          options[:offset] = (p = results.current_page - 1) <= 0 ? 0 : p * results.per_page
        end

        get_fields = options.delete :get_fields
        # TODO replace synchronization with some kind of shared read/exclusive
        # write locking mechanism allowing parallel searches but guarding
        # against the reader instance being shut down while we're inside
        # retrieve_field_data
        @reader_monitor.synchronize do
          q = process_query query, options
          @logger.debug "options: #{options.inspect}"
          results.total_hits = searcher.search_each q, options do |id, score|
            field_data = retrieve_field_data(id, get_fields)
            results << Stellr::Search::SearchResult.new( id, score, field_data )
          end
          @logger.info "query #{query} : #{results.total_hits} results"
        end
        return results
      end

      # Highlights +query+ matches in the field given by options[:field] of
      # document +doc_id+. Returns '' on any error (errors are logged, never
      # raised to the caller).
      def highlight( doc_id, query, options = {})
        return searcher.highlight(process_query(query, options), doc_id, options[:field], options)
      rescue
        @logger.error "error in highlight: #{$!}. Document #{doc_id}, Query: #{query}, options: #{options.inspect}"
        ''
      end

      # Number of (non-deleted) documents in the index.
      def size
        reader.num_docs
      end

      # Hook called by the server on shutdown; +mode+ is ignored here.
      def on_shutdown( mode )
        close
      end

      # close this collection
      def close
        close_reader
      end

      protected

      # should open a reader and return it
      def open_reader
        raise 'not implemented'
      end

      # Lazily opens and caches the reader under @reader_monitor.
      def reader
        @reader_monitor.synchronize do
          @reader ||= open_reader
        end
      end

      # Lazily creates and caches a searcher on top of +reader+.
      def searcher
        @reader_monitor.synchronize do
          @searcher ||= Ferret::Search::Searcher.new reader
        end
      end

      # Lazily creates and caches the query parser. Callers that also touch
      # the reader should already hold @reader_monitor (see process_query).
      def query_parser
        @query_parser_monitor.synchronize do
          @query_parser ||= create_query_parser
        end
      end

      # Builds a Ferret query parser; AND semantics by default
      # (:or_default => false).
      def create_query_parser(options = {})
        Ferret::QueryParser.new( { :analyzer => create_analyzer, :or_default => false }.merge( options ) )
      end

      # reads field data for +:id+ and any other given fields from
      # the document given by +id+
      # unsynchronized reader access occurs, so only use from within blocks
      # synchronizing on @reader_monitor.
      def retrieve_field_data(id, fields = nil)
        doc = reader[id]
        field_data = { :id => doc[:id] }
        fields.each do |f|
          field_data[f] = doc[f]
        end if fields
        return field_data
      end

      # Turn a query string into a Ferret Query object.
      # unsynchronized reader access occurs, so only use from
      # within blocks synchronizing on @reader_monitor.
      #
      # Non-string queries are passed through (dup'ed) unchanged.
      # NOTE(review): on a parse error this logs and returns nil (the value of
      # @logger.error) - callers such as #search then hand nil to Ferret.
      # Presumably Ferret tolerates that; verify before relying on it.
      def process_query(query, options)
        @logger.debug "process_query: #{query.inspect}"
        q = query.dup
        if String === q
          @query_parser_monitor.synchronize do
            qp = query_parser
            tokenized_fields = reader.tokenized_fields
            # search the explicitly given fields, or all tokenized fields
            qp.fields = options[:fields] || tokenized_fields # reader.fields
            qp.tokenized_fields = tokenized_fields
            @logger.debug "tokenized_fields: #{tokenized_fields}"
            q = qp.parse q
          end
        end
        @logger.debug "processed query: #{q.inspect}"
        return q
      rescue
        @logger.error "error processing query: #{$!}"
      end

      # Base directory of this collection, from the :path option.
      def collection_directory
        @options[:path]
      end

      # Closes reader, searcher and drops the cached query parser. Listeners
      # are notified (:closing_reader) before anything is torn down, even if
      # no reader is currently open.
      def close_reader
        @reader_monitor.synchronize do
          notify_listeners( :closing_reader )
          return unless @reader
          # the parser caches reader-derived state, so drop it too
          @query_parser = nil
          @searcher.close if @searcher
          @searcher = nil
          @reader.close
          @reader = nil
        end
      end

      # TODO allow declarative analyzer specification in options
      # Instantiates the analyzer class named in options[:analyzer]
      # (presumably a String - constantize is called on it), falling back to
      # Ferret's StandardAnalyzer.
      def create_analyzer
        if class_name = @options[:analyzer]
          @logger.debug "instantiating analyzer #{class_name}"
          return class_name.constantize.new
        end
        return Ferret::Analysis::StandardAnalyzer.new
      end

    end

  end

end
@@ -0,0 +1,97 @@
1
module Stellr
  module Collections

    # Static collection implementation.
    #
    # This kind of collection is for situations where your index usually doesn't
    # change but instead is rebuilt from scratch once in a while.
    #
    # This collection keeps two indexes, one for searching, and one where all index
    # modifications take place. Once you are finished building the new index,
    # call the switch method to put it live. The old index is then dropped and
    # the new one is used for searching from now on.
    #
    # On disk the two indexes live in numbered directories ('0' and '1') below
    # the collection directory; 'searching' and 'indexing' are symlinks that
    # are swapped on switch.
    class Static < WriteableCollection

      def initialize( name, options )
        super( name, options )
        # eagerly open both sides so directory/symlink setup and index
        # creation happen at construction time rather than on first use
        reader
        writer
      end

      # Puts the index built so far live: flushes and optimizes it, closes
      # reader and writer, swaps the two symlinks and resets the dirty flag.
      #
      # NOTE(review): this takes @writer_monitor then @reader_monitor, while
      # reader-side code holds @reader_monitor when it calls open_reader
      # (which may call switch). Monitors are reentrant per thread, but the
      # cross-thread lock ordering deserves a review.
      def switch
        @logger.info "switching indexes"
        @writer_monitor.synchronize do
          flush
          optimize
          close_writer
          @reader_monitor.synchronize do
            close_reader
            relink_indexes
            clear!
          end
        end
      end

      protected

      # Opens the writer on the indexing side, creating the on-disk layout
      # first if it does not exist yet.
      def open_writer
        # File.exist? - the File.exists? alias is deprecated and was removed
        # in Ruby 3.2
        create_directories unless File.exist? indexing_directory # and File.exist? searching_directory
        IndexWriter.new writer_options
      end

      # Writer configuration; always recreates the indexing index
      # (subclasses like RSync override this).
      def writer_options
        {
          :path => indexing_directory,
          :create => true,
          :field_infos => create_field_infos,
          :analyzer => create_analyzer
        }
      end

      # Opens a reader on the searching side. If the searching symlink is
      # missing or points to a not-yet-built index, a switch is forced to
      # create/publish one, retrying the open once.
      # NOTE(review): if the retry fails again the rescue clause falls
      # through and this returns nil - callers would then crash on a nil
      # reader; confirm whether that can happen in practice.
      def open_reader
        already_retried = false
        begin
          switch unless File.symlink? searching_directory
          IndexReader.new searching_directory
        rescue Ferret::FileNotFoundError
          switch
          unless already_retried
            already_retried = true
            retry
          end
        end
      end

      # Creates the two index storage directories and the initial
      # indexing/searching symlinks pointing at them.
      def create_directories
        FileUtils.mkdir_p index_storage_directory( '0' )
        FileUtils.mkdir_p index_storage_directory( '1' )
        FileUtils.ln_s index_storage_directory( '0' ), indexing_directory, :force => true
        FileUtils.ln_s index_storage_directory( '1' ), searching_directory, :force => true
      end

      # Path of the 'indexing' symlink.
      def indexing_directory
        File.join( collection_directory, "indexing" )
      end

      # Path of the 'searching' symlink.
      def searching_directory
        File.join( collection_directory, "searching" )
      end

      # Path of a physical index directory ('0' or '1').
      def index_storage_directory( suffix )
        File.join( collection_directory, suffix )
      end

      # Swaps the indexing and searching symlinks so the freshly built index
      # becomes the one that is searched.
      def relink_indexes
        searching = File.readlink( searching_directory ).untaint
        indexing = File.readlink( indexing_directory ).untaint
        @logger.info "relink_indexes: #{searching} will now be used for indexing"
        File.delete indexing_directory
        File.delete searching_directory
        FileUtils.ln_s indexing, searching_directory, :force => true
        FileUtils.ln_s searching, indexing_directory, :force => true
      end

    end

  end
end
@@ -0,0 +1,119 @@
1
module Stellr
  module Collections

    # Base class for collection implementations that allow index updates.
    #
    # All writer access is serialized through @writer_monitor; the number of
    # records processed since the last clear! is tracked so strategies can
    # ask whether anything changed (see #dirty?).
    class WriteableCollection < SearchableCollection

      def initialize( name, options )
        super
        @writer_monitor = Monitor.new
        @processed_records = 0
        @writer = nil
      end

      # Adds the given record to the index.
      #
      # Record may be a hash, or a Ferret::Document instance. Any existing
      # document with the same :id is deleted first, so :id stays unique.
      # Raises ArgumentError when the record has no :id field.
      # Returns true.
      def add_record( record, boost = nil )
        raise ArgumentError, "record must contain :id field" if record[:id].nil?
        record = boosted_record( record, boost ) if boost
        @writer_monitor.synchronize do
          @processed_records += 1
          index_writer = writer
          index_writer.delete :id, record[:id].to_s # ensure uniqueness by :id field
          index_writer << record
        end
        true
      end
      alias :<< :add_record

      # Removes the record with the same :id from the index.
      # Raises ArgumentError when the record has no :id field.
      # Returns true.
      def delete_record( record )
        raise ArgumentError, "record must contain :id field" if record[:id].nil?
        @writer_monitor.synchronize do
          @processed_records += 1
          writer.delete :id, record[:id].to_s
        end
        true
      end

      # true if records have been processed since the last call to clear!
      def dirty?
        @processed_records > 0
      end

      # Resets the processed-records counter (and thereby #dirty?).
      def clear!
        @processed_records = 0
      end

      # called whenever the strategy thinks it's a good time do do something
      # timeconsuming (like switching indexes, optimizing, flushing, ...)
      def batch_finished
      end

      # close this collection
      def close
        close_writer
        super
      end

      # flush any unwritten changes to the index
      def flush
        @writer_monitor.synchronize { writer.commit }
      end

      # optimize the index
      def optimize
        @writer_monitor.synchronize { writer.optimize }
      end

      protected

      # should open a writer and return it
      def open_writer
        raise 'not implemented'
      end

      # Lazily opens and caches the writer under @writer_monitor.
      def writer
        @writer_monitor.synchronize do
          @writer ||= open_writer
        end
      end

      # Applies +boost+ to +record+: Ferret documents are boosted in place,
      # plain hashes are copied into a fresh boosted Ferret::Document.
      def boosted_record( record, boost )
        if Ferret::Document === record
          record.boost = boost
          record
        else
          doc = Ferret::Document.new( boost )
          record.each_pair { |field, value| doc[field] = value }
          doc
        end
      end

      # Closes the writer (if open) after notifying listeners. Listeners are
      # notified (:closing_writer) even when no writer is currently open.
      def close_writer
        @writer_monitor.synchronize do
          notify_listeners( :closing_writer )
          return unless @writer
          @writer.close
          @writer = nil
        end
      end

      # Builds the Ferret field infos from the :fields / :field_defaults
      # options, making sure the :id field is stored and untokenized unless
      # configured otherwise.
      def create_field_infos
        field_infos = FieldInfos.new @options[:field_defaults] || {}
        if @options[:fields]
          @options[:fields].each do |field_name, definition|
            field_infos.add_field( field_name, definition )
          end
        end
        # provide default settings for :id field
        field_infos.add_field :id, :store => :yes, :index => :untokenized unless field_infos[:id]
        field_infos
      end

    end

  end

end