lucene 0.5.0.beta.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,267 @@
1
+ require 'monitor'
2
+ require 'lucene/jars'
3
+ require 'lucene/transaction'
4
+ require 'lucene/index_searcher'
5
+ require 'lucene/document'
6
+ require 'lucene/field_info'
7
+ require 'lucene/index_info'
8
+
9
+ #
10
+ # A wrapper for the Java lucene search library.
11
+ #
12
+ module Lucene
13
+
14
# Error type for operations that touch a deleted document.
# NOTE(review): it is not raised anywhere in the visible part of this file;
# confirm the exact conditions at the call sites.
class DocumentDeletedException < StandardError
end
16
# Error type signalling that a document lacks its id field.
# NOTE(review): it is not raised anywhere in the visible part of this file;
# confirm the exact conditions at the call sites.
class IdFieldMissingException < StandardError
end
18
+
19
+ #
20
+ # Represents a Lucene Index.
21
+ # The index is written/updated only when the commit method is called.
22
+ # This is done since writing to the index file should be done as a batch operation.
23
+ # (Performance will be bad otherwise).
24
+ #
25
+ # For each Thread there is zero or one Index instance. There are at most one Index instance per thread
26
+ # so there is no need for this class to use synchronization for Thread safety.
27
+ #
28
class Index
  attr_reader :path, :uncommited

  # One Monitor per index path: two threads must never write to the same index
  # at the same time. The hash itself is extended with MonitorMixin so that
  # looking up / creating a per-path lock can be synchronized as well.
  @@locks = {}
  @@locks.extend MonitorMixin

  # Sets up an index with no pending changes. Instances are normally obtained
  # through the Index.new(path) factory below, which supplies the IndexInfo.
  #
  # ==== Parameters
  # path<String>:: a key (i.e. filepath) where the index is stored on disk/or RAM
  # index_info<IndexInfo>:: configuration and the actual storage of the index
  #
  def initialize(path, index_info)
    @path = path
    @index_info = index_info
    @uncommited = {}  # documents to be commited, a hash of Document keyed by Document#id
    @deleted_ids = [] # ids (stored as strings) of the documents to be deleted
  end

  # Returns the IndexInfo registered for the path of this index.
  def field_infos
    IndexInfo.instance(@path)
  end

  # Returns an Index instance for the current running transaction.
  #
  # Tries to reuse an Index instance for the current running transaction.
  # A new Lucene::Transaction is created if none is running. The index is
  # registered in the current transaction if this has not already been done.
  # The configuration (kept in the #field_infos) for this index will be the
  # same for all indexes with the same path/key.
  #
  # ==== Parameters
  # path<String>:: The key or location where the index should be stored (relative Lucene::Config[:storage_path])
  #
  # ==== Examples
  #   Index.new 'foo/lucene-db'
  #
  # ==== Returns
  # Returns a new or an already existing Index
  #
  def self.new(path)
    # make sure no one modifies the index specified at given path
    lock(path).synchronize do
      # create a new transaction if needed
      Transaction.new unless Transaction.running?

      # create a new instance only if it does not already exist in the current transaction
      unless Transaction.current.index?(path)
        info = IndexInfo.instance(path)
        index = super(path, info)
        Transaction.current.register_index(path, index)
      end
    end
    # return the index for the current transaction
    Transaction.current.index(path)
  end

  # Returns the Monitor guarding the index at the given path, creating it on
  # first use. Used by Index.new and by the instance level #lock.
  # (In the original layout this method followed the +private+ keyword, which
  # has no effect on singleton methods, so it has always been publicly callable.)
  def self.lock(path)
    @@locks.synchronize { @@locks[path] ||= Monitor.new }
  end

  #
  # Delete all uncommited documents. Also deregister this index
  # from the current transaction (if there is one transaction).
  # Pending deletions (see #delete) are left untouched.
  #
  def clear
    @uncommited.clear
    Transaction.current.deregister_index self if Transaction.running?
  end

  #
  # See instance method Index.clear
  # Does nothing unless a transaction is running and it has an index registered for the path.
  #
  def self.clear(path)
    return unless Transaction.running?
    return unless Transaction.current.index?(path)
    Transaction.current.index(path).clear
  end

  # Creates a new document from the given hash of values.
  # This document will be stored in this instance till it is commited.
  #
  # ==== Parameters
  # key_values<Hash>:: the field values handed to Document.new
  #
  # ==== Examples
  #   index = Index.new('name_or_path_to_index')
  #   index << {:id=>'1', :name=>'foo'}
  #
  # ==== Returns
  # Returns the index instance so that this method can be chained
  #
  def <<(key_values)
    doc = Document.new(field_infos, key_values)
    @uncommited[doc.id] = doc
    self
  end

  # Returns the name of the id field as configured in the IndexInfo.
  def id_field
    @index_info.id_field
  end

  #
  # Updates the specified document.
  # The doc is only stored in memory (keyed by its id) until #commit is called.
  #
  def update(doc)
    @uncommited[doc.id] = doc
  end

  #
  # Delete the specified document.
  # The id (converted to a string) is only stored in memory until #commit is called.
  #
  def delete(id)
    @deleted_ids << id.to_s
  end

  # Returns true if the document with the given id is scheduled for deletion.
  def deleted?(id)
    @deleted_ids.include?(id.to_s)
  end

  # Returns the pending (uncommited) document stored under the string form of
  # the given id, or nil if there is none.
  def updated?(id)
    @uncommited[id.to_s]
  end

  # Writes to the index files.
  # Open and closes an lucene IndexWriter.
  # This method will automatically be called from a Lucene::Transaction if it was running when the index was created.
  #
  # This method is synchronized (one lock per index path) since it is not allowed
  # to update a lucene index from several threads at the same time.
  #
  # The pending documents and deletions are cleared and the index is deregistered
  # from a running transaction even when writing fails; the failure is re-raised.
  #
  def commit
    lock.synchronize do
      delete_documents # deletes all documents given @deleted_ids

      # make sure we don't index documents that were both updated and deleted
      (@uncommited.keys & @deleted_ids).each { |id| @uncommited.delete(id) }

      index_writer = nil
      begin
        # the last argument asks lucene to create the index if it does not exist yet
        index_writer = org.apache.lucene.index.IndexWriter.new(@index_info.storage, @index_info.analyzer, !exist?)
        # removes the document and adds it again
        @uncommited.each_value { |doc| doc.update(index_writer) }
      ensure
        # index_writer is nil when IndexWriter.new itself failed; calling close on
        # nil would replace the real error with a NoMethodError
        index_writer.close unless index_writer.nil?

        @uncommited.clear
        @deleted_ids.clear

        # if we are running in a transaction remove this so it will not be committed twice
        Transaction.current.deregister_index(self) if Transaction.running?
      end
    end
  end

  #
  # Delegates to the IndexSearcher.
  #
  # With a block the query is built by IndexSearcher#find_dsl. Without a block
  # the first argument decides: a String hands all arguments to IndexSearcher#find,
  # a Hash or Array hands only the first argument over. Any other first argument
  # (or none at all) returns nil.
  #
  def find(*query, &block)
    # new method is a factory method, does not create if it already exists
    searcher = IndexSearcher.new(@index_info.storage)

    return searcher.find_dsl(@index_info, &block) unless block.nil?

    case query.first
    when String
      searcher.find(@index_info, query)
    when Hash, Array
      searcher.find(@index_info, query.first)
    end
  end

  def to_s
    "Index [path: '#{@path}', #{@uncommited.size} documents]"
  end

  #
  # -------------------------------------------------------------------------
  # Private methods
  #

  private

  #
  # There is one lock per index path.
  #
  def lock
    self.class.lock(@path)
  end

  #
  # Returns true if the index already exists.
  #
  def exist?
    @index_info.index_exists?
  end

  # Removes every document whose id is listed in @deleted_ids from the lucene index.
  def delete_documents # :nodoc:
    return unless exist? # if no index exists then there is nothing to do

    writer = org.apache.lucene.index.IndexWriter.new(@index_info.storage, @index_info.analyzer, false)
    id_field = @index_info.infos[@index_info.id_field]

    @deleted_ids.each do |id|
      converted_value = id_field.convert_to_lucene(id)
      writer.deleteDocuments(org.apache.lucene.index.Term.new(@index_info.id_field.to_s, converted_value))
    end
  ensure
    # writer is nil when we returned early or when IndexWriter.new failed
    writer.close unless writer.nil?
  end
end
265
+ end
266
+
267
+
@@ -0,0 +1,146 @@
1
+ module Lucene
2
+
3
+ #
4
+ # Contains info for a specific Index identified by a path
5
+ # Contains a
6
+ # * collection of FieldInfo objects.
7
+ # * the name of the id field.
8
+ # * the index storage, either file based or RAM based.
9
+ #
10
+ # Fields has default value IndexInfo::DEFAULTS.
11
+ #
12
class IndexInfo #:nodoc:
  # Template for fields that were never configured; #[] hands out copies of it.
  DEFAULTS = FieldInfo.new({}).freeze

  attr_reader :infos, :path
  attr_accessor :id_field
  attr_writer :store_on_file

  # Initializes this object by setting values to default values specified in the Lucene::Config
  # (:id_field and :store_on_file). The id field is always registered as a stored field.
  #
  # ==== Parameters
  # path<String>:: The id or the (incomplete) path on the filesystem of the index
  #
  # :api: private
  def initialize(path)
    $LUCENE_LOGGER.debug { "IndexInfo#initialize(#{path})" }
    @id_field = Lucene::Config[:id_field].to_sym
    @path = path
    @store_on_file = Lucene::Config[:store_on_file]
    # always store the id field
    @infos = { @id_field => FieldInfo.new(:store => true) }
  end

  def to_s
    "IndexInfo [#{@id_field}, #{@infos.inspect}]"
  end

  # Returns the store_on_file setting (initially Lucene::Config[:store_on_file]).
  def store_on_file?
    @store_on_file
  end

  # Returns the storage of the index; it is created on first use and then reused.
  def storage
    @storage ||= create_storage
  end

  # Creates the storage: the string Lucene::Config[:storage_path] + path when
  # the index is stored on file, otherwise a new lucene RAMDirectory.
  #
  # Raises StandardError if the index should be stored on file but
  # Lucene::Config[:storage_path] is nil.
  #
  # NOTE(review): the two path parts are concatenated without a separator, so
  # one of them presumably has to supply the '/' - confirm with the callers.
  def create_storage
    return org.apache.lucene.store.RAMDirectory.new unless store_on_file?

    if Lucene::Config[:storage_path].nil?
      raise StandardError.new("Lucene::Config[:storage_path] is nil but index configured to be stored on filesystem")
    end
    Lucene::Config[:storage_path] + @path
  end

  # Returns true if an IndexInfo has already been created for the given path.
  def self.instance?(path)
    return false if @instances.nil?
    !@instances[path].nil?
  end

  # Creates and initializes an IndexInfo object by setting values to default
  # values specified in the Lucene::Config. Does not create new object if it has
  # already been created before with the given path.
  #
  # If the index is stored on the filesystem the complete path will be
  # Lucene::Config[:storage_path] + /path
  #
  # ==== Parameters
  # path<String>:: The id or the (incomplete) path on the filesystem of the index
  #
  # :api: public
  def self.instance(path)
    @instances ||= {}
    $LUCENE_LOGGER.debug { "IndexInfos#instance(#{path}) : @instances[path]: #{@instances[path]}" }
    @instances[path] ||= IndexInfo.new(path)
  end

  # Forgets all IndexInfo objects created so far.
  def self.delete_all
    $LUCENE_LOGGER.debug { "IndexInfos#delete_all" }
    @instances = nil
  end

  # Returns true if an IndexInfo is known for the path and its lucene index exists.
  # Returns false (instead of failing on the missing registry) when no IndexInfo
  # has been created yet or after delete_all.
  def self.index_exists(path)
    return false unless instance?(path)
    instance(path).index_exists?
  end

  # Asks lucene whether an index exists in the storage of this object.
  def index_exists?
    org.apache.lucene.index.IndexReader.index_exists(storage)
  end

  # Yields each field key together with its FieldInfo.
  def each_pair
    @infos.each_pair { |key, value| yield key, value }
  end

  # Returns the lucene analyzer for this index: a plain StandardAnalyzer when
  # every field uses :standard, otherwise a PerFieldAnalyzerWrapper (defaulting
  # to the StandardAnalyzer) with one analyzer added per non standard field.
  #
  # Raises RuntimeError for an analyzer other than
  # :keyword, :standard, :simple, :stop and :whitespace.
  def analyzer
    # do all fields have the default value :standard analyzer ?
    if @infos.values.all? { |info| info[:analyzer] == :standard }
      return org.apache.lucene.analysis.standard.StandardAnalyzer.new
    end

    # no, one or more fields have their own analyzer
    wrapper = org.apache.lucene.analysis.PerFieldAnalyzerWrapper.new(org.apache.lucene.analysis.standard.StandardAnalyzer.new)
    @infos.each_pair do |key, value|
      case value[:analyzer]
      when :standard
        # covered by the default analyzer of the wrapper
      when :keyword
        wrapper.addAnalyzer(key.to_s, org.apache.lucene.analysis.KeywordAnalyzer.new)
      when :simple
        wrapper.addAnalyzer(key.to_s, org.apache.lucene.analysis.SimpleAnalyzer.new)
      when :whitespace
        wrapper.addAnalyzer(key.to_s, org.apache.lucene.analysis.WhitespaceAnalyzer.new)
      when :stop
        wrapper.addAnalyzer(key.to_s, org.apache.lucene.analysis.StopAnalyzer.new)
      else
        raise "Unknown analyzer, supports :keyword, :standard, :simple, :stop, :whitespace, got '#{value}' for field '#{key}'"
      end
    end
    wrapper
  end

  # Returns true if it has one or more tokenized fields
  def tokenized?
    @infos.values.any? { |field_info| field_info.tokenized? }
  end

  # Returns the FieldInfo for the given field (the key is converted to a symbol).
  # A field that is not known yet gets its own copy of DEFAULTS.
  def [](key)
    k = key.to_sym
    $LUCENE_LOGGER.debug { "FieldInfos create new FieldInfo key '#{k}'" } if @infos[k].nil?
    @infos[k] ||= DEFAULTS.dup
  end

  # Sets the FieldInfo for the given field. A Hash is wrapped in a new FieldInfo.
  # The key is converted to a symbol so that the value is found again by #[].
  #
  # Raises ArgumentError if value is neither a Hash nor a FieldInfo.
  def []=(key, value)
    case value
    when Hash then @infos[key.to_sym] = FieldInfo.new(value)
    when FieldInfo then @infos[key.to_sym] = value
    else raise ArgumentError.new("only accept Hash and FieldInfo, got #{value.class.to_s}")
    end
  end
end
146
+ end
@@ -0,0 +1,157 @@
1
+ module Lucene
2
+
3
# Builds lucene sort fields for ascending order, see Asc.[]
class Asc
  class << self
    # Specifies which fields should be sorted in ascending order
    #
    # ==== Parameters
    # fields:: One or more field names to sort in ascending order; each is converted with to_s
    #
    # ==== Examples
    #   Asc[:name, :age]
    #
    # ==== Returns
    # An array of lucene SortField objects, one per given field
    #
    def [](*fields)
      fields.map { |field| org.apache.lucene.search.SortField.new(field.to_s) }
    end
  end
end
22
+
23
# Builds lucene sort fields for descending order, see Desc.[]
class Desc
  class << self
    # Specifies which fields should be sorted in descending order
    #
    # ==== Parameters
    # fields:: One or more field names to sort in descending order; each is converted with to_s
    #
    # ==== Examples
    #   Desc[:name, :age]
    #
    # ==== Returns
    # An array of lucene SortField objects, one per given field
    #
    def [](*fields)
      # the second argument (true) asks lucene for the reversed sort order
      fields.map { |field| org.apache.lucene.search.SortField.new(field.to_s, true) }
    end
  end
end
42
+
43
+ #
44
+ # Does reuse Lucene Index Search for the same index.
45
+ # Reloads the index if the index has changed.
46
+ #
47
class IndexSearcher
  # the searchers created so far, keyed by path (see IndexSearcher.new)
  @@paths = {}

  def initialize(path)
    @path = path
  end

  #
  # Only create a new object if it does not already exist for this path
  #
  def self.new(path)
    @@paths[path] = super(path) if @@paths[path].nil?
    @@paths[path]
  end

  # Runs the query described by the given block (parsed by QueryDSL).
  #
  # ==== Returns
  # A Hits object wrapping the lucene search result
  #
  def find_dsl(field_infos, &block)
    exp = QueryDSL.parse(&block)
    query = exp.to_lucene(field_infos)

    Hits.new(field_infos, index_searcher.search(query))
  end

  # Searches the index.
  #
  # ==== Parameters
  # field_info:: the index configuration, provides the id field, the analyzer and the field infos
  # query:: either an Array whose first element is a lucene query string, optionally followed by
  #         an options Hash, or a Hash of field => value pairs that all must match.
  #         In both forms the key :sort_by of the Hash selects the sort order (see #create_sort).
  #
  # The given query is not modified.
  #
  # ==== Returns
  # An empty array if there is no index, otherwise a Hits object
  #
  # ==== Raises
  # StandardError:: if query is neither an Array nor a Hash
  #
  def find(field_info, query)
    # are there any index for this node ?
    # if not return an empty array
    return [] unless exist?

    sort_by = nil
    lucene_query =
      case query
      when Array
        sort_by = sort_option(query[1], query.last)
        parser = org.apache.lucene.queryParser.QueryParser.new(field_info.id_field.to_s, field_info.analyzer)
        parser.parse(query.first)
      when Hash
        sort_by = sort_option(query)
        bquery = org.apache.lucene.search.BooleanQuery.new
        query.each_pair do |key, value|
          next if key == :sort_by # an option, not a field to search for
          field = field_info[key]
          q = field.convert_to_query(key, value)
          bquery.add(q, org.apache.lucene.search.BooleanClause::Occur::MUST)
        end
        bquery
      else
        raise StandardError.new("Unknown type #{query.class.to_s} for find #{query}")
      end

    if sort_by.nil?
      Hits.new(field_info, index_searcher.search(lucene_query))
    else
      Hits.new(field_info, index_searcher.search(lucene_query, create_sort(sort_by)))
    end
  end

  # Converts one sort specification into an array of lucene SortField objects.
  # Accepts a String or Symbol (field name), a SortField, or an Array starting
  # with a SortField (as returned by Asc[] and Desc[]).
  #
  # Raises StandardError for anything else.
  def parse_field(field)
    case field
    when String, Symbol
      [org.apache.lucene.search.SortField.new(field.to_s)]
    when org.apache.lucene.search.SortField
      [field]
    when Array
      raise StandardError.new("Unknown sort field '#{field}'") unless field.first.kind_of?(org.apache.lucene.search.SortField)
      field
    else
      # returning nil here would only fail later when the result is appended to an array
      raise StandardError.new("Unknown sort field '#{field}'")
    end
  end

  # Creates a lucene Sort from a field name (String or Symbol), a SortField or
  # an Array of anything accepted by #parse_field.
  #
  # Raises StandardError for any other type.
  def create_sort(fields)
    case fields
    when String, Symbol
      org.apache.lucene.search.Sort.new(fields.to_s)
    when org.apache.lucene.search.SortField
      org.apache.lucene.search.Sort.new(fields)
    when Array
      sorts = fields.inject([]) { |all, field| all + parse_field(field) }
      org.apache.lucene.search.Sort.new(sorts.to_java(:'org.apache.lucene.search.SortField'))
    else
      raise StandardError.new("Unknown type #{fields.class.to_s}")
    end
  end

  #
  # Checks if it needs to reload the index searcher
  #
  # A new reader and searcher are opened the first time and whenever the
  # version of the opened reader differs from the current version of the index.
  # NOTE(review): a replaced reader is not closed here; confirm whether results
  # handed out earlier still depend on it before adding a close.
  #
  def index_searcher
    if @index_reader.nil? || @index_reader.getVersion() != org.apache.lucene.index.IndexReader.getCurrentVersion(@path)
      @index_reader = org.apache.lucene.index.IndexReader.open(@path)
      @index_searcher = org.apache.lucene.search.IndexSearcher.new(@index_reader)
      $LUCENE_LOGGER.debug("Opened new IndexSearcher for #{to_s}")
    end
    @index_searcher
  end

  #
  # Returns true if the index already exists.
  #
  def exist?
    org.apache.lucene.index.IndexReader.index_exists(@path)
  end

  private

  # Returns the first :sort_by value found in those of the given candidates
  # that are hashes, or nil. Does not modify the candidates.
  def sort_option(*candidates)
    candidates.each do |candidate|
      next unless candidate.kind_of?(Hash)
      return candidate[:sort_by] if candidate[:sort_by]
    end
    nil
  end
end
157
+ end