lucene 0.5.0.beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,267 @@
1
+ require 'monitor'
2
+ require 'lucene/jars'
3
+ require 'lucene/transaction'
4
+ require 'lucene/index_searcher'
5
+ require 'lucene/document'
6
+ require 'lucene/field_info'
7
+ require 'lucene/index_info'
8
+
9
+ #
10
+ # A wrapper for the Java lucene search library.
11
+ #
12
+ module Lucene
13
+
14
+ class DocumentDeletedException < StandardError;
15
+ end
16
+ class IdFieldMissingException < StandardError;
17
+ end
18
+
19
+ #
20
+ # Represents a Lucene Index.
21
+ # The index is written/updated only when the commit method is called.
22
+ # This is done since writing to the index file should be done as a batch operation.
23
+ # (Performance will be bad otherwise).
24
+ #
25
+   # For each Thread there is zero or one Index instance. There is at most one Index instance per thread
26
+ # so there is no need for this class to use synchronization for Thread safety.
27
+ #
28
+ class Index
29
+ attr_reader :path, :uncommited
30
+
31
+
32
+ # locks per index path, must not write to the same index from 2 threads
33
+ @@locks = {}
34
+ @@locks.extend MonitorMixin
35
+
36
+ def initialize(path, index_info)
37
+ @path = path # a key (i.e. filepath) where the index is stored on disk/or RAM
38
+ @index_info = index_info # the actual storage of the index
39
+ @uncommited = {} # documents to be commited, a hash of Document
40
+ @deleted_ids = [] # documents to be deleted
41
+ end
42
+
43
+ def field_infos
44
+ IndexInfo.instance(@path)
45
+ end
46
+
47
+
48
+ # Returns an Index instance for the current running transaction.
49
+ #
50
+ # Tries to reuse an Index instance for the current running transaction.
51
+ # If a Lucene::Transaction is running it will register this index in that transaction if
52
+ # this has not already been done.
53
+ # When it has been registered in the transaction the transaction will commit the index
54
+ # when the transaction is finished.
55
+ # The configuration (kept in the #field_infos) for this index will be the same for all indexes with the same path/key.
56
+ #
57
+ # ==== Parameters
58
+ # path<String>:: The key or location where the index should be stored (relative Lucene::Config[:storage_path]
59
+ #
60
+ # ==== Examples
61
+ # Index.new 'foo/lucene-db'
62
+ #
63
+ # ==== Returns
64
+ # Returns a new or an already existing Index
65
+ #
66
+ def self.new(path)
67
+ # make sure no one modifies the index specified at given path
68
+ lock(path).synchronize do
69
+ # create a new transaction if needed
70
+ Transaction.new unless Transaction.running?
71
+
72
+ # create a new instance only if it does not already exist in the current transaction
73
+ unless Transaction.current.index?(path)
74
+ info = IndexInfo.instance(path)
75
+ index = super(path, info)
76
+ Transaction.current.register_index(path, index)
77
+ end
78
+ end
79
+ # return the index for the current transaction
80
+ Transaction.current.index(path)
81
+ end
82
+
83
+
84
+ #
85
+ # Delete all uncommited documents. Also deregister this index
86
+ # from the current transaction (if there is one transaction)
87
+ #
88
+ def clear
89
+ @uncommited.clear
90
+ Transaction.current.deregister_index self if Transaction.running?
91
+ end
92
+
93
+ #
94
+ # See instance method Index.clear
95
+ #
96
+ def self.clear(path)
97
+ return unless Transaction.running?
98
+ return unless Transaction.current.index?(path)
99
+ Transaction.current.index(path).clear
100
+ end
101
+
102
+ # Creates a new document from the given hash of values.
103
+ # This document will be stored in this instance till it is commited.
104
+ #
105
+ # ==== Parameters
106
+ # path<String>:: The key or location where the index should be stored (relative Lucene::Config[:storage_path]
107
+ #
108
+ # ==== Examples
109
+ # index = Index.new('name_or_path_to_index')
110
+ # index << {:id=>'1', :name=>'foo'}
111
+ #
112
+ # ==== Returns
113
+ # Returns the index instance so that this method can be chained
114
+ #
115
+ def <<(key_values)
116
+ doc = Document.new(field_infos, key_values)
117
+ @uncommited[doc.id] = doc
118
+ self
119
+ end
120
+
121
+ def id_field
122
+ @index_info.id_field
123
+ end
124
+
125
+ #
126
+ # Updates the specified document.
127
+ # The index file will not be updated until the transaction commits.
128
+ # The doc is stored in memory till the transaction commits.
129
+ #
130
+ def update(doc)
131
+ @uncommited[doc.id] = doc
132
+ end
133
+
134
+ #
135
+ # Delete the specified document.
136
+ # The index file not be updated until the transaction commits.
137
+ # The id of the deleted document is stored in memory till the transaction commits.
138
+ #
139
+ def delete(id)
140
+ @deleted_ids << id.to_s
141
+ end
142
+
143
+
144
+ def deleted?(id)
145
+ @deleted_ids.include?(id.to_s)
146
+ end
147
+
148
+ def updated?(id)
149
+ @uncommited[id.to_s]
150
+ end
151
+
152
+ # Writes to the index files.
153
+ # Open and closes an lucene IndexWriter
154
+ # Close the IndexSearcher so that it will read the updated index next time.
155
+ # This method will automatically be called from a Lucene::Transaction if it was running when the index was created.
156
+ #
157
+ # This method is synchronized since it is not allowed to update a lucene index from several threads at the same time.
158
+ #
159
+ def commit
160
+ lock.synchronize do
161
+ delete_documents # deletes all documents given @deleted_ids
162
+
163
+ # are any updated document deleted ?
164
+ deleted_ids = @uncommited.keys & @deleted_ids
165
+ # make sure we don't index deleted document
166
+ deleted_ids.each {|id| @uncommited.delete(id)}
167
+
168
+ # update the remaining documents that has not been deleted
169
+
170
+ begin
171
+ index_writer = org.apache.lucene.index.IndexWriter.new(@index_info.storage, @index_info.analyzer, ! exist?)
172
+ # removes the document and adds it again
173
+ @uncommited.each_value { |doc| doc.update(index_writer) }
174
+ ensure
175
+ # TODO exception handling, what if ...
176
+ index_writer.close
177
+
178
+ @uncommited.clear
179
+ @deleted_ids.clear
180
+
181
+ # if we are running in a transaction remove this so it will not be committed twice
182
+ Transaction.current.deregister_index(self) if Transaction.running?
183
+ end
184
+ end
185
+ end
186
+
187
+
188
+ #
189
+ # Delegates to the IndexSearcher.find method
190
+ #
191
+ def find(*query, &block)
192
+ # new method is a factory method, does not create if it already exists
193
+ searcher = IndexSearcher.new(@index_info.storage)
194
+
195
+ if block.nil?
196
+ case query.first
197
+ when String
198
+ return searcher.find(@index_info, query)
199
+ when Hash, Array
200
+ return searcher.find(@index_info, query.first)
201
+ end
202
+ else
203
+ return searcher.find_dsl(@index_info, &block)
204
+ end
205
+ end
206
+
207
+
208
+ def to_s
209
+ "Index [path: '#@path', #{@uncommited.size} documents]"
210
+ end
211
+
212
+ #
213
+ # -------------------------------------------------------------------------
214
+ # Private methods
215
+ #
216
+
217
+ private
218
+
219
+ #
220
+ # There is one lock per index path.
221
+ #
222
+ def lock
223
+ @@locks.synchronize do
224
+ @@locks[@path] ||= Monitor.new
225
+ @@locks[@path]
226
+ end
227
+ end
228
+
229
+ def self.lock(path)
230
+ @@locks.synchronize do
231
+ @@locks[path] ||= Monitor.new
232
+ @@locks[path]
233
+ end
234
+ end
235
+
236
+ #
237
+ # Returns true if the index already exists.
238
+ #
239
+ def exist?
240
+ @index_info.index_exists?
241
+ end
242
+
243
+ #
244
+ # --------------------------------------------------------------------------
245
+ #
246
+ private
247
+
248
+ def delete_documents # :nodoc:
249
+ return unless exist? # if no index exists then there is nothing to do
250
+
251
+ writer = org.apache.lucene.index.IndexWriter.new(@index_info.storage, @index_info.analyzer, false)
252
+ id_field = @index_info.infos[@index_info.id_field]
253
+
254
+ @deleted_ids.each do |id|
255
+ converted_value = id_field.convert_to_lucene(id)
256
+ writer.deleteDocuments(org.apache.lucene.index.Term.new(@index_info.id_field.to_s, converted_value))
257
+ end
258
+ ensure
259
+ # TODO exception handling, what if ...
260
+ writer.close unless writer.nil?
261
+ end
262
+
263
+
264
+ end
265
+ end
266
+
267
+
@@ -0,0 +1,146 @@
1
+ module Lucene
2
+
3
+ #
4
+ # Contains info for a specific Index identified by a path
5
+ # Contains a
6
+ # * collection of FieldInfo objects.
7
+ # * the name of the id field.
8
+ # * the index storage, either file based or RAM based.
9
+ #
10
+ # Fields has default value IndexInfo::DEFAULTS.
11
+ #
12
+ class IndexInfo #:nodoc:
13
+ DEFAULTS = FieldInfo.new({}).freeze
14
+
15
+ attr_reader :infos, :path
16
+ attr_accessor :id_field
17
+ attr_writer :store_on_file
18
+
19
+ # Initializes this object by setting values to default values specified in the Lucene::Config.
20
+ # The path/id to the index is specified by the the path parameter.
21
+ # If the index is Lucene::Config[:storage_path]
22
+ # ==== Block parameters
23
+ # path<String>:: The id or the (incomplete) path on the filesystem of the index
24
+ #
25
+ # :api: private
26
+ def initialize(path)
27
+ $LUCENE_LOGGER.debug{"IndexInfo#initialize(#{path})"}
28
+ @id_field = Lucene::Config[:id_field].to_sym
29
+ @path = path
30
+ @store_on_file = Lucene::Config[:store_on_file]
31
+ @infos = {}
32
+ # always store the id field
33
+ @infos[@id_field] = FieldInfo.new(:store => true)
34
+ end
35
+
36
+ def to_s
37
+ "IndexInfo [#{@id_field}, #{@infos.inspect}]"
38
+ end
39
+
40
+ def store_on_file?
41
+ @store_on_file
42
+ end
43
+
44
+ def storage
45
+ @storage ||= create_storage
46
+ end
47
+
48
+ def create_storage
49
+ if store_on_file?
50
+ raise StandardError.new("Lucene::Config[:storage_path] is nil but index configured to be stored on filesystem") if Lucene::Config[:storage_path].nil?
51
+ Lucene::Config[:storage_path] + @path
52
+ else
53
+ org.apache.lucene.store.RAMDirectory.new
54
+ end
55
+ end
56
+
57
+
58
+ def self.instance?(path)
59
+ return false if @instances.nil?
60
+ ! @instances[path].nil?
61
+ end
62
+
63
+ # Creates and initializes an IndexInfo object by setting values to default
64
+ # values specified in the Lucene::Config. Does not create new object if it has
65
+ # already been created before with the given path.
66
+ #
67
+ # If the index is stored on the filesystem the complete path will be
68
+ # Lucene::Config[:storage_path] + /path
69
+ #
70
+ # ==== Block parameters
71
+ # path<String>:: The id or the (incomplete) path on the filesystem of the index
72
+ #
73
+ # :api: public
74
+ def self.instance(path)
75
+ @instances ||= {}
76
+ $LUCENE_LOGGER.debug{"IndexInfos#instance(#{path}) : @instances[path]: #{@instances[path]}"}
77
+ @instances[path] ||= IndexInfo.new(path)
78
+ end
79
+
80
+ def self.delete_all
81
+ $LUCENE_LOGGER.debug{"IndexInfos#delete_all"}
82
+ @instances = nil
83
+ end
84
+
85
+ def self.index_exists(path)
86
+ return false if @instances[path].nil?
87
+ instance(path).index_exists?
88
+ end
89
+
90
+ def index_exists?
91
+ org.apache.lucene.index.IndexReader.index_exists(storage)
92
+ end
93
+
94
+ def each_pair
95
+ @infos.each_pair{|key,value| yield key,value}
96
+ end
97
+
98
+ def analyzer
99
+ # do all fields have the default value :standard analyzer ?
100
+ if @infos.values.find {|info| info[:analyzer] != :standard}
101
+ # no, one or more has set
102
+ wrapper = org.apache.lucene.analysis.PerFieldAnalyzerWrapper.new(org.apache.lucene.analysis.standard.StandardAnalyzer.new)
103
+ @infos.each_pair do |key,value|
104
+ case value[:analyzer]
105
+ when :keyword
106
+ wrapper.addAnalyzer(key.to_s, org.apache.lucene.analysis.KeywordAnalyzer.new)
107
+ when :standard
108
+ # default
109
+ when :simple
110
+ wrapper.addAnalyzer(key.to_s, org.apache.lucene.analysis.SimpleAnalyzer.new)
111
+ when :whitespace
112
+ wrapper.addAnalyzer(key.to_s, org.apache.lucene.analysis.WhitespaceAnalyzer.new)
113
+ when :stop
114
+ wrapper.addAnalyzer(key.to_s, org.apache.lucene.analysis.StopAnalyzer.new)
115
+ else
116
+ raise "Unknown analyzer, supports :keyword, :standard, :simple, :stop, :whitspace, got '#{value}' for field '#{key}'"
117
+ end
118
+ end
119
+ wrapper
120
+ else
121
+ # yes, all fields has standard analyzer
122
+ org.apache.lucene.analysis.standard.StandardAnalyzer.new
123
+ end
124
+ end
125
+
126
+ # Returns true if it has one or more tokenized fields
127
+ def tokenized?
128
+ @infos.values.find{|field_info| field_info.tokenized?}
129
+ end
130
+
131
+ def [](key)
132
+ k = key.to_sym
133
+ $LUCENE_LOGGER.debug{"FieldInfos create new FieldInfo key '#{k}'"} if @infos[k].nil?
134
+ @infos[k] ||= DEFAULTS.dup
135
+ @infos[k]
136
+ end
137
+
138
+ def []=(key,value)
139
+ case value
140
+ when Hash then @infos[key] = FieldInfo.new(value)
141
+ when FieldInfo then @infos[key] = value
142
+ else raise ArgumentError.new("only accept Hash and FieldInfo, got #{value.class.to_s}")
143
+ end
144
+ end
145
+ end
146
+ end
@@ -0,0 +1,157 @@
1
+ module Lucene
2
+
3
+ class Asc
4
+ class << self
5
+
6
+ # Specifies which fields should be sorted in ascending order
7
+ #
8
+ # ==== Parameters
9
+ # fields:: One or more fields to sort in ascending order (Array)
10
+ #
11
+ # ==== Examples
12
+ # Asc[:name, :age]
13
+ #
14
+ # ==== Returns
15
+ # An array of sort fields
16
+ #
17
+ def [](*fields)
18
+ fields.map{|x| org.apache.lucene.search.SortField.new(x.to_s)}
19
+ end
20
+ end
21
+ end
22
+
23
+ class Desc
24
+ class << self
25
+ # Specifies which fields should be sorted in descending order
26
+ #
27
+ # ==== Block parameters
28
+ # fields:: One or more fields to sort in descending order (Array)
29
+ #
30
+ # ==== Examples
31
+ # Desc[:name, :age]
32
+ #
33
+ # ==== Returns
34
+ # An array of sort fields
35
+ #
36
+ def [](*fields)
37
+ fields.map{|x| org.apache.lucene.search.SortField.new(x.to_s, true)}
38
+ #org.apache.lucene.search.Sort.new(values.map{|x| org.apache.lucene.search.SortField.new(x.to_s, true)}.to_java(:'org.apache.lucene.search.SortField'))
39
+ end
40
+ end
41
+ end
42
+
43
+ #
44
+   # Reuses the same IndexSearcher instance for the same index.
45
+ # Reloads the index if the index has changed.
46
+ #
47
+ class IndexSearcher
48
+
49
+ @@paths = {}
50
+
51
+ def initialize(path)
52
+ @path = path
53
+ end
54
+
55
+ #
56
+ # Only create a new object if it does not already exist for this path
57
+ #
58
+ def self.new(path)
59
+ @@paths[path] = super(path) if @@paths[path].nil?
60
+ @@paths[path]
61
+ end
62
+
63
+ def find_dsl(field_infos,&block)
64
+ exp = QueryDSL.parse(&block)
65
+ query = exp.to_lucene(field_infos)
66
+
67
+ Hits.new(field_infos, index_searcher.search(query))
68
+ end
69
+
70
+
71
+ def find(field_info, query)
72
+ # are there any index for this node ?
73
+ # if not return an empty array
74
+ return [] unless exist?
75
+
76
+ #puts "QUERY #{query.inspect}" # '#{query.first.class.to_s}' value #{query.first}"
77
+ sort_by ||= query[1].delete(:sort_by) if query[1].kind_of?(Hash)
78
+ sort_by ||= query.delete(:sort_by)
79
+ #puts "QUERY sort #{sort_by}"
80
+ # TODO Refactoring ! too long and complex method
81
+ lucene_query = case query
82
+ when Array
83
+ sort_by ||= query.last.delete(:sort_by) if query.last.kind_of?(Hash)
84
+ parser = org.apache.lucene.queryParser.QueryParser.new(field_info.id_field.to_s, field_info.analyzer)
85
+ parser.parse(query.first)
86
+ when Hash
87
+ bquery = org.apache.lucene.search.BooleanQuery.new
88
+ query.each_pair do |key,value|
89
+ field = field_info[key]
90
+ q = field.convert_to_query(key, value)
91
+ bquery.add(q, org.apache.lucene.search.BooleanClause::Occur::MUST)
92
+ end
93
+ bquery
94
+ else
95
+ raise StandardError.new("Unknown type #{query.class.to_s} for find #{query}")
96
+ end
97
+
98
+ if sort_by.nil?
99
+ Hits.new(field_info, index_searcher.search(lucene_query))
100
+ else
101
+ sort = create_sort(sort_by)
102
+ Hits.new(field_info, index_searcher.search(lucene_query, sort))
103
+ end
104
+
105
+ end
106
+
107
+ def parse_field(field)
108
+ case field
109
+ when String,Symbol
110
+ [org.apache.lucene.search.SortField.new(field.to_s)]
111
+ when org.apache.lucene.search.SortField
112
+ [field]
113
+ when Array
114
+ raise StandardError.new("Unknown sort field '#{field}'") unless field.first.kind_of?(org.apache.lucene.search.SortField)
115
+ field
116
+ end
117
+ end
118
+
119
+
120
+ def create_sort(fields)
121
+ case fields
122
+ when String,Symbol
123
+ org.apache.lucene.search.Sort.new(fields.to_s)
124
+ when org.apache.lucene.search.SortField
125
+ org.apache.lucene.search.Sort.new(fields)
126
+ when Array
127
+ sorts = []
128
+ fields.each do |field|
129
+ sorts += parse_field(field)
130
+ end
131
+ org.apache.lucene.search.Sort.new(sorts.to_java(:'org.apache.lucene.search.SortField'))
132
+ else
133
+ StandardError.new("Unknown type #{fields.class.to_s}")
134
+ end
135
+ end
136
+
137
+ #
138
+ # Checks if it needs to reload the index searcher
139
+ #
140
+ def index_searcher
141
+ if @index_reader.nil? || @index_reader.getVersion() != org.apache.lucene.index.IndexReader.getCurrentVersion(@path)
142
+ @index_reader = org.apache.lucene.index.IndexReader.open(@path)
143
+ @index_searcher = org.apache.lucene.search.IndexSearcher.new(@index_reader)
144
+ $LUCENE_LOGGER.debug("Opened new IndexSearcher for #{to_s}")
145
+ end
146
+ @index_searcher
147
+ end
148
+
149
+ #
150
+ # Returns true if the index already exists.
151
+ #
152
+ def exist?
153
+ org.apache.lucene.index.IndexReader.index_exists(@path)
154
+ end
155
+
156
+ end
157
+ end