rsi 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
+ #
+ # Defines a serializer which uses zlib compression in an attempt to
+ # reduce on-disk index size.
+ #
+ require 'zlib'
+ require 'rsi/serializers'
+
+ module RSI
+
+   # Serializer which performs gzip/zlib compression on the output
+   # of another base serializer.
+   # By default, this sits on top of an RSI::NativeSerializer.
+   #
+   # (unconfirmed) This may be particularly sensitive to when the
+   # underlying stream passed to #dump and #load is closed.
+   class CompressedSerializer
+     attr_accessor :base
+
+     # By default, this serializer is based on NativeSerializer.
+     # If you prefer another underlying serializer, you may
+     # pass it as an argument to new().
+     def initialize( base_serializer=RSI::NativeSerializer.new() )
+       @base = base_serializer
+     end
+
+     def dump( obj, stream )
+       w = Zlib::GzipWriter.new( stream )
+       @base.dump( obj, w )
+       w.finish # write the gzip trailer; leaves the underlying stream open
+     end
+
+     def load( stream )
+       r = Zlib::GzipReader.new( stream )
+       return @base.load( r )
+     end
+   end
+
+   # Serializer which performs bzip2 (de)compression on the
+   # output of another base serializer.
+   # By default, this sits on a NativeSerializer.
+   # This requires BZ2: http://raa.ruby-lang.org/project/bz2/ .
+   class BZip2Serializer
+     attr_accessor :base
+     # Pass another Serializer as an argument if you don't want the default.
+     def initialize( base_serializer=NativeSerializer.new() )
+       unless defined? BZ2
+         raise "The BZ2 module is not loaded (it must be installed and `require`d before this module is used)"
+       end
+       @base = base_serializer
+     end
+     def dump( obj, stream )
+       w = BZ2::Writer.new( stream )
+       @base.dump( obj, w )
+     end
+     def load( stream )
+       r = BZ2::Reader.new( stream )
+       return @base.load( r )
+     end
+   end
+
+ end
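
For orientation, here is a minimal usage sketch of CompressedSerializer. The file name and sample object are illustrative assumptions, not part of the package; it presumes rsi/serializers has been loaded so RSI::NativeSerializer is available, and (per the class comment) that the underlying stream is closed only after dump/load returns:

    ser = RSI::CompressedSerializer.new()   # wraps an RSI::NativeSerializer by default
    File.open( "terms.dump", "w" ) {|f| ser.dump( { "ruby" => [1, 5, 9] }, f ) }
    terms = File.open( "terms.dump", "r" ) {|f| ser.load( f ) }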
@@ -0,0 +1,233 @@
+ #
+ # Dictionary maintenance for text indexes
+ #
+ require 'yaml'
+ require 'rsi/logmanager'
+ require 'rsi/serializers' # for NativeSerializer
+
+ module RSI
+
+   # An occurrence of a term in a document.
+   # [huh. freq and pos_list don't seem to add much to the dict size]
+   # freq is redundant. ==pos_list.length
+   class TermEntry
+     attr_accessor :docid, :freq, :pos_list
+     def initialize( docid )
+       @docid = docid
+       @freq = 0
+       @pos_list = []
+     end
+     def to_s
+       YAML.dump(self)
+     end
+   end
+
+   class Dictionary
+     include Loggable
+
+     attr_accessor :terms, :serializer
+     attr_reader :root
+
+     @@termgroup_loading = 50
+
+     META_FILE = "meta.yaml"
+     TERMS_FILE = "terms.list"
+     TERMGROUP_FILE = "termgroup.list"
+
+     def initialize( root )
+       @root = root
+       @serializer = NativeSerializer.new()
+       @terms_root = File.join( @root, "terms" )
+       @terms = {}           # term => id
+       @entries = {}         # termid => [TermEntry...]
+       @pending_entries = {} # termgroupid => [termid, ...] touched since last store
+       @termgroups = {}      # termid -> termgroupid
+       @meta = {
+         :next_termid => 0,
+         :next_termgroup_id => 0,
+         :next_termgroup_count => 0,
+       }
+     end
+
+     def open()
+       Dir.mkdir( @root ) unless FileTest.exists?( @root )
+       logger.info( "Opening dictionary" )
+       begin
+         self.reload()
+       rescue
+         logger.debug( $! )
+         logger.info( "DB does not exist (#{$!}), creating..." )
+         self.create_store()
+       end
+       @opened = true
+     end
+
+     # Serialize the current state of the dictionary.
+     # (Currently requires time proportional (at least) to the
+     # full size of the dictionary. This is a bug.)
+     def store()
+       logger.info( "Storing at #{@root}" )
+       # meta info, stored as yaml
+       @meta[ :serializer ] = @serializer
+       File.open( File.join( @root, META_FILE ), "w" ) do |meta_f|
+         YAML.dump( @meta, meta_f )
+       end
+
+       # store terms
+       term_fn = File.join( @root, TERMS_FILE )
+       logger.debug( "Storing terms to #{term_fn}" )
+       File.open( term_fn, "w" ) do |term_f|
+         logger.debug( "terms=#{@terms}" )
+         @serializer.dump( @terms, term_f )
+       end
+
+       File.open( File.join( @root, TERMGROUP_FILE ), "w" ) do |termgroups_f|
+         @serializer.dump( @termgroups, termgroups_f )
+       end
+
+       store_term_entries()
+     end
+
+     def has_term?( term )
+       return @terms.has_key?( term )
+     end
+
+     # Get the termid for the given (tokenized) term. If create is
+     # true and the given term has not previously been added to the
+     # dictionary, a new id will be created and returned.
+     def get_termid_for( term, create=false )
+       unless @terms.has_key?( term )
+         return nil unless create
+         t = self.next_termid()
+         @terms[term] = t
+       end
+       return @terms[term]
+     end
+
+     def add_term_entries( docid, termid, pos_list=[0] )
+       e = TermEntry.new( docid )
+       e.pos_list = pos_list
+       e.freq = pos_list.length()
+       add_entry( termid, e )
+     end
+
+     # Get a list of entries for the given termid.
+     # Creates the entry list, if it doesn't already exist.
+     # Returns a list of TermEntries
+     def get_entry_list( termid )
+       logger.debug( "[termid #{termid}]" )
+       unless @entries.has_key?( termid )
+         logger.debug( " No entry[#{termid}]" )
+         unless @termgroups.has_key?( termid )
+           logger.debug( " No termgroups[#{termid}]" )
+           @termgroups[ termid ] = next_termgroup_id()
+         end
+         id = @termgroups[ termid ]
+         logger.debug( " Termgroup id=#{id}" )
+         tg_fn = File.join( @terms_root, "#{id}.tg" )
+         logger.debug( " fn=#{tg_fn}" )
+         if FileTest.exists?( tg_fn )
+           logger.debug( " Reloading termgroup record #{tg_fn}" )
+           tg_f = File.open( tg_fn, "r" )
+           tg = @serializer.load( tg_f )
+           tg_f.close()
+           tg.each do |tid, term_entries|
+             @entries[tid] = term_entries
+           end
+         end
+         unless @entries.has_key?( termid )
+           logger.debug( " Creating termgroup record" )
+           @entries[termid] = []
+         end
+       end
+       logger.debug( "[returning #{@entries[termid]}]" )
+       return @entries[termid]
+     end
+
+     protected
+
+     # Create a new storage location.
+     def create_store()
+       logger.info( "Creating store at #{@root}" )
+       Dir.mkdir( @root ) unless FileTest.exists?( @root )
+       Dir.mkdir( @terms_root ) unless FileTest.exists?( @terms_root )
+     end
+
+     # Load the dictionary from storage.
+     def reload()
+       logger.info( "Reloading from #{@root}" )
+       # meta file is dumped/loaded as yaml, always
+       File.open( File.join( @root, META_FILE ), "r" ) do |meta_f|
+         @meta = YAML.load( meta_f )
+         @serializer = @meta[ :serializer ]
+         logger.debug( "Loaded meta from #{META_FILE}" )
+       end
+       term_fn = File.join( @root, TERMS_FILE )
+       File.open( term_fn, "r" ) do |term_f|
+         @terms = @serializer.load( term_f )
+         logger.debug( "Loaded terms from #{term_fn}" )
+       end
+       File.open( File.join( @root, TERMGROUP_FILE ), "r" ) do |termgroups_f|
+         @termgroups = @serializer.load( termgroups_f )
+         logger.debug( "Loaded termgroup map from #{TERMGROUP_FILE}" )
+       end
+       # entries are loaded lazily... use get_entry_list
+     end
+
+     # Return the next sequential document id.
+     def next_docid # not threadsafe
+       @meta[:next_docid] += 1
+       return @meta[:next_docid]
+     end
+
+     # Return the next sequential term id.
+     def next_termid
+       @meta[:next_termid] += 1
+       return @meta[:next_termid]
+     end
+
+     def next_termgroup_id
+       # totally not threadsafe
+       @meta[:next_termgroup_count] += 1
+       if @meta[:next_termgroup_count] > @@termgroup_loading
+         @meta[:next_termgroup_id] += 1
+         @meta[:next_termgroup_count] = 0
+       end
+       return @meta[:next_termgroup_id]
+     end
+
+     def store_term_entries()
+       logger.info( "Storing term entries" )
+       @pending_entries.each do |tg_id, termids|
+         tg_fn = File.join( @terms_root, "#{tg_id}.tg" )
+         tg = nil
+         if FileTest.exists?( tg_fn )
+           File.open( tg_fn, "r" ) {|tg_f| tg = @serializer.load( tg_f )}
+         else
+           tg = {}
+         end
+         termids.each do |termid|
+           tg[ termid ] = @entries[ termid ] # update to internal state
+         end
+         logger.debug( "Writing #{tg_fn}" )
+         File.open( tg_fn, "w" ) {|f| @serializer.dump( tg, f )}
+       end
+       @pending_entries = {} # clear pending set
+     end
+
+
+     # Add an entry for the given termid.
+     def add_entry( termid, entry )
+       # load entrylist and add entry to it, for internal state
+       get_entry_list( termid ) << entry
+       # track entry for later store()
+       tg_id = @termgroups[ termid ]
+       unless @pending_entries.has_key?( tg_id )
+         @pending_entries[ tg_id ] = []
+       end
+       @pending_entries[ tg_id ] << termid
+     end
+
+   end
+
+ end
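
To make the storage flow concrete, a short hedged sketch of the Dictionary API above (the root path, docid, and positions are illustrative; it assumes the logging configured by rsi/logmanager and the serializers from rsi/serializers are available):

    dict = RSI::Dictionary.new( "index/contents" )   # hypothetical field root directory
    dict.open()
    tid = dict.get_termid_for( "ruby", true )        # create a termid on first sight
    dict.add_term_entries( 42, tid, [0, 7, 31] )     # docid 42, term at positions 0, 7 and 31
    dict.store()                                     # writes meta.yaml, terms.list, termgroup.list and *.tg files
    dict.get_entry_list( tid ).each {|e| puts "#{e.docid}: #{e.pos_list.inspect}" }

Note that store() re-serializes the full term map on every call, which matches the class comment that storage time grows with dictionary size.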
@@ -0,0 +1,245 @@
+
+
+ require 'rsi/porter'
+ require 'rsi/logmanager'
+
+ # mixin Stemmable.stem (from porter.rb) into String
+ class String
+   include Stemmable
+ end
+
+ #
+ # Classes for building and querying indexes.
+ #
+ module RSI
+
+   class IndexException < RuntimeError; end
+
+   # Document index. Interface for adding documents to index, and
+   # for querying an index.
+   class Indexer
+     include RSI::Loggable
+
+     # Root directory of the index store.
+     attr_reader :root
+     # Analyzer to use for document and query tokenization.
+     attr_accessor :analyzer, :query_analyzer, :serializer, :dicts
+
+     META_FILE = "meta.yaml"
+     DOCS_FILE = "docs.list"
+
+     def initialize( root )
+       @root = root
+       @docs = {}
+       @meta = { :next_docid => 0 }
+       @serializer = RSI::NativeSerializer.new()
+       @analyzer = RSI::DefaultTextAnalyzer.new()
+       @query_analyzer = RSI::DefaultTextAnalyzer.new()
+       @dicts = {}
+       @opened = false
+     end
+
+     def open()
+       Dir.mkdir( @root ) unless FileTest.exists?( @root )
+       log_fh = File.open( File.join( @root, "index.log" ),
+                           File::WRONLY|File::APPEND|File::CREAT )
+       log_fh.sync = true
+       logger.info( "Trying to reload index..." )
+       begin
+         reload()
+       rescue
+         logger.info( "Reload failed (#{$!}), creating new index" )
+         # nothing to do
+       end
+       # Query the analyzer, getting the fields it tokenizes.
+       # Initialize and open a dictionary for each field.
+       logger.info( "Assigning dictionaries..." )
+       @analyzer.get_field_types().each do |field, type|
+         field_root = File.join( @root, field )
+         klass = map_field_type( type )
+         logger.debug( "Field: #{field} at #{field_root} is #{klass}" )
+         @dicts[field] = klass.new( field_root )
+         @dicts[field].serializer = @serializer
+       end
+       logger.info( "Opening dictionaries" )
+       @dicts.each do |name, dict|
+         logger.debug( "Dictionary: #{name}" )
+         dict.open()
+       end
+       @opened = true
+     end
+
+     # Gets the dictionary class for the given field type
+     def map_field_type( type )
+       case type
+       when RSI::FIELD_TYPE_TEXT
+         return RSI::Dictionary
+       when RSI::FIELD_TYPE_DATE
+         raise "implement me! XXX"
+       end
+     end
+
+     # Add a document to the index.
+     def add_document( doc_uri, content )
+       open() unless @opened
+       logger.info("Adding document #{doc_uri}")
+       if @docs.has_value?( doc_uri )
+         raise IndexException, "Cannot do updates yet"
+       else
+         docid = next_docid()
+         @docs[ docid ] = doc_uri
+         pos = 0
+         term_entries = {}
+         logger.debug("Tokenizing")
+         @analyzer.tokenize( content ).each do |field, termlist|
+           termlist.each do |term|
+             termid = @dicts[field].get_termid_for(term, true)
+             raise "POO" if termid==nil
+             unless term_entries.has_key?( termid )
+               term_entries[termid] = []
+             end
+             term_entries[termid] << pos
+             pos += 1
+           end
+           logger.debug("Adding term entries to #{field}")
+           term_entries.each do |termid, pos_list|
+             @dicts[field].add_term_entries(docid, termid, term_entries[termid])
+           end
+         end
+       end
+     end
+
+     # Remove a document from the index (slow!).
+     def delete_document( doc_uri )
+       open() unless @opened
+       raise "This is too hard for me, yet"
+     end
+
+     # Stop adding documents to the index, and serialize to storage.
+     def flush()
+       open() unless @opened
+       logger.info("Finishing")
+       store_metadata()
+       store_doclist()
+       @dicts.each do |field, dict|
+         dict.store()
+       end
+     end
+
+     # Return a list of document ids which contain any of the given
+     # search terms (OR query). The terms will be tokenized by the
+     # current Analyzer.
+     def find_any( terms_str )
+       open() unless @opened
+       raise "unimplemented"
+     end
+
+     def get_dict_for_field( field )
+       return @dicts[field]
+     end
+
+     # Return a list of document URIs which contain all of the given
+     # search terms (AND query). The terms will be tokenized by the
+     # current Analyzer.
+     #
+     def find_all( terms_str )
+       q = @query_analyzer.tokenize_query( terms_str )
+       logger.debug( "Query=#{q.to_s}" )
+       docids = q.evaluate( self )
+       docids.uniq!
+       return docids.collect {|id| @docs[id]}
+     end
+
+     def OLD_find_all( terms_str )
+       open() unless @opened
+
+       # this querying logic is too fragile
+       logger.info { "Query: #{terms_str}" }
+       t_set = @query_analyzer.tokenize_query( terms_str )
+       logger.debug { "Tokenized: #{t_set}" }
+       # build map of docid => term-match-count
+       finds = {}
+       t_set.each do |field, term_list|
+         term_list.each do |term|
+           logger.debug { "field='#{field}', term='#{term}'" }
+           # lookup termid in dict for field
+           unless @dicts[field].has_term?( term )
+             logger.info { "No term #{term} in dictionary #{field}" }
+             next
+           end
+           termid = @dicts[field].get_termid_for( term )
+           logger.debug { "termid=#{termid}" }
+           # get list of entries for termid
+           e_list = @dicts[field].get_entry_list( termid )
+           # get list of docids
+           e_list.each do |e|
+             logger.debug { " docid=#{e.docid}" }
+             finds[ e.docid ] = finds[ e.docid ].to_i + 1
+           end
+         end
+       end
+       total_terms = 0
+       t_set.each_value {|vl| total_terms += vl.size() }
+       logger.debug { "Total terms: #{total_terms}" }
+       # foreach docid in map: match if term-match-count == terms-count
+       d_return = []
+       finds.each do |docid, count|
+         if count == total_terms
+           # return docid
+           uri = @docs[ docid ]
+           d_return << uri
+         end
+       end
+       return d_return
+     end
+
+     protected
+
+     # needs synchro
+     def next_docid()
+       @meta[ :next_docid ] += 1
+       return @meta[ :next_docid ]
+     end
+
+     def reload()
+       logger.info("Reloading from #{@root}")
+       load_metadata()
+       load_doclist()
+     end
+
+     def store_metadata()
+       @meta[ :serializer ] = @serializer
+       @meta[ :analyzer ] = @analyzer.class.name
+       @meta[ :query_analyzer ] = @query_analyzer.class.name
+       fn = File.join( @root, META_FILE )
+       logger.info( "Storing metadata to #{fn}" )
+       File.open( fn, "w" ) {|f| YAML.dump( @meta, f ) }
+     end
+
+     def load_metadata()
+       fn = File.join( @root, META_FILE )
+       logger.info( "Loading metadata from #{fn}" )
+       File.open( fn, "r" ) {|f| @meta = YAML.load( f ) }
+       @serializer = @meta[ :serializer ]
+       @analyzer = eval "#{@meta[ :analyzer ]}.new()"
+       @query_analyzer = eval "#{@meta[ :query_analyzer ]}.new()"
+     end
+
+     def store_doclist()
+       fn = File.join( @root, DOCS_FILE )
+       logger.info( "Storing doc list to #{fn}" )
+       File.open( fn, "w" ) {|f| @serializer.dump( @docs, f ) }
+     end
+
+     def load_doclist()
+       fn = File.join( @root, DOCS_FILE )
+       logger.info( "Loading doc list from #{fn}" )
+       File.open( fn, "r" ) {|f| @docs = @serializer.load( f ) }
+     end
+
+
+
+   end
+
+ end
+
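
Finally, a hedged end-to-end sketch of the Indexer defined above (the root directory, document URIs, and content are illustrative; it assumes RSI::DefaultTextAnalyzer and the query-evaluation classes used by find_all, which are not shown in this diff, have been loaded):

    idx = RSI::Indexer.new( "./index" )
    idx.add_document( "file:doc1.txt", "the quick brown fox" )
    idx.add_document( "file:doc2.txt", "the lazy dog" )
    idx.flush()                          # store metadata, doc list and per-field dictionaries
    hits = idx.find_all( "quick fox" )   # => URIs of documents containing all query terms

Reopening an Indexer on the same root later reloads meta.yaml and docs.list and reopens the per-field dictionaries before any further documents or queries are processed.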