ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
data/lib/ferret/index/fields_io.rb
@@ -0,0 +1,175 @@
+ require 'zlib'
+
+ module Ferret::Index
+
+   # Class responsible for access to stored document fields.
+   #
+   # It uses the <segment>.fdt and <segment>.fdx files.
+   class FieldsReader
+     include Ferret::Document
+     attr_reader :size
+     alias :length :size
+
+     def initialize(d, segment, fi)
+       @field_infos = fi
+
+       @fields_stream = d.open_input(segment + ".fdt")
+       @index_stream = d.open_input(segment + ".fdx")
+
+       @size = (@index_stream.length() / 8).to_i
+     end
+
+     def close()
+       @fields_stream.close()
+       @index_stream.close()
+     end
+
+     def doc(n)
+       @index_stream.seek(n * 8)
+       position = @index_stream.read_long()
+       @fields_stream.seek(position)
+
+       doc = Document.new
+       @fields_stream.read_vint().times do
+         field_number = @fields_stream.read_vint()
+         fi = @field_infos[field_number]
+
+         bits = @fields_stream.read_byte()
+
+         compressed = (bits & FieldsWriter::FIELD_IS_COMPRESSED) != 0
+         tokenize = (bits & FieldsWriter::FIELD_IS_TOKENIZED) != 0
+         binary = (bits & FieldsWriter::FIELD_IS_BINARY) != 0
+
+         if binary
+           b = " " * @fields_stream.read_vint()
+           @fields_stream.read_bytes(b, 0, b.length)
+           if compressed
+             doc << Field.new_binary_field(fi.name,
+                                           uncompress(b),
+                                           Field::Store::COMPRESS)
+           else # no compression
+             doc << Field.new_binary_field(fi.name, b, Field::Store::YES)
+           end
+         else
+           store = Field::Store::YES
+           if fi.indexed? and tokenize
+             index = Field::Index::TOKENIZED
+           elsif fi.indexed? and not tokenize
+             index = Field::Index::UNTOKENIZED
+           else
+             index = Field::Index::NO
+           end
+           data = nil
+           if compressed
+             store = Field::Store::COMPRESS
+             b = " " * @fields_stream.read_vint()
+             @fields_stream.read_bytes(b, 0, b.length)
+             data = uncompress(b)
+           else
+             data = @fields_stream.read_string()
+           end
+           stv = Field::TermVector::NO
+           if fi.store_term_vector?
+             if fi.store_positions? and fi.store_offsets?
+               stv = Field::TermVector::WITH_POSITIONS_OFFSETS
+             elsif fi.store_positions?
+               stv = Field::TermVector::WITH_POSITIONS
+             elsif fi.store_offsets?
+               stv = Field::TermVector::WITH_OFFSETS
+             else
+               stv = Field::TermVector::YES
+             end
+           end
+           doc << Field.new(fi.name, data, store, index, stv)
+         end
+       end
+
+       return doc
+     end
+
+     def uncompress(input)
+       zstream = Zlib::Inflate.new
+       buf = zstream.inflate(input)
+       zstream.finish
+       zstream.close
+       buf
+     end
+   end
+
+   class FieldsWriter
+
+     FIELD_IS_TOKENIZED = 0x1
+     FIELD_IS_BINARY = 0x2
+     FIELD_IS_COMPRESSED = 0x4
+
+     def initialize(dir, segment, fi)
+       @field_infos = fi
+       @fields_stream = dir.create_output(segment + ".fdt")
+       @index_stream = dir.create_output(segment + ".fdx")
+     end
+
+     def close()
+       @fields_stream.close()
+       @index_stream.close()
+     end
+
+     def add_document(doc)
+       @index_stream.write_long(@fields_stream.pos)
+       stored_count = 0
+       doc.all_fields.each() { |field| stored_count += 1 if field.stored?() }
+       @fields_stream.write_vint(stored_count)
+
+       doc.all_fields.each() do |field|
+         if field.stored?()
+           @fields_stream.write_vint(@field_infos.field_number(field.name))
+
+           bits = 0
+           bits |= FIELD_IS_TOKENIZED if field.tokenized?
+           bits |= FIELD_IS_BINARY if field.binary?
+           bits |= FIELD_IS_COMPRESSED if field.compressed?
+           @fields_stream.write_byte(bits)
+
+           data = nil
+           if field.compressed?
+             if field.binary?
+               data = compress(field.binary_value)
+             else
+               data = compress(field.string_value)
+             end
+             save_data(data)
+           else
+             if field.binary?
+               save_data(field.binary_value)
+             else
+               @fields_stream.write_string(field.string_value)
+             end
+           end
+         end
+       end
+     end
+     alias :<< :add_document
+
+     private
+
+     def compress(input)
+       zstream = Zlib::Deflate.new(Zlib::BEST_COMPRESSION)
+       buf = zstream.deflate(input, Zlib::FINISH)
+       zstream.close
+       return buf
+     end
+
+     def save_data(data)
+       len = data.length
+       if data.is_a? Array
+         data = data.pack("C*")
+       end
+
+       @fields_stream.write_vint(len)
+       @fields_stream.write_bytes(data, len)
+     end
+   end
+ end
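
For orientation, here is a minimal round-trip sketch of the FieldsWriter and FieldsReader classes above. The RAMDirectory, FieldInfos, Document and Field classes come from other files in this package; their exact constructor signatures here are assumptions rather than facts shown in this diff, and the segment name "_1" and the FieldInfos#add call are hypothetical.

  require 'ferret'
  include Ferret::Index, Ferret::Document, Ferret::Store

  dir = RAMDirectory.new           # in-memory directory (assumed API)
  fis = FieldInfos.new             # field name/number registry (assumed API)
  fis.add("title", true, true)     # hypothetical field registration

  doc = Document.new
  doc << Field.new("title", "hello world",
                   Field::Store::YES, Field::Index::TOKENIZED)

  writer = FieldsWriter.new(dir, "_1", fis)  # creates _1.fdt and _1.fdx
  writer << doc                              # alias for add_document
  writer.close()

  # Every document costs exactly 8 bytes in the .fdx file (one long),
  # which is why FieldsReader computes size as index_stream.length / 8.
  reader = FieldsReader.new(dir, "_1", fis)
  puts reader.size                 # => 1
  stored = reader.doc(0)           # seeks .fdx for the position, reads .fdt
  reader.close()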
data/lib/ferret/index/index.rb
@@ -0,0 +1,228 @@
+ module Ferret::Index
+   class Index
+     include Ferret::Store
+     include Ferret::Search
+     include Ferret::Document
+
+     def initialize(options = {})
+       if options[:path]
+         @dir = FSDirectory.new(options[:path], true)
+         options[:close_dir] = true
+       elsif options[:dir]
+         @dir = options[:dir]
+       else
+         options[:create] = true # this should always be true for a new RAMDir
+         @dir = RAMDirectory.new
+       end
+
+       @options = options
+       @writer = IndexWriter.new(@dir, options)
+       options[:analyzer] = @analyzer = @writer.analyzer
+       @has_writes = false
+       @reader = nil
+       @options.delete(:create) # only want to create the first time, if at all
+       @close_dir = @options.delete(:close_dir) || false # we'll hold this here
+       @default_field = @options[:default_field] || ""
+       @open = true
+     end
+
+     def close
+       if not @open
+         raise "tried to close an already closed directory"
+       end
+       @reader.close() if @reader
+       @writer.close() if @writer
+       @dir.close()
+
+       @open = false
+     end
+
+     # Get the reader for this index.
+     # NOTE:: This will close the writer from this index.
+     def reader
+       ensure_reader_open()
+       return @reader
+     end
+
+     # Get the searcher for this index.
+     # NOTE:: This will close the writer from this index.
+     def searcher
+       ensure_searcher_open()
+       return @searcher
+     end
+
+     # Get the writer for this index.
+     # NOTE:: This will close the reader from this index.
+     def writer
+       ensure_writer_open()
+       return @writer
+     end
+
+     # Adds a document to this index, using the provided analyzer instead of
+     # the local analyzer if provided. If the document contains more than
+     # IndexWriter::MAX_FIELD_LENGTH terms for a given field, the remainder
+     # are discarded.
+     def add_document(doc, analyzer = nil)
+       ensure_writer_open()
+       fdoc = nil
+       if doc.is_a?(String)
+         fdoc = Document.new
+         fdoc << Field.new(@default_field, doc,
+                           Field::Store::YES, Field::Index::TOKENIZED)
+       elsif doc.is_a?(Array)
+         fdoc = Document.new
+         doc.each() do |field|
+           fdoc << Field.new(@default_field, field,
+                             Field::Store::YES, Field::Index::TOKENIZED)
+         end
+       elsif doc.is_a?(Hash)
+         fdoc = Document.new
+         doc.each_pair() do |field, text|
+           fdoc << Field.new(field.to_s, text.to_s,
+                             Field::Store::YES, Field::Index::TOKENIZED)
+         end
+       elsif doc.is_a?(Document)
+         fdoc = doc
+       else
+         raise ArgumentError, "Unknown document type #{doc.class}"
+       end
+       @has_writes = true
+
+       @writer.add_document(fdoc, analyzer || @writer.analyzer)
+     end
+     alias :<< :add_document
+
+     # The main search method for the index. You need to create a query to
+     # pass to this method. You can also pass a hash with one or more of the
+     # following: {filter, num_docs, first_doc, sort}
+     #
+     # query::     the query to run on the index
+     # filter::    filters docs from the search result
+     # first_doc:: the index in the results of the first doc retrieved.
+     #             Default is 0
+     # num_docs::  the number of results returned. Default is 10
+     # sort::      an array of SortFields describing how to sort the results.
+     def search(query, options = {})
+       if query.is_a?(String)
+         if @qp.nil?
+           @qp = Ferret::QueryParser.new(@default_field, options)
+         end
+         query = @qp.parse(query)
+       end
+
+       ensure_searcher_open()
+       return @searcher.search(query, options)
+     end
+
+     # See Index#search
+     #
+     # This method yields the doc and score for each hit.
+     # eg.
+     #   index.search_each() do |doc, score|
+     #     puts "hit document number #{doc} with a score of #{score}"
+     #   end
+     #
+     def search_each(query, options = {}) # :yield: doc, score
+       search(query, options).score_docs.each do |score_doc|
+         yield score_doc.doc, score_doc.score
+       end
+     end
+
+     # Retrieve the document referenced by the document number +id+ if +id+
+     # is an integer, or the first document with the term +id+ if +id+ is a
+     # term.
+     #
+     # id:: the number of the document to retrieve, or the term used as the
+     #      id for the document we wish to retrieve
+     def doc(id)
+       ensure_reader_open()
+       if id.is_a?(String)
+         t = Term.new("id", id.to_s)
+         return @reader.get_document_with_term(t)
+       elsif id.is_a?(Term)
+         return @reader.get_document_with_term(id)
+       else
+         return @reader.get_document(id)
+       end
+     end
+     alias :[] :doc
+
+     # Delete the document referenced by the document number +id+ if +id+ is
+     # an integer, or all of the documents which have the term +id+ if +id+
+     # is a term.
+     #
+     # id:: the number of the document to delete
+     def delete(id)
+       ensure_reader_open()
+       if id.is_a?(String)
+         t = Term.new("id", id.to_s)
+         return @reader.delete_docs_with_term(t)
+       elsif id.is_a?(Term)
+         return @reader.delete_docs_with_term(id)
+       else
+         return @reader.delete(id)
+       end
+     end
+
+     # Returns true if document +n+ has been deleted
+     def deleted?(n)
+       ensure_reader_open()
+       return @reader.deleted?(n)
+     end
+
+     # Returns true if any documents have been deleted since the index was
+     # last flushed.
+     def has_deletions?()
+       ensure_reader_open()
+       return @reader.has_deletions?
+     end
+
+     # Returns true if any documents have been added to the index since the
+     # last flush.
+     def has_writes?()
+       return @has_writes
+     end
+
+     # Optimizes the index. This should only be called when the index will
+     # no longer be updated very often, but will be read a lot.
+     def optimize()
+       ensure_writer_open()
+       @writer.optimize()
+       @modified = true
+     end
+
+     # Returns the number of documents in the index
+     def size()
+       ensure_reader_open()
+       return @reader.num_docs()
+     end
+
+     protected
+
+     def ensure_writer_open()
+       raise "tried to use a closed index" if not @open
+       return if @writer
+       if @reader
+         @reader.close
+         @reader = nil
+         @searcher = nil
+       end
+       @writer = IndexWriter.new(@dir, @options)
+     end
+
+     def ensure_reader_open()
+       raise "tried to use a closed index" if not @open
+       return if @reader
+       if @writer
+         @writer.close
+         @writer = nil
+       end
+       @reader = IndexReader.open(@dir, false)
+     end
+
+     def ensure_searcher_open()
+       raise "tried to use a closed index" if not @open
+       return if @searcher
+       ensure_reader_open()
+       @searcher = IndexSearcher.new(@reader)
+     end
+   end
+ end
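
The Index class above is the package's high-level facade: add_document normalizes Strings, Arrays and Hashes into Documents, a String query is compiled through Ferret::QueryParser, and the reader, writer and searcher are swapped in lazily by the ensure_*_open helpers. A short usage sketch based only on the methods shown above (the field names and text are illustrative):

  require 'ferret'

  # with no :path or :dir option a RAMDirectory is created
  index = Ferret::Index::Index.new(:default_field => "content")

  index << {:title => "Programming Ruby", :content => "matz wrote ruby"}
  index << "a bare string is stored in the default field"

  index.search_each("matz") do |doc, score|
    puts "hit document number #{doc} with a score of #{score}"
  end

  puts index.size   # document count, delegated to reader.num_docs
  index.close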
data/lib/ferret/index/index_file_names.rb
@@ -0,0 +1,33 @@
+ module Ferret
+   module Index
+     # Useful constants representing filenames and extensions used by Lucene
+     class IndexFileNames
+
+       # Name of the index segments file
+       SEGMENTS = "segments"
+
+       # Name of the index deletable file
+       DELETABLE = "deletable"
+
+       # This array contains all filename extensions used by Lucene's index
+       # files, with one exception: the extension made up from +.f+ plus a
+       # number. Also note that two of Lucene's files (+deletable+ and
+       # +segments+) don't have any filename extension.
+       INDEX_EXTENSIONS = [
+         "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
+         "tvx", "tvd", "tvf", "tvp"
+       ]
+
+       # File extensions of old-style index files
+       COMPOUND_EXTENSIONS = [
+         "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
+       ]
+
+       # File extensions for term vector support
+       VECTOR_EXTENSIONS = [
+         "tvx", "tvd", "tvf"
+       ]
+
+     end
+   end
+ end
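
Because these constants are plain Ruby arrays, the expected on-disk names for a segment can be derived from them directly; a small example (the segment name "_2" is illustrative):

  segment = "_2"
  files = Ferret::Index::IndexFileNames::COMPOUND_EXTENSIONS.map do |ext|
    "#{segment}.#{ext}"
  end
  # => ["_2.fnm", "_2.frq", "_2.prx", "_2.fdx", "_2.fdt", "_2.tii", "_2.tis"]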
data/lib/ferret/index/index_reader.rb
@@ -0,0 +1,462 @@
+ require 'monitor'
+
+ module Ferret::Index
+   # IndexReader is an abstract class providing an interface for accessing
+   # an index. Search of an index is done entirely through this abstract
+   # interface, so that any class which implements it is searchable.
+   #
+   # Concrete subclasses of IndexReader are usually constructed with a call
+   # to one of the static <tt>open()</tt> methods, e.g. <tt>#open</tt>.
+   #
+   # For efficiency, in this API documents are often referred to via
+   # _document numbers_, non-negative integers which each name a unique
+   # document in the index. These document numbers are ephemeral, i.e. they
+   # may change as documents are added to and deleted from an index. Clients
+   # should thus not rely on a given document having the same number between
+   # sessions.
+   #
+   # An IndexReader can be opened on a directory for which an IndexWriter is
+   # opened already, but it cannot then be used to delete documents from the
+   # index.
+   class IndexReader
+     include MonitorMixin
+
+     # This array contains all filename extensions used by Lucene's index
+     # files, with one exception: the extension made up from +.f+ plus a
+     # number. Also note that two of Lucene's files (+deletable+ and
+     # +segments+) don't have any filename extension.
+     FILENAME_EXTENSIONS = ["cfs",
+                            "fnm",
+                            "fdx",
+                            "fdt",
+                            "tii",
+                            "tis",
+                            "frq",
+                            "prx",
+                            "del",
+                            "tvx",
+                            "tvd",
+                            "tvf",
+                            "tvp"]
+
+     attr_reader :directory
+
+     class FieldOption < Ferret::Utils::Parameter
+       # all fields
+       ALL = FieldOption.new("ALL")
+       # all indexed fields
+       INDEXED = FieldOption.new("INDEXED")
+       # all fields which are not indexed
+       UNINDEXED = FieldOption.new("UNINDEXED")
+       # all fields which are indexed with term vectors enabled
+       INDEXED_WITH_TERM_VECTOR = FieldOption.new("INDEXED_WITH_TERM_VECTOR")
+       # all fields which are indexed but don't have term vectors enabled
+       INDEXED_NO_TERM_VECTOR = FieldOption.new("INDEXED_NO_TERM_VECTOR")
+       # all fields where term vectors are enabled. Please note that only
+       # standard term vector fields are returned
+       TERM_VECTOR = FieldOption.new("TERM_VECTOR")
+       # all fields with term vectors with positions enabled
+       TERM_VECTOR_WITH_POSITION = FieldOption.new("TERM_VECTOR_WITH_POSITION")
+       # all fields where term vectors with offsets are set
+       TERM_VECTOR_WITH_OFFSET = FieldOption.new("TERM_VECTOR_WITH_OFFSET")
+       # all fields where term vectors with both offset and position values
+       # are set
+       TERM_VECTOR_WITH_POSITION_OFFSET =
+         FieldOption.new("TERM_VECTOR_WITH_POSITION_OFFSET")
+     end
+
+     # directory::       Directory where IndexReader files reside.
+     # segment_infos::   Used for write-l
+     # close_directory:: close the directory when the index reader is closed
+     def initialize(directory, segment_infos = nil,
+                    close_directory = false, directory_owner = false)
+       super()
+       @directory = directory
+       @close_directory = close_directory
+       @segment_infos = segment_infos
+       @directory_owner = directory_owner
+
+       @has_changes = false
+       @stale = false
+       @write_lock = nil
+
+       #ObjectSpace.define_finalizer(self, lambda { |id| @write_lock.release() if @write_lock })
+     end
+
+     # Returns an index reader to read the index in the directory
+     def IndexReader.open(directory, close_directory = true, infos = nil)
+       directory.synchronize do # in- & inter-process sync
+         commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
+         commit_lock.while_locked() do
+           if infos.nil?
+             infos = SegmentInfos.new()
+             infos.read(directory)
+           end
+           if (infos.size() == 1) # index is optimized
+             return SegmentReader.get(infos[0], infos, close_directory)
+           end
+           readers = Array.new(infos.size)
+           infos.size.times do |i|
+             readers[i] = SegmentReader.get(infos[i])
+           end
+           return MultiReader.new(readers, directory, infos, close_directory)
+         end
+       end
+     end
+
+     # Reads the version number from the segments file. The version number
+     # counts the number of changes of the index.
+     #
+     # directory:: where the index resides.
+     # returns::   version number.
+     # raises::    IOError if the segments file cannot be read.
+     def IndexReader.get_current_version(directory)
+       return SegmentInfos.read_current_version(directory)
+     end
+
+     # Return an array of term vectors for the specified document. The array
+     # contains a vector for each vectorized field in the document. Each
+     # vector contains terms and frequencies for all terms in a given
+     # vectorized field. If no such fields existed, the method returns nil.
+     # The term vectors that are returned may either be of type
+     # TermFreqVector or of type TermDocPosEnumVector if positions or
+     # offsets have been stored.
+     #
+     # doc_number:: document for which term vectors are returned
+     # returns::    array of term vectors. May be nil if no term vectors
+     #              have been stored for the specified document.
+     # raises::     IOError if the index cannot be accessed
+     #
+     # See Field::TermVector
+     def get_term_vectors(doc_number)
+       raise NotImplementedError
+     end
+
+     # Return a term vector for the specified document and field. The
+     # returned vector contains terms and frequencies for the terms in the
+     # specified field of this document, if the field had the
+     # store_term_vector flag set. If term vectors had been stored with
+     # positions or offsets, a TermDocPosEnumVector is returned.
+     #
+     # doc_number:: document for which the term vector is returned
+     # field::      field for which the term vector is returned.
+     # returns::    term vector. May be nil if the field does not exist in
+     #              the specified document or the term vector was not stored.
+     # raises::     IOError if the index cannot be accessed
+     # See Field::TermVector
+     def get_term_vector(doc_number, field)
+       raise NotImplementedError
+     end
+
+     # Returns +true+ if an index exists at the specified directory and
+     # +false+ if the directory does not exist or there is no index in it.
+     #
+     # directory:: the directory to check for an index
+     # returns::   +true+ if an index exists; +false+ otherwise
+     # raises::    IOError if there is a problem with accessing the index
+     def IndexReader.index_exists?(directory)
+       return directory.exists?("segments")
+     end
+
+     # Returns the number of documents in this index.
+     def num_docs()
+       raise NotImplementedError
+     end
+
+     # Returns one greater than the largest possible document number.
+     #
+     # This may be used to, e.g., determine how big to allocate an array
+     # which will have an element for every document number in an index.
+     def max_doc()
+       raise NotImplementedError
+     end
+
+     # Returns the stored fields of the +n+<sup>th</sup> +Document+ in this
+     # index.
+     def get_document(n)
+       raise NotImplementedError
+     end
+
+     # Returns the first document with the term +term+. This is useful, for
+     # example, if we are indexing rows from a database. We can store the id
+     # of each row in a field in the index and use this method to get the
+     # document by its id. Hence, only one document is returned.
+     #
+     # term:: The term we are searching for.
+     def get_document_with_term(term)
+       docs = term_docs_for(term)
+       return nil if docs.nil?
+       document = nil
+       begin
+         document = get_document(docs.doc) if docs.next?
+       ensure
+         docs.close()
+       end
+       return document
+     end
+
+     # Returns true if document _n_ has been deleted
+     def deleted?(n)
+       raise NotImplementedError
+     end
+
+     # Returns true if any documents have been deleted
+     def has_deletions?()
+       raise NotImplementedError
+     end
+
+     # Returns the byte-encoded normalization factor for the named field of
+     # every document. This is used by the search code to score documents.
+     #
+     # See Field#boost
+     def get_norms(field, bytes = nil, offset = nil)
+       raise NotImplementedError
+     end
+
+     # Expert: Resets the normalization factor for the named field of the
+     # named document. The norm represents the product of the field's
+     # Field#boost and its Similarity#length_norm length normalization.
+     # Thus, to preserve the length normalization values when resetting
+     # this, one should base the new value upon the old.
+     #
+     # See #get_norms
+     # See Similarity#decode_norm
+     def set_norm(doc, field, value)
+       synchronize do
+         value = Similarity.encode_norm(value) if value.is_a? Float
+         if @directory_owner
+           acquire_write_lock()
+         end
+         do_set_norm(doc, field, value)
+         @has_changes = true
+       end
+     end
+
+     # Implements set_norm in subclass.
+     def do_set_norm(doc, field, value)
+       raise NotImplementedError
+     end
+
+     # Returns an enumeration of all the terms in the index.
+     # Each term is greater than all that precede it in the enumeration.
+     def terms()
+       raise NotImplementedError
+     end
+
+     # Returns an enumeration of all terms after a given term.
+     #
+     # Each term is greater than all that precede it in the enumeration.
+     def terms_from(t)
+       raise NotImplementedError
+     end
+
+     # Returns the number of documents containing the term +t+.
+     def doc_freq(t)
+       raise NotImplementedError
+     end
+
+     # Returns an enumeration of all the documents which contain +term+.
+     # For each document, the document number and the frequency of the term
+     # in that document are also provided, for use in search scoring. Thus,
+     # this method implements the mapping:
+     #
+     #   Term => <doc_num, freq><sup>*</sup>
+     #
+     # The enumeration is ordered by document number. Each document number
+     # is greater than all that precede it in the enumeration.
+     def term_docs_for(term)
+       term_docs = term_docs()
+       term_docs.seek(term)
+       return term_docs
+     end
+
+     # Returns an unpositioned TermDocEnum enumerator.
+     def term_docs()
+       raise NotImplementedError
+     end
+
+     # Returns an enumeration of all the documents which contain +term+.
+     # For each document, in addition to the document number and frequency
+     # of the term in that document, a list of all of the ordinal positions
+     # of the term in the document is available. Thus, this method
+     # implements the mapping:
+     #
+     #   Term => <doc_num, freq, <pos<sub>1</sub>, pos<sub>2</sub>, ...
+     #           pos<sub>freq-1</sub>>><sup>*</sup>
+     #
+     # This positional information facilitates phrase and proximity
+     # searching. The enumeration is ordered by document number. Each
+     # document number is greater than all that precede it in the
+     # enumeration.
+     def term_positions_for(term)
+       term_positions = term_positions()
+       term_positions.seek(term)
+       return term_positions
+     end
+
+     # Returns an unpositioned TermDocPosEnum enumerator.
+     def term_positions()
+       raise NotImplementedError
+     end
+
+     # Tries to acquire the WriteLock on this directory.
+     #
+     # This method is only valid if this IndexReader is the directory owner.
+     #
+     # raises:: IOError if the WriteLock cannot be acquired.
+     def acquire_write_lock()
+       if @stale
+         raise IOError, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations"
+       end
+
+       if @write_lock.nil?
+         @write_lock = @directory.make_lock(IndexWriter::WRITE_LOCK_NAME)
+         if not @write_lock.obtain(IndexWriter::WRITE_LOCK_TIMEOUT) # obtain write lock
+           raise IOError, "Index locked for write: #{@write_lock}"
+         end
+
+         # we have to check whether the index has changed since this reader
+         # was opened. If so, this reader is no longer valid for deletion.
+         if SegmentInfos.read_current_version(@directory) > @segment_infos.version()
+           @stale = true
+           @write_lock.release()
+           @write_lock = nil
+           raise IOError, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations"
+         end
+       end
+     end
+
+     # Deletes the document numbered +doc_num+. Once a document is deleted
+     # it will not appear in TermDocEnum or TermPositions enumerations.
+     # Attempts to read its fields with the #get_document method will result
+     # in an error. The presence of this document may still be reflected in
+     # the #doc_freq statistic, though this will be corrected eventually as
+     # the index is further modified.
+     def delete(doc_num)
+       synchronize do
+         acquire_write_lock() if @directory_owner
+         do_delete(doc_num)
+         @has_changes = true
+       end
+       return 1
+     end
+
+     # Implements deletion of the document numbered +doc_num+.
+     # Applications should call #delete or #delete_docs_with_term instead.
+     def do_delete(doc_num)
+       raise NotImplementedError
+     end
+
+     # Deletes all documents containing +term+.
+     # This is useful if one uses a document field to hold a unique ID
+     # string for the document. Then to delete such a document, one merely
+     # constructs a term with the appropriate field and the unique ID string
+     # as its text and passes it to this method. Returns the number of
+     # documents deleted. See #delete for information about when this
+     # deletion will become effective.
+     def delete_docs_with_term(term)
+       docs = term_docs_for(term)
+       return 0 if docs.nil?
+       n = 0
+       begin
+         while docs.next?
+           delete(docs.doc)
+           n += 1
+         end
+       ensure
+         docs.close()
+       end
+       return n
+     end
+
+     # Undeletes all documents currently marked as deleted in this index.
+     def undelete_all()
+       synchronize do
+         acquire_write_lock() if @directory_owner
+         do_undelete_all()
+         @has_changes = true
+       end
+     end
+
+     # Commit changes resulting from delete, undelete_all, or set_norm
+     # operations
+     #
+     # raises:: IOError
+     def commit()
+       synchronize do
+         if @has_changes
+           if @directory_owner
+             @directory.synchronize do # in- & inter-process sync
+               commit_lock = @directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
+               commit_lock.while_locked do
+                 do_commit()
+                 @segment_infos.write(@directory)
+               end
+             end
+             if @write_lock != nil
+               @write_lock.release() # release write lock
+               @write_lock = nil
+             end
+           else
+             do_commit()
+           end
+         end
+         @has_changes = false
+       end
+     end
+
+     # Closes files associated with this index.
+     # Also saves any new deletions to disk.
+     # No other methods should be called after this has been called.
+     def close()
+       synchronize do
+         commit()
+         do_close()
+         @directory.close() if @close_directory
+       end
+     end
+
+     protected
+
+     # Implements actual undelete_all() in subclass.
+     def do_undelete_all()
+       raise NotImplementedError
+     end
+
+     # Implements commit.
+     def do_commit()
+       raise NotImplementedError
+     end
+
+     # Implements close.
+     def do_close()
+       raise NotImplementedError
+     end
+
+     # Get a list of unique field names that exist in this index and have
+     # the specified field option information.
+     # fld_option:: specifies which field option should be available for
+     #              the returned fields
+     # returns::    Collection of Strings indicating the names of the fields.
+     # See IndexReader::FieldOption
+     def get_field_names()
+       raise NotImplementedError
+     end
+
+     # Returns +true+ iff the index in the named directory is currently
+     # locked.
+     # directory:: the directory to check for a lock
+     # raises::    IOError if there is a problem with accessing the index
+     def IndexReader.locked?(directory)
+       return (directory.make_lock(IndexWriter::WRITE_LOCK_NAME).locked? or
+               directory.make_lock(IndexWriter::COMMIT_LOCK_NAME).locked?)
+     end
+
+     # Forcibly unlocks the index in the named directory.
+     #
+     # Caution: this should only be used by failure recovery code, when it
+     # is known that no other process nor thread is in fact currently
+     # accessing this index.
+     def IndexReader.unlock(directory)
+       directory.make_lock(IndexWriter::WRITE_LOCK_NAME).release
+       directory.make_lock(IndexWriter::COMMIT_LOCK_NAME).release
+     end
+   end
+ end
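
To close out, a sketch of the delete-by-term flow IndexReader exposes, using only methods defined above. Here +dir+ stands for any Ferret::Store directory that already holds an index, and the "id" field follows the same convention Index#delete uses earlier in this diff:

  reader = Ferret::Index::IndexReader.open(dir)
  begin
    term = Ferret::Index::Term.new("id", "42")  # field/text pair, as in Index#delete
    n = reader.delete_docs_with_term(term)      # marks every matching doc deleted
    puts "deleted #{n} document(s)"
  ensure
    reader.close  # close() calls commit(), which writes the deletions to disk
  end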