ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,288 @@
1
+ require 'ferret/search/similarity'
2
+
3
+ module Ferret::Index
4
+
5
+ class DocumentWriter
6
+ # If non-nil, a message will be printed to this if max_field_length is
7
+ # reached.
8
+ attr_writer :info_stream
9
+
10
+ # directory:: The directory to write the document information to
11
+ # analyzer:: The analyzer to use for the document
12
+ # similarity:: The Similarity function writer.similarity
13
+ # max_field_length:: The maximum number of tokens a field may have
14
+ # writer.max_field_length
15
+ # term_index_interval:: The interval of terms in the index
16
+ # writer.max_field_length
17
+ def initialize(directory,
18
+ analyzer,
19
+ similarity,
20
+ max_field_length,
21
+ term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
22
+ @directory = directory
23
+ @analyzer = analyzer
24
+ @similarity = similarity
25
+ @max_field_length = max_field_length
26
+ @term_index_interval = term_index_interval
27
+
28
+ # Keys are Terms, values are Postings.
29
+ # Used to buffer a document before it is written to the index.
30
+ @posting_table = {}
31
+
32
+ @term_buffer = Term.new("", "")
33
+ end
34
+
35
+ def add_document(segment, doc)
36
+
37
+ # write field names
38
+ @field_infos = FieldInfos.new()
39
+ @field_infos << doc
40
+ @field_infos.write_to_dir(@directory, segment + ".fnm")
41
+
42
+ # write field values
43
+ fields_writer = FieldsWriter.new(@directory, segment, @field_infos)
44
+ begin
45
+ fields_writer.add_document(doc)
46
+ ensure
47
+ fields_writer.close()
48
+ end
49
+
50
+ # invert doc into posting_table
51
+ @posting_table.clear(); # clear posting_table
52
+ arr_size = @field_infos.size
53
+ @field_lengths = Array.new(arr_size, 0) # init field_lengths
54
+ @field_positions = Array.new(arr_size, 0) # init field_positions
55
+ @field_offsets = Array.new(arr_size, 0) # init field_offsets
56
+ @field_boosts = Array.new(arr_size, doc.boost) # init field_boosts
57
+
58
+ invert_document(doc)
59
+
60
+ # sort posting_table into an array
61
+ postings = sort_posting_table()
62
+
63
+ # for (int i = 0; i < postings.length; i += 1)
64
+ # Posting posting = postings[i]
65
+ # print(posting.term)
66
+ # print(" freq=" + posting.freq)
67
+ # print(" pos=")
68
+ # print(posting.positions[0])
69
+ # for (int j = 1; j < posting.freq; j += 1)
70
+ # print("," + posting.positions[j])
71
+ # puts("")
72
+ # end
73
+
74
+ # write postings
75
+ write_postings(postings, segment)
76
+
77
+ # write norms of indexed fields
78
+ write_norms(segment)
79
+
80
+ end
81
+
82
+ private
83
+
84
+ # Tokenizes the fields of a document into Postings.
85
+ def invert_document(doc)
86
+
87
+ fields = doc.all_fields
88
+ fields.each do |field|
89
+ field_name = field.name
90
+ field_info = @field_infos[field_name]
91
+ field_number = field_info.number
92
+
93
+ length = @field_lengths[field_number] # length of field
94
+ position = @field_positions[field_number] # position in field
95
+ offset = @field_offsets[field_number] # offset field
96
+
97
+ if field_info.indexed?
98
+ if not field.tokenized? # un-tokenized field
99
+ string_value = field.string_value
100
+ if field_info.store_offsets?
101
+ add_position(field_name,
102
+ string_value,
103
+ position,
104
+ TermVectorOffsetInfo.new(offset,
105
+ offset + string_value.length))
106
+ position += 1
107
+ else
108
+ add_position(field_name, string_value, position, nil)
109
+ position += 1
110
+ end
111
+ offset += string_value.length()
112
+ length += 1
113
+ else
114
+
115
+ reader = field.reader_value()
116
+
117
+ # Tokenize field and add to posting_table
118
+ stream = @analyzer.token_stream(field_name, reader)
119
+ begin
120
+ last_token = nil
121
+ while token = stream.next
122
+ position += (token.position_increment - 1)
123
+
124
+ if(field_info.store_offsets?())
125
+ add_position(field_name,
126
+ token.term_text(),
127
+ position,
128
+ TermVectorOffsetInfo.new(
129
+ offset + token.start_offset(),
130
+ offset + token.end_offset()))
131
+ position += 1
132
+ else
133
+ add_position(field_name, token.term_text(), position, nil)
134
+ position += 1
135
+ end
136
+
137
+ last_token = token
138
+ length += 1
139
+ if (length > @max_field_length)
140
+ if @info_stream
141
+ @info_stream.puts("max_field_length " + @max_field_length.to_s + " reached, ignoring following tokens")
142
+ end
143
+ break
144
+ end
145
+ end
146
+
147
+ if(last_token != nil)
148
+ offset += last_token.end_offset() + 1
149
+ end
150
+
151
+ ensure
152
+ stream.close()
153
+ end
154
+ end
155
+
156
+ @field_lengths[field_number] = length # save field length
157
+ @field_positions[field_number] = position # save field position
158
+ @field_boosts[field_number] *= field.boost
159
+ @field_offsets[field_number] = offset
160
+ end
161
+ end
162
+ end
163
+
164
+
165
+ def add_position(field, text, position, tv_offset_info)
166
+ @term_buffer.set!(field, text)
167
+ #puts("Offset: " + tv_offset_info)
168
+ posting = @posting_table[@term_buffer]
169
+ if (posting != nil) # word seen before
170
+ freq = posting.freq
171
+ posting.positions[freq] = position # add new position
172
+ posting.offsets[freq] = tv_offset_info # add new position
173
+
174
+ if (tv_offset_info != nil)
175
+ posting.offsets[freq] = tv_offset_info
176
+ end
177
+ posting.freq = freq + 1 # update frequency
178
+ else # word not seen before
179
+ term = Term.new(field, text)
180
+ @posting_table[term] = Posting.new(term, position, tv_offset_info)
181
+ end
182
+ end
183
+
184
+ def sort_posting_table()
185
+ # copy @posting_table into an array
186
+ return @posting_table.values.sort { |x,y| x.term <=> y.term }
187
+ end
188
+
189
+ def write_postings(postings, segment)
190
+
191
+ freq = nil
192
+ prox = nil
193
+ tis_writer = nil
194
+ tv_writer = nil
195
+ begin
196
+ #open files for inverse index storage
197
+ freq = @directory.create_output(segment + ".frq")
198
+ prox = @directory.create_output(segment + ".prx")
199
+ tis_writer = TermInfosWriter.new(@directory, segment, @field_infos,
200
+ @term_index_interval)
201
+ ti = TermInfo.new()
202
+ current_field = nil
203
+
204
+ postings.each do |posting|
205
+ # add an entry to the dictionary with pointers to prox and freq files
206
+ ti.set_values!(1, freq.pos(), prox.pos(), -1)
207
+ tis_writer.add(posting.term, ti)
208
+
209
+ # add an entry to the freq file
210
+ posting_freq = posting.freq
211
+ if (posting_freq == 1) # optimize freq=1
212
+ freq.write_vint(1) # set low bit of doc num.
213
+ else
214
+ freq.write_vint(0) # the document number
215
+ freq.write_vint(posting_freq) # frequency in doc
216
+ end
217
+
218
+ last_position = 0 # write positions
219
+ posting.positions.each do |position|
220
+ prox.write_vint(position - last_position)
221
+ last_position = position
222
+ end
223
+ # check to see if we switched to a new field
224
+ term_field = posting.term.field
225
+ if (current_field != term_field)
226
+ # changing field - see if there is something to save
227
+ current_field = term_field
228
+ fi = @field_infos[current_field]
229
+ if (fi.store_term_vector?)
230
+ if tv_writer.nil?
231
+ tv_writer = TermVectorsWriter.new(@directory, segment, @field_infos)
232
+ tv_writer.open_document()
233
+ end
234
+ tv_writer.open_field(current_field)
235
+
236
+ elsif not tv_writer.nil?
237
+ tv_writer.close_field()
238
+ end
239
+ end
240
+ if not tv_writer.nil? and tv_writer.field_open?
241
+ tv_writer.add_term(posting.term.text, posting_freq, posting.positions, posting.offsets)
242
+ end
243
+ end
244
+ if not tv_writer.nil?
245
+ tv_writer.close_document()
246
+ end
247
+ ensure
248
+ # make an effort to close all streams we can but remember and re-raise
249
+ # the last exception encountered in this process
250
+ keep = nil
251
+ [freq, prox, tis_writer, tv_writer].compact.each do |obj|
252
+ begin
253
+ obj.close
254
+ rescue IOError => e
255
+ keep = e
256
+ end
257
+ end
258
+ raise keep if not keep.nil?
259
+ end
260
+ end
261
+
262
+ def write_norms(segment)
263
+ @field_infos.each_with_index do |fi, i|
264
+ if fi.indexed?
265
+ norm = @field_boosts[i] * @similarity.length_norm(fi.name, @field_lengths[i])
266
+ norms = @directory.create_output(segment + ".f" + i.to_s)
267
+ begin
268
+ norms.write_byte(Ferret::Search::Similarity.encode_norm(norm))
269
+ ensure
270
+ norms.close()
271
+ end
272
+ end
273
+ end
274
+ end
275
+
276
+ end
277
+
278
# Per-document statistics for one Term: how many times it occurred, at
# which positions, and (optionally) with which term-vector offsets.
class Posting
  attr_accessor :term, :freq, :positions, :offsets

  # term::           the Term this posting records occurrences of
  # first_position:: position of the first occurrence in the field
  # first_offset::   TermVectorOffsetInfo of the first occurrence, or nil
  def initialize(term, first_position, first_offset)
    @term = term
    @freq = 1
    @positions = [first_position]
    @offsets = [first_offset]
  end
end
288
+ end
@@ -0,0 +1,259 @@
1
+ module Ferret
2
+ module Index
3
# Access to the Field Info file that describes document fields and whether or
# not they are indexed. Each segment has a separate Field Info file. Objects
# of this class are thread-safe for multiple readers, but only one thread can
# be adding documents at a time, with no other reader or writer threads
# accessing this object.
class FieldInfos

  NOT_A_FIELD = 0xffffffff # -1 in java int

  # Construct a FieldInfos object using the directory and the name of the file
  # InputStream. With no arguments an empty FieldInfos is created.
  #
  # dir::  The directory to open the InputStream from
  # name:: The name of the file to open the InputStream from in the Directory
  def initialize(dir = nil, name = nil)
    @fi_array = []
    @fi_hash = {}
    if dir and dir.exists?(name)
      input = dir.open_input(name)
      begin
        read(input)
      ensure
        input.close
      end
    end
  end

  # Returns the number of fields that have been added to this field infos
  # object.
  #
  # NOTE: There is a default empty field always added at the start. This
  # may later be used to set the default values for a field.
  def size
    return @fi_array.size
  end

  # Automatically adds all of the fields from the document if they haven't
  # been added already. Or it will update the values.
  def add_doc_fields(doc)
    doc.all_fields.each do |field|
      add(field.name,
          field.indexed?,
          field.store_term_vector?,
          field.store_positions?,
          field.store_offsets?)
    end
  end
  alias :<< :add_doc_fields

  # Calls the 5 param add method to add all the names in the collection
  def add_fields(names,
                 indexed = true,
                 store_term_vector = false,
                 store_position = false,
                 store_offset = false)
    names.each do |name|
      add(name, indexed, store_term_vector, store_position, store_offset)
    end
  end

  # If the field is not yet known, adds it. If it is known, checks to make
  # sure that the indexed flag is the same as was given previously for this
  # field. If not - marks it as being indexed. Same goes for the TermVector
  # parameters.
  #
  # name::              The name of the field
  # indexed::           true if the field is indexed
  # store_term_vector:: true if the term vector should be stored
  # store_position::    true if the positions should be stored
  # store_offset::      true if the offsets should be stored
  def add(name,
          indexed = true,
          store_term_vector = false,
          store_position = false,
          store_offset = false)
    fi = @fi_hash[name]
    if fi == nil
      fi = add_internal(name, indexed, store_term_vector, store_position, store_offset)
    else
      # flags are sticky: once set for a field they are never cleared
      if fi.indexed? != indexed
        fi.indexed = true # once indexed, always index
      end
      if fi.store_term_vector? != store_term_vector
        fi.store_term_vector = true # once vector, always vector
      end
      if fi.store_positions? != store_position
        fi.store_position = true # once vector, always vector
      end
      if fi.store_offsets? != store_offset
        fi.store_offset = true # once vector, always vector
      end
    end
    return fi
  end

  # Returns the number of the field that goes by the field name that is
  # passed. If there is no field of this name then NOT_A_FIELD is returned
  def field_number(name)
    fi = @fi_hash[name]
    return fi ? fi.number : NOT_A_FIELD
  end

  # Retrieve the field_info object by either field number or field name.
  def [](index)
    if index.is_a? Integer
      if index == NOT_A_FIELD || index < 0 # < 0 is for C extensions
        return FieldInfo.new("", false, NOT_A_FIELD, false)
      end
      return @fi_array[index]
    else
      return @fi_hash[index]
    end
  end

  # Return the name of the field with number +index+, or "" for the
  # NOT_A_FIELD sentinel.
  def name(index)
    if index == NOT_A_FIELD || index < 0 # < 0 is for C extensions
      return ""
    end
    return self[index].name
  end

  # Iterate through the field_info objects
  def each
    @fi_array.each { |fi| yield(fi) }
  end

  # Iterate through the field_info objects including the index
  def each_with_index
    @fi_array.each_with_index { |fi, i| yield(fi, i) }
  end

  # Return true if any of the fields have store_term_vector? set to true
  def has_vectors?
    @fi_array.each { |fi| return true if fi.store_term_vector? }
    return false
  end

  # Write the field_infos to a file specified by name in dir.
  #
  # dir::  the directory to write the fieldinfos to
  # name:: the name of the file to write to.
  def write_to_dir(dir, name)
    output = dir.create_output(name)
    begin
      write(output)
    ensure
      output.close
    end
  end

  protected

  # Write the field_infos to the output file
  #
  # output:: the file to write to
  def write(output)
    output.write_vint(size)
    @fi_array.each do |fi|
      output.write_string(fi.name)
      output.write_byte(get_field_info_byte(fi))
    end
  end

  # Read the field_infos object from the input file
  #
  # input:: the input file to read from
  def read(input)
    size = input.read_vint # read in the size
    size.times do
      name = input.read_string
      bits = input.read_byte
      indexed = (bits & IS_INDEXED) != 0
      store_term_vector = (bits & STORE_TERM_VECTOR) != 0
      store_position = (bits & STORE_POSITION) != 0
      store_offset = (bits & STORE_OFFSET) != 0
      add_internal(name, indexed, store_term_vector, store_position, store_offset)
    end
  end

  private

  # Bit flags packed into the single byte stored per field.
  IS_INDEXED = 0x1
  STORE_TERM_VECTOR = 0x2
  STORE_POSITION = 0x4
  STORE_OFFSET = 0x8

  # Append a new FieldInfo, numbering it with its array position.
  def add_internal(name, indexed, store_term_vector,
                   store_position = false,
                   store_offset = false)
    fi = FieldInfo.new(name, indexed,
                       @fi_array.size,
                       store_term_vector,
                       store_position,
                       store_offset)
    @fi_array << fi
    @fi_hash[name] = fi
    return fi
  end

  # Pack a FieldInfo's flags into the byte written to the .fnm file.
  def get_field_info_byte(fi)
    bits = 0x0
    if fi.indexed?
      bits |= IS_INDEXED
    end
    if fi.store_term_vector?
      bits |= STORE_TERM_VECTOR
    end
    if fi.store_positions?
      bits |= STORE_POSITION
    end
    if fi.store_offsets?
      bits |= STORE_OFFSET
    end
    return bits
  end
end
+
224
+ class FieldInfo
225
+ attr_accessor :name, :number
226
+ attr_writer :indexed, :store_term_vector, :store_offset, :store_position
227
+
228
+ def indexed?()
229
+ return @indexed
230
+ end
231
+
232
+ def store_term_vector?()
233
+ return @store_term_vector
234
+ end
235
+
236
+ def store_offsets?()
237
+ return @store_offset
238
+ end
239
+ def store_positions?()
240
+ return @store_position
241
+ end
242
+
243
+ def set!(indexed, store_term_vector, store_position, store_offset)
244
+ @indexed = indexed
245
+ @store_term_vector = store_term_vector
246
+ @store_position = store_position
247
+ @store_offset = store_offset
248
+ end
249
+
250
+ def initialize(name, indexed, number, store_term_vector,
251
+ store_position = false,
252
+ store_offset = false)
253
+ @name = name
254
+ @number = number
255
+ set!(indexed, store_term_vector, store_position, store_offset)
256
+ end
257
+ end
258
+ end
259
+ end