ferret 0.1.0

This changeset lists the files added in the ferret 0.1.0 gem release; selected source files are shown below.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,304 @@
1
module Ferret::Document
  # A field is a section of a Document. Each field has two parts, a name
  # and a value. Values may be free text, provided as a String or as a
  # Reader, or they may be atomic keywords, which are not further processed.
  # Such keywords may be used to represent dates, urls, etc. Fields are
  # optionally stored in the index, so that they may be returned with hits
  # on the document.
  class Field

    # boost:: This value will be multiplied into the score of all hits on
    #         this field of this document.
    #
    #         The boost is multiplied by Document#boost of the document
    #         containing this field. If a document has multiple fields with
    #         the same name, all such values are multiplied together. This
    #         product is then multipled by the value
    #         Similarity#length_norm(String,int), and rounded by
    #         Similarity#encode_norm(float) before it is stored in the
    #         index. One should attempt to ensure that this product does not
    #         overflow the range of that encoding.
    #
    #         See Document#set_boost(float)
    #         See Similarity#length_norm(String, int)
    #         See Similarity#encode_norm(float)
    #
    #         Note: this value is not stored directly with the document in
    #         the index. Documents returned from IndexReader#document(int)
    #         and Hits#doc(int) may thus not have the same value present as
    #         when this field was indexed.
    # data::  the one and only data object for all different kinds of field
    #         values (String, Reader or binary object).
    attr_accessor :boost, :data

    # The name of the field (e.g., "date", "subject", "title", or "body")
    attr_reader :name

    # True iff the value of the field is to be stored in the index for
    # return with search hits. It is an error for this to be true if a
    # field is Reader-valued.
    def stored?() @stored end

    # True iff the value of the field is to be indexed, so that it may be
    # searched on.
    def indexed?() @indexed end

    # True iff the value of the field should be tokenized as text prior to
    # indexing. Un-tokenized fields are indexed as a single word and may
    # not be Reader-valued.
    def tokenized?() @tokenized end

    # True if the field is to be stored as a binary value. This can be used
    # to store images or other binary data in the index if you wish
    def binary?() @binary end

    # True if you want to compress the data that you store. This is a good
    # idea for really large text fields. The ruby Zlib library is used to do
    # the compression
    def compressed?() @compressed end

    # True iff the term or terms used to index this field are stored as a
    # term vector, available from IndexReader#term_freq_vector(). These
    # methods do not provide access to the original content of the field,
    # only to terms used to index it. If the original content must be
    # preserved, use the _stored_ attribute instead.
    #
    # See IndexReader#term_freq_vector()
    def store_term_vector?() @store_term_vector end

    # True if the positions of the indexed terms in this field are stored.
    def store_positions?() @store_position end

    # True if the offsets of this field are stored. The offsets are the
    # positions of the start and end characters of the token in the whole
    # field string
    def store_offsets?() @store_offset end

    # Type-safe enumeration of the storage options for a field's value.
    class Store < Ferret::Utils::Parameter
      # Store the original field value in the index in a compressed form.
      # This is useful for long documents and for binary valued fields.
      COMPRESS = Store.new("COMPRESS")

      # Store the original field value in the index. This is useful for
      # short texts like a document's title which should be displayed with
      # the results. The value is stored in its original form, i.e. no
      # analyzer is used before it is stored.
      YES = Store.new("YES")

      # Do not store the field value in the index.
      NO = Store.new("NO")
    end

    # Type-safe enumeration of the indexing options for a field's value.
    class Index < Ferret::Utils::Parameter
      # Do not index the field value. This field can thus not be searched,
      # but one can still access its contents provided it is Field.Store
      # stored
      NO = Index.new("NO")

      # Index the field's value so it can be searched. An Analyzer will be
      # used to tokenize and possibly further normalize the text before its
      # terms will be stored in the index. This is useful for common text.
      TOKENIZED = Index.new("TOKENIZED")

      # Index the field's value without using an Analyzer, so it can be
      # searched. As no analyzer is used the value will be stored as a
      # single term. This is useful for unique Ids like product numbers.
      UNTOKENIZED = Index.new("UNTOKENIZED")
    end

    # Type-safe enumeration of the term-vector options for a field.
    class TermVector < Ferret::Utils::Parameter
      # Do not store term vectors.
      NO = TermVector.new("NO")

      # Store the term vectors of each document. A term vector is a list of
      # the document's terms and their number of occurences in that
      # document.
      YES = TermVector.new("YES")

      # Store the term vector + token position information
      #
      # See #YES
      WITH_POSITIONS = TermVector.new("WITH_POSITIONS")

      # Store the term vector + Token offset information
      #
      # See #YES
      WITH_OFFSETS = TermVector.new("WITH_OFFSETS")

      # Store the term vector + Token position and offset information
      #
      # See #YES See #WITH_POSITIONS See #WITH_OFFSETS
      WITH_POSITIONS_OFFSETS = TermVector.new("WITH_POSITIONS_OFFSETS")
    end

    # Create a field by specifying its name, value and how it will
    # be saved in the index.
    #
    # name::   The name of the field
    # value::  The string to process
    # stored:: Whether _value_ should be stored in the index
    # index::  Whether the field should be indexed, and if so, if it should
    #          be tokenized before indexing
    # store_term_vector:: Whether term vector should be stored
    # binary:: Whether you want to store binary data in this field. Default
    #          is false
    # boost::  the boost for this field. Default is 1.0. A larger number
    #          makes this field more important.
    #
    # Raises ArgumentError if either of these invalid combinations is used:
    # * the field is neither stored nor indexed
    # * the field is not indexed but term_vector is not _TermVector::NO_
    def initialize(name,
                   value,
                   stored = Store::YES,
                   index = Index::UNTOKENIZED,
                   store_term_vector = TermVector::NO,
                   binary = false,
                   boost = 1.0)
      if (index == Index::NO and stored == Store::NO)
        raise ArgumentError, "it doesn't make sense to have a field that " +
          "is neither indexed nor stored"
      end
      if (index == Index::NO && store_term_vector != TermVector::NO)
        raise ArgumentError, "cannot store term vector information for a " +
          "field that is not indexed"
      end

      # The name of the field (e.g., "date", "subject", "title", or "body")
      @name = name

      # the one and only data object for all different kind of field values
      @data = value
      self.stored = stored
      self.index = index
      self.store_term_vector = store_term_vector
      @binary = binary
      @boost = boost
    end

    # Set the storage policy from a Store parameter, expanding it into the
    # internal @stored/@compressed flags.
    def stored=(stored)
      case stored
      when Store::YES
        @stored = true
        @compressed = false
      when Store::COMPRESS
        @stored = true
        @compressed = true
      when Store::NO
        @stored = false
        @compressed = false
      else
        raise "unknown stored parameter " + stored.to_s
      end
    end

    # Set the indexing policy from an Index parameter, expanding it into
    # the internal @indexed/@tokenized flags.
    def index=(index)
      case index
      when Index::NO
        @indexed = false
        @tokenized = false
      when Index::TOKENIZED
        @indexed = true
        @tokenized = true
      when Index::UNTOKENIZED
        @indexed = true
        @tokenized = false
      else
        # NOTE: original message said "stored" here; this reports the
        # correct parameter name.
        raise "unknown index parameter " + index.to_s
      end
    end

    # Set the term-vector policy from a TermVector parameter, expanding it
    # into the internal @store_term_vector/@store_position/@store_offset
    # flags.
    def store_term_vector=(store_term_vector)
      case store_term_vector
      when TermVector::NO
        @store_term_vector = false
        @store_position = false
        @store_offset = false
      when TermVector::YES
        @store_term_vector = true
        @store_position = false
        @store_offset = false
      when TermVector::WITH_POSITIONS
        @store_term_vector = true
        @store_position = true
        @store_offset = false
      when TermVector::WITH_OFFSETS
        @store_term_vector = true
        @store_position = false
        @store_offset = true
      when TermVector::WITH_POSITIONS_OFFSETS
        @store_term_vector = true
        @store_position = true
        @store_offset = true
      else
        raise "unknown term_vector parameter " + store_term_vector.to_s
      end
    end

    # Returns the string value of the data that is stored in this field
    def string_value
      if @data.instance_of? String
        return @data
      elsif @data.respond_to? :read
        return @data.read()
      else
        # if it is binary object try to return a string representation
        return @data.to_s
      end
    end

    # if the data is stored as a binary, just return it.
    def binary_value
      return @data
    end

    # Returns a Reader for the data that is stored in this field, wrapping
    # Strings (or anything else) in a StringReader as needed.
    def reader_value
      if @data.respond_to? :read
        return @data
      elsif @data.instance_of? String
        return Ferret::Utils::StringHelper::StringReader.new(@data)
      else
        # if it is binary object try to return a string representation
        return Ferret::Utils::StringHelper::StringReader.new(@data.to_s)
      end
    end

    # Create a stored field with binary value. Optionally the value
    # may be compressed. But it obviously won't be tokenized or
    # term vectored or anything like that.
    #
    # name::   The name of the field
    # value::  The binary value
    # stored:: How _value_ should be stored (compressed or not.)
    #
    # Raises ArgumentError if stored is Store::NO.
    def Field.new_binary_field(name, value, stored)
      if (stored == Store::NO)
        raise ArgumentError, "binary values can't be unstored"
      end
      Field.new(name, value, stored, Index::NO, TermVector::NO, true)
    end

    # Prints a Field for human consumption.
    def to_s()
      str = ""
      if (@stored)
        str << "stored"
        # NOTE: original wrote `@str << @compressed ? ... : ...`, which
        # referenced an undefined @str and mis-parenthesized the ternary
        # (`<<` binds tighter than `?:`). Fixed here.
        str << (@compressed ? "/compressed," : "/uncompressed,")
      end
      if (@indexed) then str << "indexed," end
      if (@tokenized) then str << "tokenized," end
      if (@store_term_vector) then str << "store_term_vector," end
      if (@store_offset)
        str << "term_vector_offsets,"
      end
      if (@store_position)
        str << "term_vector_position,"
      end
      if (@binary) then str << "binary," end

      str << '<'
      str << @name
      str << ':'

      # original compared against Java's `null`; Ruby's nil is intended
      if (@data != nil)
        str << @data.to_s
      end

      str << '>'
    end
  end
end
@@ -0,0 +1,26 @@
1
# Load order matters here: low-level term/IO primitives first, then the
# segment machinery, then the reader/writer front ends.
# NOTE: the original listed 'ferret/index/term_buffer' twice; the duplicate
# has been removed (require is idempotent, so this is behavior-preserving).
require 'ferret/index/index_file_names'
require 'ferret/index/term'
require 'ferret/index/term_buffer'
require 'ferret/index/term_doc_enum'
require 'ferret/index/multiple_term_doc_pos_enum'
require 'ferret/index/term_enum'
require 'ferret/index/term_info'
require 'ferret/index/term_infos_io'
require 'ferret/index/term_vector_offset_info'
require 'ferret/index/term_vectors_io'
require 'ferret/index/field_infos'
require 'ferret/index/fields_io'
require 'ferret/index/compound_file_io'
require 'ferret/index/segment_term_enum'
require 'ferret/index/segment_term_vector'
require 'ferret/index/segment_merge_info'
require 'ferret/index/segment_merge_queue'
require 'ferret/index/segment_infos'
require 'ferret/index/document_writer'
require 'ferret/index/index_reader'
require 'ferret/index/index_writer'
require 'ferret/index/multi_reader'
require 'ferret/index/segment_merger'
require 'ferret/index/segment_reader'
require 'ferret/index/index'
@@ -0,0 +1,343 @@
1
+ require 'monitor'
2
+
3
+ module Ferret::Index
4
+
5
+ # Class for accessing a compound stream.
6
+ # This class implements a directory, but is limited to only read operations.
7
+ # Directory methods that would normally modify data raise.
8
+ class CompoundFileReader < Ferret::Store::Directory
9
+
10
+ include MonitorMixin
11
+
12
+ attr_reader :directory, :file_name
13
+
14
+ # Creates a Compound File Reader which contains a single file and has
15
+ # pointers to the individual files within. When it is initialized, the
16
+ # compound file is set and the header is read so that it is ready to read
17
+ # the individual files within.
18
+ def initialize(dir, name)
19
+
20
+ super()
21
+
22
+ @directory = dir
23
+ @file_name = name
24
+ @entries = {}
25
+
26
+ success = false
27
+
28
+ begin
29
+ @stream = dir.open_input(name)
30
+
31
+ # read the directory and init files
32
+ count = @stream.read_vint()
33
+ entry = nil
34
+ count.times() do
35
+ offset = @stream.read_long()
36
+ id = @stream.read_string()
37
+
38
+ if (entry != nil)
39
+ # set length of the previous entry
40
+ entry.length = offset - entry.offset
41
+ end
42
+
43
+ entry = FileEntry.new(offset)
44
+ @entries[id] = entry
45
+ end
46
+
47
+ # set the length of the final entry
48
+ if (entry != nil)
49
+ entry.length = @stream.length() - entry.offset
50
+ end
51
+
52
+ success = true
53
+
54
+ ensure
55
+
56
+ if not success and (@stream != nil)
57
+ begin
58
+ @stream.close()
59
+ rescue IOError
60
+ end
61
+ end
62
+ end
63
+ end
64
+
65
+ def close()
66
+ synchronize do
67
+ if (@stream == nil): raise(IOError, "Already closed") end
68
+
69
+ @entries.clear()
70
+ @stream.close()
71
+ @stream = nil
72
+ end
73
+ end
74
+
75
+ def open_input(id)
76
+ synchronize do
77
+ if (@stream == nil)
78
+ raise(IOError, "Stream closed")
79
+ end
80
+
81
+ entry = @entries[id]
82
+ if (entry == nil)
83
+ raise(IOError, "No sub-file with id " + id + " found")
84
+ end
85
+ return CSIndexInput.new(@stream, entry.offset, entry.length)
86
+ end
87
+ end
88
+
89
+ # Returns an array of strings, one for each file in the directory.
90
+ def list()
91
+ return @entries.keys()
92
+ end
93
+
94
+ # Returns true iff a file with the given name exists.
95
+ def file_exists(name)
96
+ return @entries.key?(name)
97
+ end
98
+
99
+ # Returns the time the named file was last modified.
100
+ def modified(name)
101
+ return @directory.modified(@file_name)
102
+ end
103
+
104
+ # Set the modified time of an existing file to now.
105
+ def touch(name)
106
+ @directory.touch(@file_name)
107
+ end
108
+
109
+ # Not implemented
110
+ def delete(name) raise(UnsupportedOperationError) end
111
+
112
+ # Not implemented
113
+ def rename(from, to) raise(UnsupportedOperationError) end
114
+
115
+ # Returns the length of a file in the directory.
116
+ def file_length(name)
117
+ e = @entries[name]
118
+ if (e == nil): raise(IOError, "File " + name + " does not exist") end
119
+ return e.length
120
+ end
121
+
122
+ # Not implemented
123
+ def create_output(name) raise(UnsupportedOperationError) end
124
+
125
+ # Not implemented
126
+ def make_lock(name) raise(UnsupportedOperationError) end
127
+
128
+ # Implementation of an IndexInput that reads from a portion of the
129
+ # compound file.
130
+ class CSIndexInput < Ferret::Store::BufferedIndexInput
131
+ attr_reader :length
132
+
133
+ def initialize(base, file_offset, length)
134
+ super()
135
+ @base = base
136
+ @base.extend(MonitorMixin)
137
+ @file_offset = file_offset
138
+ @length = length
139
+ end
140
+
141
+ # Closes the stream to further operations.
142
+ def close() end
143
+
144
+ private
145
+ # Expert: implements buffer refill. Reads bytes from the current
146
+ # position in the input.
147
+ #
148
+ # b:: the array to read bytes into
149
+ # offset:: the offset in the array to start storing bytes
150
+ # len:: the number of bytes to read
151
+ def read_internal(b, offset, len)
152
+ @base.synchronize() do
153
+ start = pos()
154
+ if(start + len > @length): raise(EOFError, "read past EOF") end
155
+ @base.seek(@file_offset + start)
156
+ @base.read_bytes(b, offset, len)
157
+ end
158
+ end
159
+
160
+ # Expert: implements seek. Sets current position in @file, where
161
+ # the next {@link #read_internal(byte[],int,int)} will occur.
162
+ def seek_internal(pos) end
163
+ end
164
+
165
+ private
166
+ # Base info
167
+ class FileEntry
168
+ attr_accessor :offset, :length
169
+ def initialize(offset)
170
+ @offset = offset
171
+ end
172
+ end
173
+
174
+ end
175
+
176
+ # Combines multiple files into a single compound file.
177
+ # The file format:
178
+ #
179
+ # * VInt fileCount
180
+ # * {Directory} fileCount entries with the following structure:
181
+ # + long data_offset
182
+ # + UTFString extension
183
+ # * {File Data} fileCount entries with the raw data of the corresponding file
184
+ #
185
+ # The fileCount integer indicates how many files are contained in this compound
186
+ # file. The {directory} that follows has that many entries. Each directory entry
187
+ # contains an encoding identifier, a long pointer to the start of this file's
188
+ # data section, and a UTF String with that file's extension.
189
+ class CompoundFileWriter
190
+
191
+ attr_reader :directory, :file_name
192
+
193
+ # Create the compound stream in the specified file. The file name is the
194
+ # entire name (no extensions are added).
195
+ def initialize(dir, name)
196
+ @directory = dir
197
+ @file_name = name
198
+ @ids = Set.new
199
+ @file_entries = []
200
+ @merged = false
201
+ end
202
+
203
+ # Add a source stream. _file_name_ is the string by which the
204
+ # sub-stream will be known in the compound stream.
205
+ #
206
+ # Throws:: IllegalStateError if this writer is closed
207
+ # Throws:: IllegalArgumentError if a file with the same name
208
+ # has been added already
209
+ def add_file(file_name)
210
+ if @merged
211
+ raise(IllegalStateError, "Can't add extensions after merge has been called")
212
+ end
213
+
214
+ if not @ids.add?(file_name)
215
+ raise(IllegalArgumentError, "File " + file + " already added")
216
+ end
217
+
218
+ entry = FileEntry.new(file_name)
219
+ @file_entries << entry
220
+ end
221
+
222
+ # Merge files with the extensions added up to now.
223
+ # All files with these extensions are combined sequentially into the
224
+ # compound stream. After successful merge, the source files
225
+ # are deleted.
226
+ #
227
+ # Throws:: IllegalStateException if close() had been called before or
228
+ # if no file has been added to this object
229
+ def close()
230
+
231
+ if @merged
232
+ raise(IllegalStateException, "Merge already performed")
233
+ end
234
+
235
+ if @file_entries.empty?
236
+ raise(IllegalStateException, "No entries to merge have been defined")
237
+ end
238
+
239
+ @merged = true
240
+
241
+ # open the compound stream
242
+ os = nil
243
+ begin
244
+ os = @directory.create_output(@file_name)
245
+
246
+ # Write the number of entries
247
+ os.write_vint(@file_entries.size)
248
+
249
+ # Write the directory with all offsets at 0.
250
+ # Remember the positions of directory entries so that we can
251
+ # adjust the offsets later
252
+ @file_entries.each do |fe|
253
+ fe.directory_offset = os.pos()
254
+ os.write_long(0) # for now
255
+ os.write_string(fe.file_name)
256
+ end
257
+
258
+ # Open the files and copy their data into the stream.
259
+ # Remember the locations of each file's data section.
260
+ @file_entries.each do |fe|
261
+ fe.data_offset = os.pos()
262
+ copy_file(fe, os)
263
+ end
264
+
265
+ # Write the data offsets into the directory of the compound stream
266
+ @file_entries.each do |fe|
267
+ os.seek(fe.directory_offset)
268
+ os.write_long(fe.data_offset)
269
+ end
270
+
271
+ # Close the output stream. Set the os to nil before trying to
272
+ # close so that if an exception occurs during the close, the
273
+ # finally clause below will not attempt to close the stream
274
+ # the second time.
275
+ tmp = os
276
+ os = nil
277
+ tmp.close()
278
+
279
+ ensure
280
+ if (os != nil)
281
+ begin
282
+ os.close()
283
+ rescue
284
+ end
285
+ end
286
+ end
287
+ end
288
+
289
+ private
290
+
291
+ # Internal class for holding a file
292
+ class FileEntry
293
+
294
+ attr_accessor :file_name, :directory_offset, :data_offset
295
+
296
+ def initialize(file_name)
297
+ @file_name = file_name
298
+ end
299
+
300
+ end
301
+
302
+ # Copy the contents of the file with specified extension into the
303
+ # provided output stream. Use a buffer for moving data
304
+ # to reduce memory allocation.
305
+ def copy_file(source, os)
306
+ is = nil
307
+ begin
308
+ start_ptr = os.pos()
309
+
310
+ is = @directory.open_input(source.file_name)
311
+ remainder = length = is.length
312
+
313
+ buffer = Ferret::Store::BUFFER.clone
314
+ while (remainder > 0)
315
+ len = [remainder, Ferret::Store::BUFFER_SIZE].min
316
+ is.read_bytes(buffer, 0, len)
317
+ os.write_bytes(buffer, len)
318
+ remainder -= len
319
+ end
320
+
321
+ # Verify that remainder is 0
322
+ if (remainder != 0)
323
+ raise(IOError,
324
+ "Non-zero remainder length after copying: " + remainder.to_s +
325
+ " (id: " + source.file_name + ", length: " + length.to_s +
326
+ ", buffer size: " + Ferret::Store::BUFFER_SIZE.to_s + ")")
327
+ end
328
+
329
+ # Verify that the output length diff is equal to original file
330
+ end_ptr = os.pos()
331
+ diff = end_ptr - start_ptr
332
+ if (diff != length)
333
+ raise(IOError,
334
+ "Difference in the output file offsets " + diff.to_s +
335
+ " does not match the original file length " + length.to_s)
336
+ end
337
+
338
+ ensure
339
+ if (is != nil): is.close() end
340
+ end
341
+ end
342
+ end
343
+ end