ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
data/lib/ferret/document/field.rb
@@ -0,0 +1,304 @@
+ module Ferret::Document
+   # A field is a section of a Document. Each field has two parts, a name
+   # and a value. Values may be free text, provided as a String or as a
+   # Reader, or they may be atomic keywords, which are not further processed.
+   # Such keywords may be used to represent dates, urls, etc. Fields are
+   # optionally stored in the index, so that they may be returned with hits
+   # on the document.
+   class Field
+
+     # This value will be multiplied into the score of all hits on this
+     # field of this document.
+     #
+     # The boost is multiplied by Document#boost of the document
+     # containing this field. If a document has multiple fields with the same
+     # name, all such values are multiplied together. This product is then
+     # multiplied by the value Similarity#length_norm(String, int), and
+     # rounded by Similarity#encode_norm(float) before it is stored in the
+     # index. One should attempt to ensure that this product does not overflow
+     # the range of that encoding.
+     #
+     # See Document#set_boost(float)
+     # See Similarity#length_norm(String, int)
+     # See Similarity#encode_norm(float)
+     #
+     # Note: this value is not stored directly with the document in the index.
+     # Documents returned from IndexReader#document(int) and
+     # Hits#doc(int) may thus not have the same value present as when this
+     # field was indexed.
+     attr_accessor :boost, :data
+
+     attr_reader :name
+
+     # True iff the value of the field is to be stored in the index for
+     # return with search hits. It is an error for this to be true if a
+     # field is Reader-valued.
+     def stored?() return @stored end
+
+     # True iff the value of the field is to be indexed, so that it may be
+     # searched on.
+     def indexed?() return @indexed end
+
+     # True iff the value of the field should be tokenized as text prior to
+     # indexing. Un-tokenized fields are indexed as a single word and may
+     # not be Reader-valued.
+     def tokenized?() return @tokenized end
+
+     # True if the field is to be stored as a binary value. This can be used
+     # to store images or other binary data in the index if you wish.
+     def binary?() return @binary end
+
+     # True if you want to compress the data that you store. This is a good
+     # idea for really large text fields. The ruby Zlib library is used to do
+     # the compression.
+     def compressed?() return @compressed end
+
+     # True iff the term or terms used to index this field are stored as a
+     # term vector, available from IndexReader#term_freq_vector(). These
+     # methods do not provide access to the original content of the field,
+     # only to terms used to index it. If the original content must be
+     # preserved, use the _stored_ attribute instead.
+     #
+     # See IndexReader#term_freq_vector()
+     def store_term_vector?() return @store_term_vector end
+
+     # True if the positions of the indexed terms in this field are stored.
+     def store_positions?() return @store_position end
+
+     # True if the offsets of this field are stored. The offsets are the
+     # positions of the start and end characters of the token in the whole
+     # field string.
+     def store_offsets?() return @store_offset end
+
+     class Store < Ferret::Utils::Parameter
+       # Store the original field value in the index in a compressed form.
+       # This is useful for long documents and for binary valued fields.
+       COMPRESS = Store.new("COMPRESS")
+
+       # Store the original field value in the index. This is useful for
+       # short texts like a document's title which should be displayed with
+       # the results. The value is stored in its original form, i.e. no
+       # analyzer is used before it is stored.
+       YES = Store.new("YES")
+
+       # Do not store the field value in the index.
+       NO = Store.new("NO")
+     end
+
+     class Index < Ferret::Utils::Parameter
+       # Do not index the field value. Such a field cannot be searched,
+       # but its contents can still be accessed as long as it is stored
+       # (see Field::Store).
+       NO = Index.new("NO")
+
+       # Index the field's value so it can be searched. An Analyzer will be
+       # used to tokenize and possibly further normalize the text before its
+       # terms will be stored in the index. This is useful for common text.
+       TOKENIZED = Index.new("TOKENIZED")
+
+       # Index the field's value without using an Analyzer, so it can be
+       # searched. As no analyzer is used the value will be stored as a
+       # single term. This is useful for unique Ids like product numbers.
+       UNTOKENIZED = Index.new("UNTOKENIZED")
+     end
+
+     class TermVector < Ferret::Utils::Parameter
+       # Do not store term vectors.
+       NO = TermVector.new("NO")
+
+       # Store the term vectors of each document. A term vector is a list of
+       # the document's terms and their number of occurrences in that
+       # document.
+       YES = TermVector.new("YES")
+
+       # Store the term vector + token position information.
+       #
+       # See #YES
+       WITH_POSITIONS = TermVector.new("WITH_POSITIONS")
+
+       # Store the term vector + token offset information.
+       #
+       # See #YES
+       WITH_OFFSETS = TermVector.new("WITH_OFFSETS")
+
+       # Store the term vector + token position and offset information.
+       #
+       # See #YES, #WITH_POSITIONS and #WITH_OFFSETS
+       WITH_POSITIONS_OFFSETS = TermVector.new("WITH_POSITIONS_OFFSETS")
+     end
+
+     # Create a field by specifying its name, value and how it will
+     # be saved in the index.
+     #
+     # name::  The name of the field
+     # value:: The string to process
+     # store:: Whether _value_ should be stored in the index
+     # index:: Whether the field should be indexed, and if so, if it should
+     #         be tokenized before indexing
+     # store_term_vector:: Whether the term vector should be stored.
+     #                     An ArgumentError is raised if:
+     #                     * the field is neither stored nor indexed, or
+     #                     * the field is not indexed but term_vector is not
+     #                       _TermVector::NO_
+     # binary:: Whether you want to store binary data in this field.
+     #          Default is false
+     # boost::  the boost for this field. Default is 1.0. A larger number
+     #          makes this field more important.
+     def initialize(name,
+                    value,
+                    stored = Store::YES,
+                    index = Index::UNTOKENIZED,
+                    store_term_vector = TermVector::NO,
+                    binary = false,
+                    boost = 1.0)
+       if (index == Index::NO and stored == Store::NO)
+         raise ArgumentError, "it doesn't make sense to have a field that " +
+           "is neither indexed nor stored"
+       end
+       if (index == Index::NO && store_term_vector != TermVector::NO)
+         raise ArgumentError, "cannot store term vector information for a " +
+           "field that is not indexed"
+       end
+
+       # The name of the field (e.g., "date", "subject", "title", or "body")
+       @name = name
+
+       # the one and only data object for all different kinds of field values
+       @data = value
+       self.stored = stored
+       self.index = index
+       self.store_term_vector = store_term_vector
+       @binary = binary
+       @boost = boost
+     end
+
+     def stored=(stored)
+       if (stored == Store::YES)
+         @stored = true
+         @compressed = false
+       elsif (stored == Store::COMPRESS)
+         @stored = true
+         @compressed = true
+       elsif (stored == Store::NO)
+         @stored = false
+         @compressed = false
+       else
+         raise "unknown stored parameter " + stored.to_s
+       end
+     end
+
+     def index=(index)
+       if (index == Index::NO)
+         @indexed = false
+         @tokenized = false
+       elsif (index == Index::TOKENIZED)
+         @indexed = true
+         @tokenized = true
+       elsif (index == Index::UNTOKENIZED)
+         @indexed = true
+         @tokenized = false
+       else
+         raise "unknown index parameter " + index.to_s
+       end
+     end
+
+     def store_term_vector=(store_term_vector)
+       if (store_term_vector == TermVector::NO)
+         @store_term_vector = false
+         @store_position = false
+         @store_offset = false
+       elsif (store_term_vector == TermVector::YES)
+         @store_term_vector = true
+         @store_position = false
+         @store_offset = false
+       elsif (store_term_vector == TermVector::WITH_POSITIONS)
+         @store_term_vector = true
+         @store_position = true
+         @store_offset = false
+       elsif (store_term_vector == TermVector::WITH_OFFSETS)
+         @store_term_vector = true
+         @store_position = false
+         @store_offset = true
+       elsif (store_term_vector == TermVector::WITH_POSITIONS_OFFSETS)
+         @store_term_vector = true
+         @store_position = true
+         @store_offset = true
+       else
+         raise "unknown term_vector parameter " + store_term_vector.to_s
+       end
+     end
+
+     # Returns the string value of the data that is stored in this field
+     def string_value
+       if @data.instance_of? String
+         return @data
+       elsif @data.respond_to? :read
+         return @data.read()
+       else
+         # if it is a binary object try to return a string representation
+         return @data.to_s
+       end
+     end
+
+     # If the data is stored as a binary, just return it.
+     def binary_value
+       return @data
+     end
+
+     # Returns a Reader for the data that is stored in this field
+     def reader_value
+       if @data.respond_to? :read
+         return @data
+       elsif @data.instance_of? String
+         return Ferret::Utils::StringHelper::StringReader.new(@data)
+       else
+         # if it is a binary object try to return a string representation
+         return Ferret::Utils::StringHelper::StringReader.new(@data.to_s)
+       end
+     end
+
+     # Create a stored field with a binary value. Optionally the value
+     # may be compressed. But it obviously won't be tokenized or
+     # term vectored or anything like that.
+     #
+     # name::  The name of the field
+     # value:: The binary value
+     # store:: How _value_ should be stored (compressed or not)
+     def Field.new_binary_field(name, value, stored)
+       if (stored == Store::NO)
+         raise ArgumentError, "binary values can't be unstored"
+       end
+       Field.new(name, value, stored, Index::NO, TermVector::NO, true)
+     end
+
+     # Prints a Field for human consumption.
+     def to_s()
+       str = ""
+       if (@stored)
+         str << "stored"
+         str << (@compressed ? "/compressed," : "/uncompressed,")
+       end
+       if (@indexed) then str << "indexed," end
+       if (@tokenized) then str << "tokenized," end
+       if (@store_term_vector) then str << "store_term_vector," end
+       if (@store_offset) then str << "term_vector_offsets," end
+       if (@store_position) then str << "term_vector_position," end
+       if (@binary) then str << "binary," end
+
+       str << '<'
+       str << @name
+       str << ':'
+
+       if (@data != nil)
+         str << @data.to_s
+       end
+
+       str << '>'
+     end
+   end
+ end
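
The Store, Index and TermVector parameters above combine in the Field constructor and in Field.new_binary_field. Here is a minimal sketch of typical combinations, assuming the installed gem's top-level require 'ferret' (lib/ferret.rb, file 18 above) loads this class; the field names, values and the image_bytes variable are hypothetical:

    require 'ferret'
    include Ferret::Document

    # stored and tokenized: short display text such as a title
    title = Field.new("title", "Ferret: a Ruby search library",
                      Field::Store::YES, Field::Index::TOKENIZED)

    # indexed but not stored, with positions and offsets in the term vector
    body = Field.new("body", "the full text goes here",
                     Field::Store::NO, Field::Index::TOKENIZED,
                     Field::TermVector::WITH_POSITIONS_OFFSETS)

    # stored but never indexed: binary data, compressed via Zlib on disk
    image_bytes = File.read("logo.png")   # hypothetical binary payload
    thumb = Field.new_binary_field("thumb", image_bytes, Field::Store::COMPRESS)

    puts title.to_s
    # => stored/uncompressed,indexed,tokenized,<title:Ferret: a Ruby search library>

Note that Field.new("id", "x", Field::Store::NO, Field::Index::NO) would raise an ArgumentError, since a field that is neither stored nor indexed is useless.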
data/lib/ferret/index.rb
@@ -0,0 +1,26 @@
+ require 'ferret/index/index_file_names'
+ require 'ferret/index/term'
+ require 'ferret/index/term_buffer'
+ require 'ferret/index/term_doc_enum'
+ require 'ferret/index/multiple_term_doc_pos_enum'
+ require 'ferret/index/term_enum'
+ require 'ferret/index/term_info'
+ require 'ferret/index/term_infos_io'
+ require 'ferret/index/term_vector_offset_info'
+ require 'ferret/index/term_vectors_io'
+ require 'ferret/index/field_infos'
+ require 'ferret/index/fields_io'
+ require 'ferret/index/compound_file_io'
+ require 'ferret/index/term_buffer'
+ require 'ferret/index/segment_term_enum'
+ require 'ferret/index/segment_term_vector'
+ require 'ferret/index/segment_merge_info'
+ require 'ferret/index/segment_merge_queue'
+ require 'ferret/index/segment_infos'
+ require 'ferret/index/document_writer'
+ require 'ferret/index/index_reader'
+ require 'ferret/index/index_writer'
+ require 'ferret/index/multi_reader'
+ require 'ferret/index/segment_merger'
+ require 'ferret/index/segment_reader'
+ require 'ferret/index/index'
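
This manifest file simply wires up the Ferret::Index namespace in dependency order. A minimal sketch of what that buys you, assuming the gem is installed and loaded through the top-level require 'ferret' (which pulls in ferret/index along the way):

    require 'ferret'

    # the requires above define, among others:
    Ferret::Index::Term                 # term.rb
    Ferret::Index::IndexWriter          # index_writer.rb
    Ferret::Index::CompoundFileReader   # compound_file_io.rb (next hunk)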
data/lib/ferret/index/compound_file_io.rb
@@ -0,0 +1,343 @@
+ require 'monitor'
+ require 'set' # Set is used by CompoundFileWriter below
+
+ module Ferret::Index
+
+   # Class for accessing a compound stream.
+   # This class implements a directory, but is limited to only read
+   # operations. Directory methods that would normally modify data raise
+   # an UnsupportedOperationError.
+   class CompoundFileReader < Ferret::Store::Directory
+
+     include MonitorMixin
+
+     attr_reader :directory, :file_name
+
+     # Creates a Compound File Reader which contains a single file and has
+     # pointers to the individual files within. When it is initialized, the
+     # compound file is set and the header is read so that it is ready to
+     # read the individual files within.
+     def initialize(dir, name)
+
+       super()
+
+       @directory = dir
+       @file_name = name
+       @entries = {}
+
+       success = false
+
+       begin
+         @stream = dir.open_input(name)
+
+         # read the directory and init files
+         count = @stream.read_vint()
+         entry = nil
+         count.times() do
+           offset = @stream.read_long()
+           id = @stream.read_string()
+
+           if (entry != nil)
+             # set length of the previous entry
+             entry.length = offset - entry.offset
+           end
+
+           entry = FileEntry.new(offset)
+           @entries[id] = entry
+         end
+
+         # set the length of the final entry
+         if (entry != nil)
+           entry.length = @stream.length() - entry.offset
+         end
+
+         success = true
+
+       ensure
+
+         if not success and (@stream != nil)
+           begin
+             @stream.close()
+           rescue IOError
+           end
+         end
+       end
+     end
+     def close()
+       synchronize do
+         if (@stream == nil) then raise(IOError, "Already closed") end
+
+         @entries.clear()
+         @stream.close()
+         @stream = nil
+       end
+     end
+
+     def open_input(id)
+       synchronize do
+         if (@stream == nil)
+           raise(IOError, "Stream closed")
+         end
+
+         entry = @entries[id]
+         if (entry == nil)
+           raise(IOError, "No sub-file with id " + id + " found")
+         end
+         return CSIndexInput.new(@stream, entry.offset, entry.length)
+       end
+     end
+
+     # Returns an array of strings, one for each file in the directory.
+     def list()
+       return @entries.keys()
+     end
+
+     # Returns true iff a file with the given name exists.
+     def file_exists(name)
+       return @entries.key?(name)
+     end
+
+     # Returns the time the compound file itself was last modified.
+     def modified(name)
+       return @directory.modified(@file_name)
+     end
+
+     # Set the modified time of the compound file to now.
+     def touch(name)
+       @directory.touch(@file_name)
+     end
+
+     # Not implemented
+     def delete(name) raise(UnsupportedOperationError) end
+
+     # Not implemented
+     def rename(from, to) raise(UnsupportedOperationError) end
+
+     # Returns the length of a file in the directory.
+     def file_length(name)
+       e = @entries[name]
+       if (e == nil) then raise(IOError, "File " + name + " does not exist") end
+       return e.length
+     end
+
+     # Not implemented
+     def create_output(name) raise(UnsupportedOperationError) end
+
+     # Not implemented
+     def make_lock(name) raise(UnsupportedOperationError) end
+
+     # Implementation of an IndexInput that reads from a portion of the
+     # compound file.
+     class CSIndexInput < Ferret::Store::BufferedIndexInput
+       attr_reader :length
+
+       def initialize(base, file_offset, length)
+         super()
+         @base = base
+         @base.extend(MonitorMixin)
+         @file_offset = file_offset
+         @length = length
+       end
+
+       # Closes the stream to further operations.
+       def close() end
+
+       private
+
+       # Expert: implements buffer refill. Reads bytes from the current
+       # position in the input.
+       #
+       # b::      the array to read bytes into
+       # offset:: the offset in the array to start storing bytes
+       # len::    the number of bytes to read
+       def read_internal(b, offset, len)
+         @base.synchronize() do
+           start = pos()
+           if (start + len > @length) then raise(EOFError, "read past EOF") end
+           @base.seek(@file_offset + start)
+           @base.read_bytes(b, offset, len)
+         end
+       end
+
+       # Expert: implements seek. Sets the current position in the file,
+       # where the next read_internal(b, offset, len) will occur.
+       def seek_internal(pos) end
+     end
+
+     private
+
+     # Base info
+     class FileEntry
+       attr_accessor :offset, :length
+       def initialize(offset)
+         @offset = offset
+       end
+     end
+
+   end
+
+   # Combines multiple files into a single compound file.
+   # The file format:
+   #
+   # * VInt fileCount
+   # * {Directory} fileCount entries with the following structure:
+   #   + long data_offset
+   #   + UTFString extension
+   # * {File Data} fileCount entries with the raw data of the
+   #   corresponding file
+   #
+   # The fileCount integer indicates how many files are contained in this
+   # compound file. The {Directory} that follows has that many entries.
+   # Each directory entry contains a long pointer to the start of this
+   # file's data section, and a UTF String with that file's extension.
+   class CompoundFileWriter
+
+     attr_reader :directory, :file_name
+
+     # Create the compound stream in the specified file. The file name is
+     # the entire name (no extensions are added).
+     def initialize(dir, name)
+       @directory = dir
+       @file_name = name
+       @ids = Set.new
+       @file_entries = []
+       @merged = false
+     end
+
+     # Add a source stream. _file_name_ is the string by which the
+     # sub-stream will be known in the compound stream.
+     #
+     # Throws:: IllegalStateError if this writer is closed
+     # Throws:: IllegalArgumentError if a file with the same name
+     #          has been added already
+     def add_file(file_name)
+       if @merged
+         raise(IllegalStateError,
+               "Can't add extensions after merge has been called")
+       end
+
+       if not @ids.add?(file_name)
+         raise(IllegalArgumentError, "File " + file_name + " already added")
+       end
+
+       entry = FileEntry.new(file_name)
+       @file_entries << entry
+     end
+
+     # Merge files with the extensions added up to now.
+     # All files with these extensions are combined sequentially into the
+     # compound stream. After a successful merge, the source files
+     # are deleted.
+     #
+     # Throws:: IllegalStateError if close() had been called before or
+     #          if no file has been added to this object
+     def close()
+
+       if @merged
+         raise(IllegalStateError, "Merge already performed")
+       end
+
+       if @file_entries.empty?
+         raise(IllegalStateError, "No entries to merge have been defined")
+       end
+
+       @merged = true
+
+       # open the compound stream
+       os = nil
+       begin
+         os = @directory.create_output(@file_name)
+
+         # Write the number of entries
+         os.write_vint(@file_entries.size)
+
+         # Write the directory with all offsets at 0.
+         # Remember the positions of directory entries so that we can
+         # adjust the offsets later
+         @file_entries.each do |fe|
+           fe.directory_offset = os.pos()
+           os.write_long(0) # for now
+           os.write_string(fe.file_name)
+         end
+
+         # Open the files and copy their data into the stream.
+         # Remember the locations of each file's data section.
+         @file_entries.each do |fe|
+           fe.data_offset = os.pos()
+           copy_file(fe, os)
+         end
+
+         # Write the data offsets into the directory of the compound stream
+         @file_entries.each do |fe|
+           os.seek(fe.directory_offset)
+           os.write_long(fe.data_offset)
+         end
+
+         # Close the output stream. Set os to nil before trying to
+         # close so that if an exception occurs during the close, the
+         # ensure clause below will not attempt to close the stream
+         # a second time.
+         tmp = os
+         os = nil
+         tmp.close()
+
+       ensure
+         if (os != nil)
+           begin
+             os.close()
+           rescue
+           end
+         end
+       end
+     end
+
+     private
+
+     # Internal class for holding a file
+     class FileEntry
+
+       attr_accessor :file_name, :directory_offset, :data_offset
+
+       def initialize(file_name)
+         @file_name = file_name
+       end
+
+     end
+
+     # Copy the contents of the file with the specified extension into the
+     # provided output stream. Use a buffer for moving data
+     # to reduce memory allocation.
+     def copy_file(source, os)
+       is = nil
+       begin
+         start_ptr = os.pos()
+
+         is = @directory.open_input(source.file_name)
+         remainder = length = is.length
+
+         buffer = Ferret::Store::BUFFER.clone
+         while (remainder > 0)
+           len = [remainder, Ferret::Store::BUFFER_SIZE].min
+           is.read_bytes(buffer, 0, len)
+           os.write_bytes(buffer, len)
+           remainder -= len
+         end
+
+         # Verify that remainder is 0
+         if (remainder != 0)
+           raise(IOError,
+                 "Non-zero remainder length after copying: " + remainder.to_s +
+                 " (id: " + source.file_name + ", length: " + length.to_s +
+                 ", buffer size: " + Ferret::Store::BUFFER_SIZE.to_s + ")")
+         end
+
+         # Verify that the output length diff matches the original file length
+         end_ptr = os.pos()
+         diff = end_ptr - start_ptr
+         if (diff != length)
+           raise(IOError,
+                 "Difference in the output file offsets " + diff.to_s +
+                 " does not match the original file length " + length.to_s)
+         end
+
+       ensure
+         if (is != nil) then is.close() end
+       end
+     end
+   end
+ end
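
To make the format above concrete, here is a minimal sketch of a write/read round trip. It assumes Ferret::Store::FSDirectory.get_directory from fs_store.rb (file 123 above) and two pre-existing segment files in that directory; the names "_1.f1", "_1.f2" and "_1.cfs" are hypothetical:

    require 'ferret'
    include Ferret::Index

    dir = Ferret::Store::FSDirectory.get_directory("/tmp/ferret_example", true)

    # pack the two sub-files into a single compound file "_1.cfs"
    writer = CompoundFileWriter.new(dir, "_1.cfs")
    writer.add_file("_1.f1")
    writer.add_file("_1.f2")
    writer.close()   # writes vint(2), the directory entries, then the raw data

    # the resulting byte layout, per the format comment above:
    #   VInt   2                      fileCount
    #   long   data_offset of _1.f1   patched in after the data is written
    #   UTF    "_1.f1"
    #   long   data_offset of _1.f2
    #   UTF    "_1.f2"
    #   bytes  contents of _1.f1
    #   bytes  contents of _1.f2

    # read one sub-file back out through the directory interface
    reader = CompoundFileReader.new(dir, "_1.cfs")
    reader.list()                  # => ["_1.f1", "_1.f2"] (in hash order)
    sub = reader.open_input("_1.f1")
    # sub is a CSIndexInput; read_bytes, read_vint etc. work as on any input
    reader.close()

Because CSIndexInput only offsets reads into the shared stream, opening a sub-file costs no extra file handle, which is the point of the compound format.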