ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,52 @@
1
module Ferret
  module Index
    # Abstract class for enumerating terms.
    #
    # Term enumerations are always ordered by Term.<=>. Each term in
    # the enumeration is greater than all that precede it.
    class TermEnum
      # Increments the enumeration to the next element. True if one exists.
      def next?
        raise NotImplementedError
      end

      # Returns the current Term in the enumeration.
      def term
        raise NotImplementedError
      end

      # Returns the doc_freq of the current Term in the enumeration.
      def doc_freq
        raise NotImplementedError
      end

      # Closes the enumeration to further activity, freeing resources.
      def close
        raise NotImplementedError
      end

      # Term Vector support
      #
      # Skips terms to the first beyond the current whose value is
      # greater or equal to +target+.
      #
      # Returns true iff there is such a term.
      #
      # This default implementation is a linear scan; subclasses with
      # random access are expected to override it with something
      # considerably more efficient.
      def skip_to(target)
        # BUG FIX: the parameter used to be named +term+, which shadowed
        # the abstract #term accessor while the loop compared against an
        # undefined local +target+ -- every call raised NameError. The
        # parameter is now +target+, so the comparison reads the current
        # term through the #term method as the documentation describes.
        while target > term
          return false unless next?
        end
        true
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module Ferret
  module Index
    # A TermInfo is the record of information stored for a term: its
    # document frequency plus the file pointers into the frequency,
    # proximity and skip data.
    class TermInfo
      attr_accessor :doc_freq, :freq_pointer, :prox_pointer, :skip_offset

      # Build a record; every field defaults to zero.
      def initialize(df = 0, fp = 0, pp = 0, so = 0)
        set_values!(df, fp, pp, so)
      end

      # Copy all four fields from another TermInfo into this one.
      def set!(other)
        set_values!(other.doc_freq, other.freq_pointer,
                    other.prox_pointer, other.skip_offset)
      end

      # Overwrite all four fields at once.
      def set_values!(df = 0, fp = 0, pp = 0, so = 0)
        @doc_freq = df
        @freq_pointer = fp
        @prox_pointer = pp
        @skip_offset = so
      end

      # Return a fresh TermInfo holding the same values as this one.
      def copy_of
        TermInfo.new(@doc_freq, @freq_pointer, @prox_pointer, @skip_offset)
      end

      # Two TermInfos are equal when the other object is also a TermInfo
      # and all four fields match.
      def ==(other)
        other.instance_of?(TermInfo) &&
          @doc_freq == other.doc_freq &&
          @freq_pointer == other.freq_pointer &&
          @prox_pointer == other.prox_pointer &&
          @skip_offset == other.skip_offset
      end
      alias eql? ==

      # Debug-friendly rendering of all four fields.
      def to_s
        "TermInfo:df=#{@doc_freq}:fp=#{@freq_pointer}:pp=#{@prox_pointer}:so=#{@skip_offset}"
      end
    end
  end
end
@@ -0,0 +1,312 @@
1
+ require 'monitor'
2
+ module Ferret::Index
3
+
4
# This stores a monotonically increasing set of <Term, TermInfo> pairs in a
# Directory. A TermInfos can be written once, in order.
#
# Two files are produced per segment: the full dictionary (".tis") and a
# sparse in-RAM index over it (".tii"); each writer holds a back-reference
# to its partner via +other+.
class TermInfosWriter
  attr_reader :index_interval, :skip_interval, :out
  # Back-reference to the paired writer (main <-> index); assigned after
  # construction so the two writers can point at each other.
  attr_writer :other
  # The file format version, a negative number.
  FORMAT = -2

  # TODO: the default values for these two parameters should be settable
  # from IndexWriter. However, once that's done, folks will start setting
  # them to ridiculous values and complaining that things don't work well,
  # as with mergeFactor. So, let's wait until a number of folks find that
  # alternate values work better. Note that both of these values are
  # stored in the segment, so that it's safe to change these w/o
  # rebuilding all indexes.

  # Expert: The fraction of terms in the "dictionary" which should be
  # stored in RAM. Smaller values use more memory, but make searching
  # slightly faster, while larger values use less memory and make
  # searching slightly slower. Searching is typically not dominated by
  # dictionary lookup, so tweaking this is rarely useful.
  #
  # Expert: The fraction of TermDocEnum entries stored in skip
  # tables, used to accellerate TermDocEnum#skipTo(int). Larger
  # values result in smaller indexes, greater acceleration, but fewer
  # accelerable cases, while smaller values result in bigger indexes, less
  # acceleration and more accelerable cases. More detailed experiments
  # would be useful here.
  #
  # dir::      Directory to create the output file in
  # segment::  base name of the segment's files
  # fis::      FieldInfos used to map field names to numbers
  # interval:: every +interval+-th term of the main file is mirrored
  #            into the index file
  # is_index:: true for the internally created ".tii" companion writer
  def initialize(dir, segment, fis, interval, is_index = false)
    @index_interval = interval
    @skip_interval = 16
    @last_index_pointer = 0
    @last_term = Term.new("", "")
    @last_term_info = TermInfo.new()
    @size = 0
    @is_index = is_index
    @field_infos = fis
    @out = dir.create_output(segment + (@is_index ? ".tii" : ".tis"))
    @out.write_int(FORMAT)            # write format
    @out.write_long(0)                # leave space for size
    @out.write_int(@index_interval)   # write @index_interval
    @out.write_int(@skip_interval)    # write @skip_interval
    unless is_index
      # the main writer owns a companion index writer; link them both ways
      @other = TermInfosWriter.new(dir, segment, fis, interval, true)
      @other.other = self
    end
  end

  # Adds a new <Term, TermInfo> pair to the set.
  # Term must be lexicographically greater than all previous Terms added.
  # TermInfo pointers must be positive and greater than all previous.
  #
  # Raises IOError when ordering of terms or pointers is violated.
  def add(term, term_info)
    if (not @is_index and @last_term > term)
      raise IOError, "term out of order #{term.text} < #{@last_term.text}"
    end
    if (term_info.freq_pointer < @last_term_info.freq_pointer)
      raise IOError, "freq pointer out of order"
    end
    if (term_info.prox_pointer < @last_term_info.prox_pointer)
      raise IOError, "prox pointer out of order"
    end

    # every @index_interval-th main-file entry is mirrored into the index
    if (not @is_index and @size % @index_interval == 0)
      @other.add(@last_term, @last_term_info) # add an index term
    end

    write_term(term)                      # write term
    @out.write_vint(term_info.doc_freq)   # write doc freq
    # freq/prox pointers are delta-encoded against the previous entry
    @out.write_vlong(term_info.freq_pointer - @last_term_info.freq_pointer)
    @out.write_vlong(term_info.prox_pointer - @last_term_info.prox_pointer)
    # skip data only exists for terms frequent enough to have a skip table
    @out.write_vint(term_info.skip_offset) if (term_info.doc_freq >= @skip_interval)

    if (@is_index)
      # index entries also record (delta-encoded) where the corresponding
      # main-file entry starts
      @out.write_vlong(@other.out.pos() - @last_index_pointer)
      @last_index_pointer = @other.out.pos() # write pointer
    end

    @last_term_info.set!(term_info)
    @size += 1
  end

  # Called to complete TermInfos creation: back-patches the entry count
  # into the header and closes both files.
  def close()
    @out.seek(4) # write @size after format
    @out.write_long(@size)
    @out.close()

    @other.close() unless @is_index
  end

  private

  # Write one term using shared-prefix compression: only the suffix that
  # differs from the previously written term is emitted.
  def write_term(term)
    start = Ferret::Utils::StringHelper.string_difference(@last_term.text, term.text)
    length = term.text.length() - start

    @out.write_vint(start)                       # write shared prefix length
    @out.write_vint(length)                      # write delta length
    @out.write_chars(term.text, start, length)   # write delta chars
    @out.write_vint(@field_infos.field_number(term.field)) # write field num
    @last_term = term
  end
end
107
+
108
+
109
# This stores a monotonically increasing set of <Term, TermInfo> pairs in a
# Directory. Pairs are accessed either by Term or by ordinal position the
# set.
#
# A per-thread clone of the main segment enumerator is cached in
# thread-local storage (keyed by this reader's object_id) so concurrent
# readers do not share mutable enum state.
class TermInfosReader
  include MonitorMixin

  # dir:: Directory holding the segment files
  # seg:: segment base name (".tis" and ".tii" are appended)
  # fis:: FieldInfos for field-number resolution
  def initialize(dir, seg, fis)
    super() # initialize MonitorMixin

    Thread.current["#{self.object_id}-term_enum"] = nil

    @directory = dir
    @segment = seg
    @field_infos = fis

    @orig_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tis"),
                                     @field_infos, false)
    @size = @orig_enum.size
    @skip_interval = @orig_enum.skip_interval
    @index_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tii"),
                                      @field_infos, true)
    # lazily populated (in lock step) by ensure_index_is_read
    @index_terms = nil
    @index_infos = nil
    @index_pointers = nil
  end

  # Release the underlying enumerators.
  # NOTE(review): only the *current* thread's cached enum reference is
  # cleared here; clones cached by other threads are left to GC.
  def close()
    # clear this threads cache
    Thread.current["#{self.object_id}-term_enum"] = nil

    @orig_enum.close() if (@orig_enum != nil)
    @index_enum.close() if (@index_enum != nil)
  end

  # Returns the number of term/value pairs in the set.
  attr_reader :size
  # The skip interval for the original enumerator
  attr_reader :skip_interval


  # Returns the TermInfo for a Term in the set, or nil.
  def get_term_info(term)
    return nil if (@size == 0)

    ensure_index_is_read()

    # optimize sequential access: first try scanning cached enum w/o seeking
    e = enum()
    if e.term and term >= e.term
      enum_offset = (e.position / e.index_interval).to_i + 1
      if (@index_terms.length == enum_offset or
          term < @index_terms[enum_offset]) # but before end of block
        return scan_for_term_info(term)     # no need to seek
      end
    end

    # random-access: must seek
    seek_enum(get_index_offset(term))
    return scan_for_term_info(term)
  end
  alias :[] :get_term_info

  # Returns the nth term in the set, or nil if the set is empty or
  # +position+ lies beyond the last term.
  def get_term(position)
    return nil if (@size == 0)

    e = enum()
    # reuse the cached enum when the target is within the current block
    if (e != nil and
        e.term != nil and
        position >= e.position and
        position < (e.position + e.index_interval))
      return scan_for_term(position)             # can avoid seek
    end

    seek_enum((position / e.index_interval).to_i) # must seek
    return scan_for_term(position)
  end

  # Returns the ordinal position of +term+, -1 when absent, or nil for an
  # empty set.
  def get_terms_position(term)
    return nil if (@size == 0)
    ensure_index_is_read
    seek_enum(get_index_offset(term))

    e = enum()

    # advance until we reach or pass the target term
    while term > e.term and e.next?
    end

    return term == e.term ? e.position : -1
  end

  # Returns an enumeration of all the Terms and TermInfos in the set.
  def terms()
    return @orig_enum.clone()
  end

  # Returns an enumeration of terms starting at or after the named term.
  def terms_from(term)
    get_term_info(term) # positions the cached enum at/after +term+
    return enum().clone()
  end

  private

  # Fetch (or lazily create) this thread's private clone of the segment
  # enumerator.
  def enum()
    term_enum = Thread.current["#{self.object_id}-term_enum"]
    if (term_enum == nil)
      term_enum = terms()
      # NOTE(review): @xterm_enum looks vestigial -- it is assigned here
      # but never read anywhere in this class; presumably it only pins the
      # clone against GC. Verify before removing.
      @xterm_enum = Thread.current["#{self.object_id}-term_enum"] = term_enum
    end
    return term_enum
  end

  # Load the ".tii" index into the three parallel arrays exactly once;
  # synchronized so concurrent first readers do not race. The index enum
  # is closed (and nil-ed) afterwards, even on failure.
  def ensure_index_is_read()
    synchronize() do
      return if @index_terms
      begin
        index_size = @index_enum.size

        @index_terms = Array.new(index_size)
        @index_infos = Array.new(index_size)
        @index_pointers = Array.new(index_size)

        i = 0
        while @index_enum.next?
          @index_terms[i] = @index_enum.term
          @index_infos[i] = @index_enum.term_info
          @index_pointers[i] = @index_enum.index_pointer
          i += 1
        end
      ensure
        @index_enum.close()
        @index_enum = nil
      end
    end
  end

  # Returns the offset of the greatest index entry which is less than or
  # equal to term.
  def get_index_offset(term)
    lo = 0                         # binary search @index_terms[]
    hi = @index_terms.length - 1

    while (hi >= lo)
      mid = (lo + hi) >> 1
      delta = term <=> @index_terms[mid]
      if (delta < 0)
        hi = mid - 1
      elsif (delta > 0)
        lo = mid + 1
      else
        return mid
      end
    end
    # on miss, hi is the last entry <= term (may be -1 when term sorts
    # before every index entry)
    return hi
  end

  # Position this thread's enum at the start of the index block
  # +ind_offset+.
  def seek_enum(ind_offset)
    enum().seek(@index_pointers[ind_offset],
                (ind_offset * enum().index_interval) - 1,
                @index_terms[ind_offset],
                @index_infos[ind_offset])
  end

  # Scans within block for matching term; returns its TermInfo or nil.
  def scan_for_term_info(term)
    e = enum()
    e.scan_to(term)
    if e.term != nil and term == e.term
      return e.term_info()
    else
      return nil
    end
  end

  # Advance the enum to ordinal +position+; nil when the enum is exhausted
  # first.
  def scan_for_term(position)
    e = enum()
    while (e.position < position)
      return nil if not e.next?
    end

    return e.term
  end

  # Returns the position of a Term in the set or -1.
  # NOTE(review): appears unused within this class (get_terms_position is
  # the public equivalent); kept for compatibility.
  def get_position(term)
    return -1 if (@size == 0)

    ind_offset = get_index_offset(term)
    seek_enum(ind_offset)

    e = enum()
    while (term > e.term and e.next?)
    end

    if (term == e.term())
      return e.position
    else
      return -1
    end
  end

end
312
+ end
@@ -0,0 +1,20 @@
1
module Ferret
  module Index
    # Records the character offsets of one term occurrence inside a field,
    # as stored in a term vector.
    class TermVectorOffsetInfo
      attr_accessor :start_offset, :end_offset

      # start_offset:: index of the occurrence's first character
      # end_offset::   index just past the occurrence's last character
      def initialize(start_offset, end_offset)
        @start_offset = start_offset
        @end_offset = end_offset
      end

      # Equal when the other object is also a TermVectorOffsetInfo
      # covering exactly the same character span.
      def eql?(other)
        other.instance_of?(TermVectorOffsetInfo) &&
          @start_offset == other.start_offset &&
          @end_offset == other.end_offset
      end
      alias :== :eql?

      # Fold both offsets into one value so equal objects hash alike
      # (29 is just a small prime multiplier).
      def hash
        29 * @start_offset + @end_offset
      end
    end
  end
end
@@ -0,0 +1,552 @@
1
+ module Ferret::Index
2
# Writer works by opening a document and then opening the fields within
# the document and then writing out the vectors for each field.
#
# Rough usage:
#
#   for each document
#     writer.open_document()
#     for each field on the document
#       writer.open_field(field)
#       for all of the terms
#         writer.add_term(...)
#       end
#       writer.close_field
#     end
#     writer.close_document()
#   end
class TermVectorsWriter
  # Per-field flag bits recording which extras accompany the terms.
  STORE_POSITIONS_WITH_TERMVECTOR = 0x1
  STORE_OFFSET_WITH_TERMVECTOR = 0x2

  FORMAT_VERSION = 2

  # The size in bytes that the FORMAT_VERSION will take up at the beginning
  # of each file
  FORMAT_SIZE = 4

  TVX_EXTENSION = ".tvx" # document index file
  TVD_EXTENSION = ".tvd" # document data file
  TVF_EXTENSION = ".tvf" # field data file

  # directory::   store in which to create the three term-vector files
  # segment::     base name for the files
  # field_infos:: used to map field names to field numbers
  def initialize(directory, segment, field_infos)
    @current_field = nil
    @current_doc_pointer = -1 # -1 means "no document open"

    # Open files for TermVector storage
    @tvx = directory.create_output(segment + TVX_EXTENSION)
    @tvx.write_int(FORMAT_VERSION)
    @tvd = directory.create_output(segment + TVD_EXTENSION)
    @tvd.write_int(FORMAT_VERSION)
    @tvf = directory.create_output(segment + TVF_EXTENSION)
    @tvf.write_int(FORMAT_VERSION)

    @field_infos = field_infos
    @fields = [] # TVFields written for the current document
    @terms = []  # TVTerms collected for the current field
  end


  # Begin a new document record. Any previously open document is flushed
  # first via close_document.
  def open_document()
    close_document()
    @current_doc_pointer = @tvd.pos()
  end


  # Flush the current document (closing any open field first) and reset
  # state. A no-op when no document is open.
  def close_document()

    if (document_open?())
      close_field()
      write_doc()
      @fields.clear()
      @current_doc_pointer = -1
    end
  end


  # True while a document record is being built.
  def document_open?()
    return @current_doc_pointer != -1
  end


  # Start processing a field. This can be followed by a number of calls to
  # add_term, and a final call to close_field to indicate the end of
  # processing of this field. If a field was previously open, it is closed
  # automatically.
  def open_field(field)
    field_info = @field_infos[field]
    create_field(field_info.number,
                 field_info.store_positions?,
                 field_info.store_offsets?)
  end

  # Finished processing current field. This should be followed by a call
  # to open_field before future calls to add_term.
  def close_field()
    if field_open?
      # save field and terms collected so far, then reset for the next field
      write_field()
      @fields << @current_field
      @terms.clear()
      @current_field = nil
    end
  end

  # Return true if a field is currently open.
  def field_open?()
    return @current_field != nil
  end

  # Add term to the field's term vector. Field must already be open.
  #
  # Terms should be added in increasing order of terms, one call per
  # unique termNum. ProxPointer is a pointer into the TermPosition file
  # (prx). Freq is the number of times this term appears in this field, in
  # this document. raises:: IllegalStateError if document or field is
  # not open
  def add_term(term_text, freq, positions = nil, offsets = nil)
    if not document_open?
      raise IllegalStateError, "Cannot add terms when document is not open"
    end
    if not field_open?
      raise IllegalStateError, "Cannot add terms when field is not open"
    end

    add_term_internal(term_text, freq, positions, offsets)
  end

  # Unchecked variant of add_term, used internally once the open state has
  # already been validated.
  def add_term_internal(term_text, freq, positions, offsets)
    @terms << TVTerm.new(term_text, freq, positions, offsets)
  end

  # Add a complete document specified by all its term vectors. If document has no
  # term vectors, add value for @tvx.
  #
  # vectors:: The documents to have their term vectors added
  # raises:: IOException
  def add_all_doc_vectors(vectors)

    open_document()

    if vectors != nil
      vectors.each do |vector|
        # positions/offsets are only stored when the vector actually has them
        store_positions = (vector.size > 0 and vector.positions != nil)
        store_offsets = (vector.size > 0 and vector.offsets != nil)

        create_field(@field_infos.field_number(vector.field),
                     store_positions, store_offsets)

        vector.size.times do |j|
          add_term_internal(vector.terms[j],
                            vector.term_frequencies[j],
                            store_positions ? vector.positions[j] : nil,
                            store_offsets ? vector.offsets[j] : nil)
        end
        close_field()
      end
    end
    close_document()
  end

  # Close all streams.
  def close()
    begin
      close_document()
    ensure
      # make an effort to close all streams we can but remember and re-raise
      # the last exception encountered in this process
      keep = nil
      [@tvx, @tvd, @tvf].compact.each do |os|
        begin
          os.close()
        rescue IOError => e
          keep = e
        end
      end
      raise keep if (keep != nil)
    end
  end

  # Per-field bookkeeping: field number, pointer into the .tvf file, and
  # which extras (positions/offsets) are stored for it.
  class TVField
    attr_accessor :number, :tvf_pointer, :store_positions, :store_offsets
    def initialize(number, store_pos, store_off)
      @tvf_pointer = 0
      @number = number
      @store_positions = store_pos
      @store_offsets = store_off
    end
  end

  # One term's worth of vector data: text, frequency and the optional
  # position/offset arrays.
  class TVTerm
    attr_accessor :term_text, :freq, :positions, :offsets

    def initialize(term_text=nil, freq=nil, positions=nil, offsets=nil)
      @term_text = term_text
      @freq = freq
      @positions = positions
      @offsets = offsets
    end
  end

  private

  # Serialize the current field's terms to the .tvf stream: shared-prefix
  # compression for term text, delta encoding for positions and offsets.
  def write_field()
    # remember where this field is written
    @current_field.tvf_pointer = @tvf.pos

    size = @terms.size
    @tvf.write_vint(size)

    store_positions = @current_field.store_positions
    store_offsets = @current_field.store_offsets
    bits = 0x0
    if (store_positions)
      bits |= STORE_POSITIONS_WITH_TERMVECTOR
    end
    if (store_offsets)
      bits |= STORE_OFFSET_WITH_TERMVECTOR
    end
    @tvf.write_byte(bits)

    last_term_text = ""
    @terms.each do |term|
      start = Ferret::Utils::StringHelper.string_difference(last_term_text,
                                                            term.term_text)
      length = term.term_text.length() - start
      @tvf.write_vint(start)                          # write shared prefix length
      @tvf.write_vint(length)                         # write delta length
      @tvf.write_chars(term.term_text, start, length) # write delta chars
      @tvf.write_vint(term.freq)
      last_term_text = term.term_text

      if (store_positions)
        if (term.positions == nil)
          raise IllegalStateError, "Trying to write positions that are nil!"
        end

        # use delta encoding for positions
        position = 0
        term.freq.times do |j|
          @tvf.write_vint(term.positions[j] - position)
          position = term.positions[j]
        end
      end

      if (store_offsets)
        if(term.offsets == nil)
          raise IllegalStateError, "Trying to write offsets that are nil!"
        end

        # use delta encoding for offsets
        position = 0
        term.freq.times do |j|
          @tvf.write_vint(term.offsets[j].start_offset - position)
          # Save the diff between the two.
          @tvf.write_vint(term.offsets[j].end_offset -
                          term.offsets[j].start_offset)
          position = term.offsets[j].end_offset()
        end
      end
    end
  end

  # Write the index entry (.tvx) and document record (.tvd) for the
  # document currently being closed. Requires every field to be closed.
  def write_doc()
    if field_open?
      raise IllegalStateError, "Field is still open while writing document"
    end
    # write document index record
    @tvx.write_long(@current_doc_pointer)

    # write document data record
    size = @fields.size

    # write the number of fields
    @tvd.write_vint(size)

    # write field numbers
    @fields.each { |field| @tvd.write_vint(field.number) }

    # write field pointers, delta-encoded against the previous field
    last_field_pointer = 0
    @fields.each do |field|
      @tvd.write_vlong(field.tvf_pointer - last_field_pointer)
      last_field_pointer = field.tvf_pointer
    end
  end

  # Open a new field for writing, closing any previously open field.
  # raises:: IllegalStateError when no document is open.
  def create_field(field_number, store_position, store_offset)
    if not document_open?
      raise IllegalStateError, "Cannot open field when no document is open."
    end
    close_field()
    @current_field = TVField.new(field_number, store_position, store_offset)
  end
end
294
+
295
# TermVectorsReader reads the term vectors of a segment back out of the
# three files produced by TermVectorsWriter:
#
# * <segment>.tvx - document index: one 8 byte pointer per document into .tvd
# * <segment>.tvd - per-document data: field numbers and pointers into .tvf
# * <segment>.tvf - the per-field term vector data itself
class TermVectorsReader
  attr_reader :size

  # accessors for clone method
  attr_accessor :tvx, :tvd, :tvf
  protected :tvx, :tvx=, :tvd, :tvd=, :tvf, :tvf=

  # d:: directory holding the segment's files
  # segment:: name (file-name prefix) of the segment to read
  # field_infos:: FieldInfos mapping field names to field numbers
  #
  # If the segment has no term-vector index (.tvx) file at all, the three
  # streams are left nil and every vector lookup returns nil.
  def initialize(d, segment, field_infos)
    if (d.exists?(segment + TermVectorsWriter::TVX_EXTENSION))
      @tvx = d.open_input(segment + TermVectorsWriter::TVX_EXTENSION)
      check_valid_format(@tvx)
      @tvd = d.open_input(segment + TermVectorsWriter::TVD_EXTENSION)
      @tvd_format = check_valid_format(@tvd)
      @tvf = d.open_input(segment + TermVectorsWriter::TVF_EXTENSION)
      @tvf_format = check_valid_format(@tvf)
      # the index stores one 8 byte long pointer per document
      @size = @tvx.length / 8
    else
      @tvx = nil
      @tvd = nil
      @tvf = nil
    end

    @field_infos = field_infos
  end

  # Close all three streams. Makes an effort to close every stream even if
  # one raises, then re-raises the last IOError encountered in the process.
  def close()
    keep = nil
    [@tvx, @tvd, @tvf].compact.each do |os|
      begin
        os.close()
      rescue IOError => e
        keep = e
      end
    end
    raise keep if (keep != nil)
  end

  # Retrieve the term vector for the given document and field
  # doc_num:: The document number to retrieve the vector for
  # field:: The field within the document to retrieve
  # returns:: The TermFreqVector for the document and field or nil if there
  #           is no termVector for this field.
  # raises:: IOException if there is an error reading the term vector files
  def get_field_tv(doc_num, field)
    # Check if no term vectors are available for this segment at all
    field_number = @field_infos.field_number(field)
    result = nil
    if (@tvx != nil)
      # We need to account for the FORMAT_SIZE when seeking in @tvx. We
      # don't need to do this in other seeks because those use file
      # pointers that were written into another file and therefore
      # already include the offset.
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()
      # There are only a few fields per document. We opt for a full scan
      # rather then requiring that they be ordered. We need to read through
      # all of the fields anyway to get to the tvf pointers.
      number = 0
      found = -1
      field_count.times do |i|
        if @tvd_format == TermVectorsWriter::FORMAT_VERSION
          number = @tvd.read_vint()
        else
          # older formats delta-encode the field numbers
          number += @tvd.read_vint()
        end
        if (number == field_number)
          found = i
        end
      end

      # found == -1 means this field, although valid in the segment, was
      # not found in this document
      if (found != -1)
        # Compute the position in the @tvf file by summing the
        # delta-encoded pointers up to and including the found field
        position = 0
        (found + 1).times do
          position += @tvd.read_vlong()
        end

        result = read_term_vector(field, position)
      end
    end
    return result
  end

  # Return all term vectors stored for this document or nil if it could
  # not be read in.
  #
  # doc_num:: The document number to retrieve the vector for
  # returns:: All term frequency vectors
  # raises:: IOException if there is an error reading the term vector files
  def get_tv(doc_num)
    result = nil
    # Check if no term vectors are available for this segment at all
    if (@tvx != nil)
      # we need to offset by the format header at the head of the index
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()

      # field_count == 0 means no fields are vectorized for this document
      if (field_count != 0)
        number = 0
        fields = Array.new(field_count)

        field_count.times do |i|
          if @tvd_format == TermVectorsWriter::FORMAT_VERSION
            number = @tvd.read_vint()
          else
            number += @tvd.read_vint()
          end

          fields[i] = @field_infos[number].name
        end

        # Compute the position in the @tvf file for each field; the
        # pointers are stored as deltas from the previous pointer
        position = 0
        tvf_pointers = Array.new(field_count)
        field_count.times do |i|
          position += @tvd.read_vlong()
          tvf_pointers[i] = position
        end

        result = read_term_vectors(fields, tvf_pointers)
      end
    end
    return result
  end

  # Return a copy of this reader with independently cloned input streams
  # (so the copy can seek without disturbing the original), or nil if any
  # of the streams is missing.
  def clone()
    if (@tvx == nil or @tvd == nil or @tvf == nil)
      return nil
    end

    # BUG FIX: the original code assigned `clone = self`, which returned
    # the receiver itself (and overwrote its own streams) rather than a
    # copy. `super` invokes Object#clone to get a real shallow copy first.
    clone = super
    clone.tvx = @tvx.clone()
    clone.tvd = @tvd.clone()
    clone.tvf = @tvf.clone()

    return clone
  end

  private

  # Read one term vector per entry in +fields+, using the matching
  # pointer from +tvf_pointers+.
  def read_term_vectors(fields, tvf_pointers)
    res = Array.new(fields.length)
    fields.length.times do |i|
      res[i] = read_term_vector(fields[i], tvf_pointers[i])
    end
    return res
  end

  # field:: The field to read in
  # tvf_pointer:: The pointer within the @tvf file where we should start reading
  # returns:: The TermVector located at that position
  # raises:: IOException
  def read_term_vector(field, tvf_pointer)
    # Now read the data from the specified position. We don't need to
    # offset by the FORMAT here since the pointer already includes it.
    @tvf.seek(tvf_pointer)

    num_terms = @tvf.read_vint()
    # If no terms - return a constant empty termvector. However, this should
    # never occur!
    if (num_terms == 0)
      return SegmentTermVector.new(field, nil, nil)
    end

    if (@tvf_format == TermVectorsWriter::FORMAT_VERSION)
      bits = @tvf.read_byte()
      store_positions = (bits & TermVectorsWriter::STORE_POSITIONS_WITH_TERMVECTOR) != 0
      store_offsets = (bits & TermVectorsWriter::STORE_OFFSET_WITH_TERMVECTOR) != 0
    else
      # the old format stored (and we skip) an extra vint here
      @tvf.read_vint()
      store_positions = false
      store_offsets = false
    end

    terms = Array.new(num_terms)
    term_freqs = Array.new(num_terms)

    # only allocated when the matching data was actually stored
    positions = store_positions ? Array.new(num_terms) : nil
    offsets = store_offsets ? Array.new(num_terms) : nil

    # Terms are stored front-coded: each entry records the length of the
    # prefix shared with the previous term (start) and the length of the
    # new suffix (delta_length); the shared buffer carries the prefix over.
    buffer = ""

    num_terms.times do |i|
      start = @tvf.read_vint()
      delta_length = @tvf.read_vint()
      total_length = start + delta_length
      @tvf.read_chars(buffer, start, delta_length)
      terms[i] = buffer[0, total_length].to_s
      freq = @tvf.read_vint()
      term_freqs[i] = freq

      if (store_positions) # read in the delta-encoded positions
        pos = Array.new(freq)
        positions[i] = pos
        prev_position = 0
        freq.times do |j|
          pos[j] = prev_position + @tvf.read_vint()
          prev_position = pos[j]
        end
      end

      if (store_offsets) # read in the delta-encoded offsets
        offs = Array.new(freq)
        offsets[i] = offs
        prev_offset = 0
        freq.times do |j|
          start_offset = prev_offset + @tvf.read_vint()
          end_offset = start_offset + @tvf.read_vint()
          offs[j] = TermVectorOffsetInfo.new(start_offset, end_offset)
          prev_offset = end_offset
        end
      end
    end

    SegmentTermVector.new(field, terms, term_freqs, positions, offsets)
  end

  # Read and return the format version number from the head of +istream+,
  # raising IOError when it is newer than this reader understands.
  def check_valid_format(istream)
    format = istream.read_int()
    if (format > TermVectorsWriter::FORMAT_VERSION)
      raise IOError, "Incompatible format version: #{format} expected #{TermVectorsWriter::FORMAT_VERSION} or less"
    end
    return format
  end

end
552
+ end