ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
module Ferret
  module Index
    # Abstract class for enumerating terms.
    #
    # Term enumerations are always ordered by Term.<=>. Each term in
    # the enumeration is greater than all that precede it.
    class TermEnum
      # Increments the enumeration to the next element. True if one exists.
      def next?
        raise NotImplementedError
      end

      # Returns the current Term in the enumeration.
      def term
        raise NotImplementedError
      end

      # Returns the doc_freq of the current Term in the enumeration.
      def doc_freq
        raise NotImplementedError
      end

      # Closes the enumeration to further activity, freeing resources.
      def close
        raise NotImplementedError
      end

      # Term Vector support
      # Skips terms to the first beyond the current whose value is
      # greater or equal to _target_.
      #
      # Returns true iff there is such a term.
      #
      # Some implementations are considerably more efficient than this
      # linear default.
      #
      # NOTE: the parameter was previously named +term+, which shadowed the
      # +term+ method while the loop body referenced an undefined local
      # +target+ — calling this method raised NameError. The parameter is
      # now +target+ so the loop compares it against the current term.
      def skip_to(target)
        while target > term
          return false unless next?
        end
        true
      end
    end
  end
end
module Ferret
  module Index
    # A TermInfo is the record of information stored for a term: its
    # document frequency and the file pointers (into the frequency and
    # proximity streams) plus the skip-data offset for frequent terms.
    class TermInfo
      attr_accessor :doc_freq, :freq_pointer, :prox_pointer, :skip_offset

      # All statistics default to zero.
      def initialize(df = 0, fp = 0, pp = 0, so = 0)
        set_values!(df, fp, pp, so)
      end

      # Copies all four fields from another TermInfo into this one.
      def set!(ti)
        @doc_freq = ti.doc_freq
        @freq_pointer = ti.freq_pointer
        @prox_pointer = ti.prox_pointer
        @skip_offset = ti.skip_offset
      end

      # Resets the fields to the given values (unspecified ones become 0).
      def set_values!(df = 0, fp = 0, pp = 0, so = 0)
        @doc_freq = df
        @freq_pointer = fp
        @prox_pointer = pp
        @skip_offset = so
      end

      # Returns a new TermInfo with the same field values.
      def copy_of()
        TermInfo.new(doc_freq, freq_pointer, prox_pointer, skip_offset)
      end

      # Two TermInfos are equal iff both are TermInfo instances and all
      # four fields match.
      def ==(o)
        return false if !o.instance_of?(TermInfo)
        @doc_freq == o.doc_freq &&
          @freq_pointer == o.freq_pointer &&
          @prox_pointer == o.prox_pointer &&
          @skip_offset == o.skip_offset
      end
      alias eql? ==

      # FIX: eql? was aliased to == without a matching #hash, so equal
      # TermInfos could land in different Hash/Set buckets. Hash on the
      # same four fields that equality compares.
      def hash()
        [@doc_freq, @freq_pointer, @prox_pointer, @skip_offset].hash
      end

      def to_s()
        "TermInfo:df=#{@doc_freq}:fp=#{@freq_pointer}:pp=#{@prox_pointer}:so=#{@skip_offset}"
      end
    end
  end
end
require 'monitor'
module Ferret::Index

  # This stores a monotonically increasing set of <Term, TermInfo> pairs in a
  # Directory. A TermInfos can be written once, in order.
  #
  # Two files are produced: the full term dictionary (".tis") and, via a
  # linked companion TermInfosWriter (@other), a sparse index into it
  # (".tii") that records every @index_interval'th term. The two writers
  # hold back-references to each other.
  class TermInfosWriter
    attr_reader :index_interval, :skip_interval, :out
    attr_writer :other
    # The file format version, a negative number.
    FORMAT = -2


    # TODO: the default values for these two parameters should be settable
    # from IndexWriter. However, once that's done, folks will start setting
    # them to ridiculous values and complaining that things don't work well,
    # as with mergeFactor. So, let's wait until a number of folks find that
    # alternate values work better. Note that both of these values are
    # stored in the segment, so that it's safe to change these w/o
    # rebuilding all indexes.

    # Expert: The fraction of terms in the "dictionary" which should be
    # stored in RAM. Smaller values use more memory, but make searching
    # slightly faster, while larger values use less memory and make
    # searching slightly slower. Searching is typically not dominated by
    # dictionary lookup, so tweaking this is rarely useful.
    #
    # Expert: The fraction of TermDocEnum entries stored in skip
    # tables, used to accelerate TermDocEnum#skipTo(int). Larger
    # values result in smaller indexes, greater acceleration, but fewer
    # accelerable cases, while smaller values result in bigger indexes, less
    # acceleration and more accelerable cases. More detailed experiments
    # would be useful here.
    #
    # dir::      the Directory to create the dictionary file in
    # segment::  segment name, used as the file-name prefix
    # fis::      the FieldInfos used to map field names to numbers
    # interval:: how often a term is mirrored into the ".tii" index
    # is_index:: true only for the internal ".tii" companion writer
    def initialize(dir, segment, fis, interval, is_index = false)
      @index_interval = interval
      @skip_interval = 16
      @last_index_pointer = 0
      @last_term = Term.new("", "")
      @last_term_info = TermInfo.new()
      @size = 0
      @is_index = is_index
      @field_infos = fis
      @out = dir.create_output(segment + (@is_index ? ".tii" : ".tis"))
      @out.write_int(FORMAT)          # write format
      @out.write_long(0)              # leave space for size (patched in close())
      @out.write_int(@index_interval) # write @index_interval
      @out.write_int(@skip_interval)  # write @skip_interval
      # the primary (.tis) writer creates and links its .tii companion
      unless is_index
        @other = TermInfosWriter.new(dir, segment, fis, interval, true)
        @other.other = self
      end
    end

    # Adds a new <Term, TermInfo> pair to the set.
    # Term must be lexicographically greater than all previous Terms added.
    # TermInfo pointers must be positive and greater than all previous.
    #
    # raises:: IOError if the term or pointer ordering is violated
    def add(term, term_info)
      if (not @is_index and @last_term > term)
        raise IOError, "term out of order #{term.text} < #{@last_term.text}"
      end
      if (term_info.freq_pointer < @last_term_info.freq_pointer)
        raise IOError, "freq pointer out of order"
      end
      if (term_info.prox_pointer < @last_term_info.prox_pointer)
        raise IOError, "prox pointer out of order"
      end

      # mirror every @index_interval'th term into the .tii companion
      if (not @is_index and @size % @index_interval == 0)
        @other.add(@last_term, @last_term_info) # add an index term
      end

      write_term(term)                       # write term
      @out.write_vint(term_info.doc_freq)    # write doc freq
      # pointers are delta-encoded against the previously written entry
      @out.write_vlong(term_info.freq_pointer - @last_term_info.freq_pointer)
      @out.write_vlong(term_info.prox_pointer - @last_term_info.prox_pointer)
      # skip data only exists for terms frequent enough to have a skip list
      @out.write_vint(term_info.skip_offset) if (term_info.doc_freq >= @skip_interval)

      if (@is_index)
        # index entries also record a delta-encoded pointer into the .tis file
        @out.write_vlong(@other.out.pos() - @last_index_pointer)
        @last_index_pointer = @other.out.pos() # write pointer
      end

      @last_term_info.set!(term_info)
      @size += 1
    end

    # Called to complete TermInfos creation: back-patches the entry count
    # into the header, closes this stream and the .tii companion.
    def close()
      @out.seek(4) # write @size after format
      @out.write_long(@size)
      @out.close()

      @other.close() unless @is_index
    end

    private
    # Writes one term with prefix compression: the length of the prefix
    # shared with the previously written term, the remaining suffix, and
    # the field number.
    def write_term(term)
      start = Ferret::Utils::StringHelper.string_difference(@last_term.text, term.text)
      length = term.text.length() - start

      @out.write_vint(start)                     # write shared prefix length
      @out.write_vint(length)                    # write delta length
      @out.write_chars(term.text, start, length) # write delta chars
      @out.write_vint(@field_infos.field_number(term.field)) # write field num
      @last_term = term
    end
  end

  # This stores a monotonically increasing set of <Term, TermInfo> pairs in a
  # Directory. Pairs are accessed either by Term or by ordinal position in
  # the set.
  #
  # The in-RAM term index (.tii) is loaded lazily into three parallel
  # arrays (@index_terms / @index_infos / @index_pointers). A
  # SegmentTermEnum over the main dictionary is cached per thread so that
  # sequential lookups can scan forward without seeking.
  class TermInfosReader
    include MonitorMixin

    # dir:: the Directory holding the segment files
    # seg:: the segment name (file-name prefix for ".tis"/".tii")
    # fis:: the FieldInfos for this segment
    def initialize(dir, seg, fis)
      super() # set up MonitorMixin's lock

      # reset this thread's cached enumerator
      Thread.current["#{self.object_id}-term_enum"] = nil

      @directory = dir
      @segment = seg
      @field_infos = fis

      @orig_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tis"),
                                       @field_infos, false)
      @size = @orig_enum.size
      @skip_interval = @orig_enum.skip_interval
      @index_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tii"),
                                        @field_infos, true)
      # populated lazily by ensure_index_is_read()
      @index_terms = nil
      @index_infos = nil
      @index_pointers = nil
    end

    def close()
      # clear this threads cache
      Thread.current["#{self.object_id}-term_enum"] = nil

      @orig_enum.close() if (@orig_enum != nil)
      @index_enum.close() if (@index_enum != nil)
    end

    # Returns the number of term/value pairs in the set.
    attr_reader :size
    # The skip interval for the original enumerator
    attr_reader :skip_interval


    # Returns the TermInfo for a Term in the set, or nil.
    def get_term_info(term)
      return nil if (@size == 0)

      ensure_index_is_read()

      # optimize sequential access: first try scanning cached enum w/o seeking
      e = enum()
      if e.term and term >= e.term
        enum_offset = (e.position / e.index_interval).to_i + 1
        if (@index_terms.length == enum_offset or
            term < @index_terms[enum_offset]) # but before end of block
          return scan_for_term_info(term) # no need to seek
        end
      end

      # random-access: must seek
      seek_enum(get_index_offset(term))
      return scan_for_term_info(term)
    end
    alias :[] :get_term_info

    # Returns the nth term in the set.
    def get_term(position)
      return nil if (@size == 0)

      # if the cached enum is already within the same index block, scan
      e = enum()
      if (e != nil and
          e.term != nil and
          position >= e.position and
          position < (e.position + e.index_interval))
        return scan_for_term(position) # can avoid seek
      end

      seek_enum((position / e.index_interval).to_i) # must seek
      return scan_for_term(position)
    end

    # Returns the ordinal position of +term+ in the set, -1 if it is not
    # present, or nil for an empty set.
    def get_terms_position(term)
      return nil if (@size == 0)
      ensure_index_is_read
      seek_enum(get_index_offset(term))

      e = enum()

      # advance until we reach or pass the target term
      while term > e.term and e.next?
      end

      return term == e.term ? e.position : -1
    end

    # Returns an enumeration of all the Terms and TermInfos in the set.
    def terms()
      return @orig_enum.clone()
    end

    # Returns an enumeration of terms starting at or after the named term.
    def terms_from(term)
      get_term_info(term) # positions the cached enum at or after +term+
      return enum().clone()
    end

    private

    # Returns this thread's cached SegmentTermEnum, creating it on first use.
    def enum()
      term_enum = Thread.current["#{self.object_id}-term_enum"]
      if (term_enum == nil)
        term_enum = terms()
        # NOTE(review): @xterm_enum only retains a reference to the most
        # recently created thread-local enum; its purpose is unclear from
        # this file — confirm before removing.
        @xterm_enum = Thread.current["#{self.object_id}-term_enum"] = term_enum
      end
      return term_enum
    end

    # Loads the .tii index into the three parallel arrays. Synchronized and
    # idempotent: a no-op after the first successful load. The index enum
    # is closed and discarded once consumed.
    def ensure_index_is_read()
      synchronize() do
        return if @index_terms
        begin
          index_size = @index_enum.size

          @index_terms = Array.new(index_size)
          @index_infos = Array.new(index_size)
          @index_pointers = Array.new(index_size)

          i = 0
          while @index_enum.next?
            @index_terms[i] = @index_enum.term
            @index_infos[i] = @index_enum.term_info
            @index_pointers[i] = @index_enum.index_pointer
            i += 1
          end
        ensure
          @index_enum.close()
          @index_enum = nil
        end
      end
    end

    # Returns the offset of the greatest index entry which is less than or
    # equal to term.
    def get_index_offset(term)
      lo = 0 # binary search @index_terms[]
      hi = @index_terms.length - 1

      while (hi >= lo)
        mid = (lo + hi) >> 1
        delta = term <=> @index_terms[mid]
        if (delta < 0)
          hi = mid - 1
        elsif (delta > 0)
          lo = mid + 1
        else
          return mid
        end
      end
      # not found: +hi+ is now the last entry ordered before +term+
      return hi
    end

    # Positions this thread's enum at the index entry +ind_offset+.
    def seek_enum(ind_offset)
      enum().seek(@index_pointers[ind_offset],
                  (ind_offset * enum().index_interval) - 1,
                  @index_terms[ind_offset],
                  @index_infos[ind_offset])
    end

    # Scans within block for matching term. Returns its TermInfo, or nil
    # if the exact term is not found.
    def scan_for_term_info(term)
      e = enum()
      e.scan_to(term)
      if e.term != nil and term == e.term
        return e.term_info()
      else
        return nil
      end
    end

    # Advances the enum forward to +position+ and returns the term there,
    # or nil if the enumeration is exhausted first.
    def scan_for_term(position)
      e = enum()
      while (e.position < position)
        return nil if not e.next?
      end

      return e.term
    end

    # Returns the position of a Term in the set or -1.
    def get_position(term)
      return -1 if (@size == 0)

      ind_offset = get_index_offset(term)
      seek_enum(ind_offset)

      e = enum()
      # advance until we reach or pass the target term
      while (term > e.term and e.next?)
      end

      if (term == e.term())
        return e.position
      else
        return -1
      end
    end

  end
end
module Ferret::Index
  # Holds the start and end character offsets of a single term occurrence,
  # as recorded by term vectors that store offset information.
  class TermVectorOffsetInfo
    attr_accessor :start_offset, :end_offset

    def initialize(start_offset, end_offset)
      @start_offset = start_offset
      @end_offset = end_offset
    end

    # Two offset infos are equal iff both are TermVectorOffsetInfo
    # instances covering exactly the same character range.
    def eql?(o)
      o.instance_of?(TermVectorOffsetInfo) &&
        @start_offset == o.start_offset &&
        @end_offset == o.end_offset
    end
    alias :== :eql?

    # Hash consistent with eql?: combines both offsets.
    def hash()
      29 * @start_offset + @end_offset
    end
  end
end
module Ferret::Index
  # Writer works by opening a document and then opening the fields within
  # the document and then writing out the vectors for each field.
  #
  # Rough usage:
  #
  #   for each document
  #     writer.open_document()
  #     for each field on the document
  #       writer.open_field(field)
  #       for all of the terms
  #         writer.add_term(...)
  #       end
  #       writer.close_field
  #     end
  #     writer.close_document()
  #   end
  #
  # Three streams are written: a per-document index (.tvx), per-document
  # data (.tvd) and per-field term data (.tvf).
  class TermVectorsWriter
    # flag bits stored in the per-field flags byte of the .tvf file
    STORE_POSITIONS_WITH_TERMVECTOR = 0x1
    STORE_OFFSET_WITH_TERMVECTOR = 0x2

    FORMAT_VERSION = 2

    # The size in bytes that the FORMAT_VERSION will take up at the beginning
    # of each file
    FORMAT_SIZE = 4

    TVX_EXTENSION = ".tvx" # document index file
    TVD_EXTENSION = ".tvd" # document data file
    TVF_EXTENSION = ".tvf" # field (term) data file

    # directory::   the Directory to create the term-vector files in
    # segment::     segment name, used as the file-name prefix
    # field_infos:: FieldInfos used to resolve field names to numbers
    def initialize(directory, segment, field_infos)
      @current_field = nil
      @current_doc_pointer = -1 # -1 means "no document open"

      # Open files for TermVector storage, each starting with the format tag
      @tvx = directory.create_output(segment + TVX_EXTENSION)
      @tvx.write_int(FORMAT_VERSION)
      @tvd = directory.create_output(segment + TVD_EXTENSION)
      @tvd.write_int(FORMAT_VERSION)
      @tvf = directory.create_output(segment + TVF_EXTENSION)
      @tvf.write_int(FORMAT_VERSION)

      @field_infos = field_infos
      @fields = [] # TVField records for the document being written
      @terms = []  # TVTerm records for the field being written
    end


    # Marks the start of a new document, closing any still-open one, and
    # remembers the current .tvd position as the document's data pointer.
    def open_document()
      close_document()
      @current_doc_pointer = @tvd.pos()
    end


    # Finishes the current document: closes any open field, writes the
    # document record and resets per-document state.
    def close_document()

      if (document_open?())
        close_field()
        write_doc()
        @fields.clear()
        @current_doc_pointer = -1
      end
    end


    # Return true if a document is currently open.
    def document_open?()
      return @current_doc_pointer != -1
    end


    # Start processing a field. This can be followed by a number of calls to
    # add_term, and a final call to close_field to indicate the end of
    # processing of this field. If a field was previously open, it is closed
    # automatically.
    def open_field(field)
      field_info = @field_infos[field]
      create_field(field_info.number,
                   field_info.store_positions?,
                   field_info.store_offsets?)
    end

    # Finished processing current field. This should be followed by a call
    # to open_field before future calls to add_term.
    def close_field()
      if field_open?
        #puts("close_field()")

        # save field and @terms
        write_field()
        @fields << @current_field
        @terms.clear()
        @current_field = nil
      end
    end

    # Return true if a field is currently open.
    def field_open?()
      return @current_field != nil
    end

    # Add term to the field's term vector. Field must already be open.
    #
    # Terms should be added in increasing order of @terms, one call per
    # unique termNum. ProxPointer is a pointer into the TermPosition file
    # (prx). Freq is the number of times this term appears in this field, in
    # this document. raises:: IllegalStateError if document or field is
    # not open
    def add_term(term_text, freq, positions = nil, offsets = nil)
      if not document_open?
        raise IllegalStateError, "Cannot add terms when document is not open"
      end
      if not field_open?
        raise IllegalStateError, "Cannot add terms when field is not open"
      end

      add_term_internal(term_text, freq, positions, offsets)
    end

    # Appends a term record without any open-state checks (internal use).
    def add_term_internal(term_text, freq, positions, offsets)
      @terms << TVTerm.new(term_text, freq, positions, offsets)
    end

    # Add a complete document specified by all its term vectors. If document has no
    # term vectors, add value for @tvx.
    #
    # vectors:: The documents to have their term vectors added
    # raises:: IOException
    def add_all_doc_vectors(vectors)

      open_document()

      if vectors != nil
        vectors.each do |vector|
          # positions/offsets are only written when present and non-empty
          store_positions = (vector.size > 0 and vector.positions != nil)
          store_offsets = (vector.size > 0 and vector.offsets != nil)

          create_field(@field_infos.field_number(vector.field),
                       store_positions, store_offsets)

          vector.size.times do |j|
            add_term_internal(vector.terms[j],
                              vector.term_frequencies[j],
                              store_positions ? vector.positions[j] : nil,
                              store_offsets ? vector.offsets[j] : nil)
          end
          close_field()
        end
      end
      close_document()
    end

    # Close all streams.
    def close()
      begin
        close_document()
      ensure
        # make an effort to close all streams we can but remember and re-raise
        # the last exception encountered in this process
        keep = nil
        [@tvx, @tvd, @tvf].compact.each do |os|
          begin
            os.close()
          rescue IOError => e
            keep = e
          end
        end
        raise keep if (keep != nil)
      end
    end

    # Per-field bookkeeping record: field number, its pointer into the .tvf
    # file, and which extras (positions/offsets) are stored for it.
    class TVField
      attr_accessor :number, :tvf_pointer, :store_positions, :store_offsets
      def initialize(number, store_pos, store_off)
        @tvf_pointer = 0
        @number = number
        @store_positions = store_pos
        @store_offsets = store_off
      end
    end

    # A single term entry: text, frequency and optional lists of positions
    # and offsets.
    class TVTerm
      attr_accessor :term_text, :freq, :positions, :offsets

      def initialize(term_text=nil, freq=nil, positions=nil, offsets=nil)
        @term_text = term_text
        @freq = freq
        @positions = positions
        @offsets = offsets
      end
    end

    private

    # Serializes the current field's terms to .tvf: term count, flags byte,
    # then prefix-compressed term texts with frequencies and (optionally)
    # delta-encoded positions and offsets.
    def write_field()
      # remember where this field is written
      @current_field.tvf_pointer = @tvf.pos

      size = @terms.size
      @tvf.write_vint(size)

      store_positions = @current_field.store_positions
      store_offsets = @current_field.store_offsets
      bits = 0x0
      if (store_positions)
        bits |= STORE_POSITIONS_WITH_TERMVECTOR
      end
      if (store_offsets)
        bits |= STORE_OFFSET_WITH_TERMVECTOR
      end
      @tvf.write_byte(bits)

      last_term_text = ""
      @terms.each do |term|
        start = Ferret::Utils::StringHelper.string_difference(last_term_text,
                                                              term.term_text)
        length = term.term_text.length() - start
        @tvf.write_vint(start)  # write shared prefix length
        @tvf.write_vint(length) # write delta length
        @tvf.write_chars(term.term_text, start, length) # write delta chars
        @tvf.write_vint(term.freq)
        last_term_text = term.term_text

        if (store_positions)
          if (term.positions == nil)
            raise IllegalStateError, "Trying to write positions that are nil!"
          end

          # use delta encoding for positions
          position = 0
          term.freq.times do |j|
            @tvf.write_vint(term.positions[j] - position)
            position = term.positions[j]
          end
        end

        if (store_offsets)
          if(term.offsets == nil)
            raise IllegalStateError, "Trying to write offsets that are nil!"
          end

          # use delta encoding for offsets
          position = 0
          term.freq.times do |j|
            @tvf.write_vint(term.offsets[j].start_offset - position)
            #Save the diff between the two.
            @tvf.write_vint(term.offsets[j].end_offset -
                            term.offsets[j].start_offset)
            position = term.offsets[j].end_offset()
          end
        end
      end
    end

    # Writes the document record: its .tvd pointer into .tvx, then the
    # field count, field numbers and delta-encoded .tvf pointers into .tvd.
    # raises:: IllegalStateError if a field is still open
    def write_doc()
      if field_open?
        raise IllegalStateError, "Field is still open while writing document"
      end
      #puts("Writing doc pointer: " + @current_doc_pointer)
      # write document index record
      @tvx.write_long(@current_doc_pointer)

      # write document data record
      size = @fields.size

      # write the number of @fields
      @tvd.write_vint(size)

      # write field numbers
      @fields.each { |field| @tvd.write_vint(field.number) }

      # write field pointers
      last_field_pointer = 0
      @fields.each do |field|
        @tvd.write_vlong(field.tvf_pointer - last_field_pointer)
        last_field_pointer = field.tvf_pointer
      end
      #puts("After writing doc pointer: " + @tvx.pos())
    end

    # Opens a fresh current-field record, closing any previously open field.
    # raises:: IllegalStateError if no document is open
    def create_field(field_number, store_position, store_offset)
      if not document_open?
        raise IllegalStateError, "Cannot open field when no document is open."
      end
      close_field()
      @current_field = TVField.new(field_number, store_position, store_offset)
    end
  end
294
+
295
# Reads term vectors for a segment back from the three term vector files:
# the document index (.tvx), the document data (.tvd) and the field data
# (.tvf). When the segment was written without term vectors the three
# stream attributes are nil and the accessor methods return nil.
class TermVectorsReader
  attr_reader :size

  # accessors for clone method
  attr_accessor :tvx, :tvd, :tvf
  protected :tvx, :tvx=, :tvd, :tvd=, :tvf, :tvf=

  # d::           the directory holding the segment's files
  # segment::     the segment name used to key the term vector file names
  # field_infos:: used to map field names <-> field numbers
  #
  # raises:: IOError (via check_valid_format) if a file's format version
  #          is newer than this reader understands
  def initialize(d, segment, field_infos)
    if (d.exists?(segment + TermVectorsWriter::TVX_EXTENSION))
      @tvx = d.open_input(segment + TermVectorsWriter::TVX_EXTENSION)
      check_valid_format(@tvx)
      @tvd = d.open_input(segment + TermVectorsWriter::TVD_EXTENSION)
      @tvd_format = check_valid_format(@tvd)
      @tvf = d.open_input(segment + TermVectorsWriter::TVF_EXTENSION)
      @tvf_format = check_valid_format(@tvf)
      # the .tvx file holds exactly one long (8 bytes) per document
      @size = @tvx.length / 8
    else
      @tvx = nil
      @tvd = nil
      @tvf = nil
    end

    @field_infos = field_infos
  end

  # Closes all open streams.
  #
  # Makes an effort to close every stream even when one of them fails;
  # the last exception encountered while closing is re-raised after the
  # remaining streams have been given a chance to close.
  def close()
    keep = nil
    [@tvx, @tvd, @tvf].compact.each do |os|
      begin
        os.close()
      rescue IOError => e
        keep = e
      end
    end
    raise keep if (keep != nil)
  end

  # Retrieve the term vector for the given document and field
  # doc_num:: The document number to retrieve the vector for
  # field:: The field within the document to retrieve
  # returns:: The TermFreqVector for the document and field or nil if there
  #           is no termVector for this field.
  # raises:: IOException if there is an error reading the term vector files
  def get_field_tv(doc_num, field)
    # Check if no term vectors are available for this segment at all
    field_number = @field_infos.field_number(field)
    result = nil
    if (@tvx != nil)
      # We need to account for the FORMAT_SIZE when seeking in the @tvx.
      # We don't need to do this in the other seeks because those use file
      # pointers that were written into another file (and so already
      # include the offset).
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()
      # There are only a few fields per document. We opt for a full scan
      # rather then requiring that they be ordered. We need to read through
      # all of the fields anyway to get to the tvf pointers.
      number = 0
      found = -1
      field_count.times do |i|
        # FORMAT_VERSION stores absolute field numbers; older formats
        # stored deltas from the previous field number.
        if @tvd_format == TermVectorsWriter::FORMAT_VERSION
          number = @tvd.read_vint()
        else
          number += @tvd.read_vint()
        end
        if (number == field_number)
          found = i
        end
      end

      # This field, although valid in the segment, was not found in this
      # document
      if (found != -1)
        # Compute position in the @tvf file by summing the delta-encoded
        # pointers up to and including the found field.
        position = 0
        (found + 1).times do
          position += @tvd.read_vlong()
        end

        result = read_term_vector(field, position)
      end
    end
    return result
  end

  # Return all term vectors stored for this document or nil if it could
  # not be read in.
  #
  # doc_num:: The document number to retrieve the vector for
  # returns:: All term frequency vectors
  # raises:: IOException if there is an error reading the term vector files
  def get_tv(doc_num)
    result = nil
    # Check if no term vectors are available for this segment at all
    if (@tvx != nil)
      # offset past the leading format header in the .tvx file
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()

      # No fields are vectorized for this document
      if (field_count != 0)
        number = 0
        fields = Array.new(field_count)

        field_count.times do |i|
          # absolute field numbers in FORMAT_VERSION, deltas before it
          if @tvd_format == TermVectorsWriter::FORMAT_VERSION
            number = @tvd.read_vint()
          else
            number += @tvd.read_vint()
          end

          fields[i] = @field_infos[number].name
        end

        # Compute each field's position in the @tvf file from the
        # delta-encoded pointers.
        position = 0
        tvf_pointers = Array.new(field_count)
        field_count.times do |i|
          position += @tvd.read_vlong()
          tvf_pointers[i] = position
        end

        result = read_term_vectors(fields, tvf_pointers)
      end
    end
    return result
  end

  # Returns an independent copy of this reader with its own cloned stream
  # handles, or nil if this segment has no term vectors.
  def clone()
    if (@tvx == nil or @tvd == nil or @tvf == nil)
      return nil
    end

    # BUG FIX: the original did `clone = self`, which replaced this
    # reader's own streams with the clones and returned self instead of a
    # copy. Object#clone (via super) yields a shallow copy whose stream
    # handles we then replace with independent clones.
    copy = super()
    copy.tvx = @tvx.clone()
    copy.tvd = @tvd.clone()
    copy.tvf = @tvf.clone()

    return copy
  end

  private

  # Reads one term vector per (field, pointer) pair and returns them as an
  # array in the same order.
  def read_term_vectors(fields, tvf_pointers)
    res = Array.new(fields.length)
    fields.length.times do |i|
      res[i] = read_term_vector(fields[i], tvf_pointers[i])
    end
    return res
  end

  # field:: The field to read in
  # tvf_pointer:: The pointer within the @tvf file where we should start reading
  # returns:: The TermVector located at that position
  # raises:: IOException
  def read_term_vector(field, tvf_pointer)
    # Now read the data from specified position
    # We don't need to offset by the FORMAT here since the pointer
    # already includes the offset
    @tvf.seek(tvf_pointer)

    num_terms = @tvf.read_vint()
    # If no terms - return a constant empty termvector. However, this should
    # never occur!
    if (num_terms == 0)
      return SegmentTermVector.new(field, nil, nil)
    end

    # FORMAT_VERSION packs the position/offset flags into a single byte;
    # older formats wrote a vint here that carried no flag information.
    if(@tvf_format == TermVectorsWriter::FORMAT_VERSION)
      bits = @tvf.read_byte()
      store_positions = (bits & TermVectorsWriter::STORE_POSITIONS_WITH_TERMVECTOR) != 0
      store_offsets = (bits & TermVectorsWriter::STORE_OFFSET_WITH_TERMVECTOR) != 0
    else
      @tvf.read_vint()
      store_positions = false
      store_offsets = false
    end

    terms = Array.new(num_terms)
    term_freqs = Array.new(num_terms)

    # we may not need these, but declare them
    positions = nil
    offsets = nil
    if(store_positions)
      positions = Array.new(num_terms)
    end
    if(store_offsets)
      offsets = Array.new(num_terms)
    end

    start = 0
    delta_length = 0
    total_length = 0
    # terms are prefix-compressed: each entry stores the length of the
    # prefix shared with the previous term plus the differing suffix,
    # which read_chars splices into +buffer+ at position +start+.
    buffer = ""

    num_terms.times do |i|
      start = @tvf.read_vint()
      delta_length = @tvf.read_vint()
      total_length = start + delta_length
      @tvf.read_chars(buffer, start, delta_length)
      terms[i] = buffer[0, total_length].to_s
      freq = @tvf.read_vint()
      term_freqs[i] = freq

      if (store_positions) #read in the positions
        # positions are delta-encoded against the previous position
        pos = Array.new(freq)
        positions[i] = pos
        prev_position = 0
        freq.times do |j|
          pos[j] = prev_position + @tvf.read_vint()
          prev_position = pos[j]
        end
      end

      if (store_offsets)
        # start offsets are delta-encoded against the previous end offset;
        # end offsets are stored as a delta from their own start offset
        offs = Array.new(freq)
        offsets[i] = offs
        prev_offset = 0
        freq.times do |j|
          start_offset = prev_offset + @tvf.read_vint()
          end_offset = start_offset + @tvf.read_vint()
          offs[j] = TermVectorOffsetInfo.new(start_offset, end_offset)
          prev_offset = end_offset
        end
      end
    end

    SegmentTermVector.new(field, terms, term_freqs, positions, offsets)
  end

  # Reads and returns the leading format version int from +istream+,
  # raising if it is newer than the version this reader supports.
  def check_valid_format(istream)
    format = istream.read_int()
    if (format > TermVectorsWriter::FORMAT_VERSION)
      raise IOError, "Incompatible format version: #{format} expected #{TermVectorsWriter::FORMAT_VERSION} or less"
    end
    return format
  end

end
552
+ end