ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,58 @@
1
+ module Ferret::Index
2
+ # Provides access to stored term vector of
3
+ # a document field.
4
class SegmentTermVector
  # Parallel data describing the stored vector. _term_frequencies_[i] is the
  # number of times _terms_[i] occurs in the document field. _positions_ and
  # _offsets_ are optional and stay +nil+ unless passed to the constructor.
  attr_reader :term_frequencies, :positions, :offsets

  # The field this vector was stored for, and the array of its terms.
  attr_reader :field, :terms

  # Stores the given values without copying them.
  #
  # field::      the field the vector belongs to
  # terms::      array of terms, or +nil+ for an empty vector
  # term_freqs:: array of frequencies, parallel to _terms_
  # positions::  optional position data
  # offsets::    optional offset data
  def initialize(field, terms, term_freqs, positions=nil, offsets=nil)
    @field = field
    @terms = terms
    @term_frequencies = term_freqs
    @positions = positions
    @offsets = offsets
  end

  # Returns the field name followed by ": ", a comma separated list of
  # "term/frequency" pairs and the literal suffix "end".
  def to_s()
    sb = @field.to_s + ": "
    if @terms
      @terms.each_with_index do |term, i|
        sb << ', ' if i > 0
        # interpolation calls to_s, so terms need not be Strings
        sb << "#{term}/#{@term_frequencies[i]}"
      end
    end
    sb << 'end'

    return sb
  end

  # Returns the number of unique terms in the field (0 when there are none).
  def size()
    return @terms == nil ? 0 : @terms.size
  end

  # Returns the index in the _terms_ array at which _term_ appears, or +nil+
  # if the term is not in the array (or the vector has no terms at all).
  def index_of(term)
    return @terms ? @terms.index(term) : nil
  end

  # Just like _index_of_ but searches for a number of terms at the same
  # time. Returns an array with one slot per term searched for, each slot
  # holding the result of _index_of_ for that term.
  #
  # terms:: array containing terms to look for
  # start:: index in the array where the list of terms starts
  # len::   the number of terms in the list
  #
  # If _start_ lies beyond the end of _terms_ an empty array is returned.
  def indexes_of(terms, start, len)
    # Array#[start, len] is nil when start is out of range
    wanted = terms[start, len] || []
    return wanted.map { |term| index_of(term) }
  end
end
58
+ end
@@ -0,0 +1,49 @@
1
+ module Ferret::Index
2
+ # A Term represents a word from text. This is the unit of search. It is
3
+ # composed of two elements, the text of the word, as a string, and the name of
4
+ # the field that the text occurred in, an interned string.
5
+ #
6
+ # Note that terms may represent more than words from text fields, but also
7
+ # things like dates, email addresses, urls, etc.
8
+ #
9
+ # A term contains two attributes;
10
+ # field:: The field indicates the part of a document which this term came from.
11
+ # text:: In the case of words, this is simply the text of the word. In the case
12
+ # of dates and other types, this is an encoding of the object as a string.
13
class Term
  include Comparable

  # The part of a document this term came from.
  attr_accessor :field
  # The text of the term.
  attr_accessor :text

  # Constructs a Term with the given field and text
  def initialize(fld_name, txt)
    @field = fld_name
    @text = txt
  end

  # Combines the hash() of the field and the text.
  def hash()
    return field.hash() + text.hash()
  end

  # Orders terms by field first and by text second. Together with
  # Comparable this gives us ==, >, >=, <, <= and between?
  #
  # Returns +nil+ when _other_ does not expose a field and a text, so that
  # comparing a Term with an unrelated object (e.g. <tt>term == nil</tt>)
  # answers false instead of raising NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:field) and other.respond_to?(:text)

    if @field == other.field
      return @text <=> other.text
    else
      return @field <=> other.field
    end
  end
  alias :eql? :==

  # Resets the field and text of a Term.
  def set!(fld_name, txt)
    initialize(fld_name, txt)
  end

  # Returns "field:text".
  def to_s
    # interpolation rather than String#+ so a non-String field still prints
    "#{@field}:#{@text}"
  end
end
49
+ end
@@ -0,0 +1,88 @@
1
+ module Ferret::Index
2
# A reusable, mutable holder for the field and text of a term. Only the
# first _text_length_ characters of _text_ belong to the current term.
class TermBuffer
  include Comparable

  attr_reader :text, :text_length, :field

  # Creates an unset buffer: no field, empty text.
  def initialize
    @text = String.new
    @text_length = -1
    @field = nil
  end

  # Combines the hash of the valid part of the text with the hash of the
  # field. Uses _text_str_, like <=>, so that buffers which compare equal
  # also hash equal even when stale characters follow the valid text.
  def hash()
    return text_str.hash + @field.hash
  end

  # Orders buffers by field first and by the valid part of the text second.
  # Returns +nil+ when _other_ is not buffer-like, so that == with an
  # unrelated object answers false instead of raising NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:field) and other.respond_to?(:text_str)

    if (@field == other.field)
      return text_str <=> other.text_str
    end
    @field <=> other.field
  end

  # Reads the next term from _input_. Three vints are consumed: the length
  # of the prefix kept from the current text, the number of characters that
  # follow it, and (after the characters) the field number, which is looked
  # up in _field_infos_ to obtain the field name.
  def read(input, field_infos)
    @term = nil # invalidate cache
    start = input.read_vint()
    length = input.read_vint()
    total_length = start + length
    @text_length = total_length
    input.read_chars(@text, start, length)
    @field = field_infos[input.read_vint()].name
  end

  # Copies the field and text of _term_ into the buffer and caches the term
  # itself. Passing +nil+ resets the buffer.
  def term=(term)
    # nil? instead of == nil: == would be dispatched to the term's own
    # comparison method
    if term.nil?
      reset()
      return
    end

    # copy text into the buffer
    @text_length = term.text.length
    @text = term.text.clone

    @field = term.field
    @term = term
  end

  # Makes this buffer a copy of _other_ (another TermBuffer).
  def set!(other)
    @text_length = other.text_length
    @text = other.text.clone if other.text
    @field = other.field
    @term = other.term
  end

  # Returns the buffer to the unset state with an empty text.
  def reset()
    @field = nil
    @text = String.new
    @text_length = 0
    @term = nil
  end

  # Returns the buffer contents as a Term, or +nil+ if the buffer is unset.
  # The Term is built on first request and cached until the buffer changes.
  def to_term()
    if @field.nil? # unset
      return nil
    end

    if @term.nil?
      @term = Term.new(@field, @text[0,@text_length].to_s)
    end
    return @term
  end
  alias :term :to_term

  # Returns a new TermBuffer holding a copy of this buffer's state.
  def clone()
    copy = TermBuffer.new()
    copy.set!(self)
    return copy
  end

  # Returns the valid part of the text, i.e. its first _text_length_
  # characters.
  def text_str()
    @text[0,@text_length]
  end

  # Returns the string form of the term (empty when the buffer is unset).
  def to_s()
    to_term.to_s
  end
end
88
+ end
@@ -0,0 +1,283 @@
1
+ module Ferret::Index
2
+ # TermDocEnum provides an interface for enumerating <document,
3
+ # frequency> pairs for a term.
4
+ #
5
+ # The document portion names each document containing the term. Documents
6
+ # are indicated by number. The frequency portion gives the number of times
7
+ # the term occurred in each document.
8
+ #
9
+ # The pairs are ordered by document number.
10
+ #
11
+ # See IndexReader#term_docs
12
class TermDocEnum
  # Sets this to the data for a term.
  # The enumeration is reset to the start of the data for this term.
  def seek(term)
    raise NotImplementedError
  end

  # Returns the current document number.
  #
  # This is invalid until #next? is called for the first time.
  def doc()
    raise NotImplementedError
  end

  # Returns the frequency of the term within the current document. This
  # is invalid until #next? is called for the first time.
  def freq()
    raise NotImplementedError
  end

  # Moves to the next pair in the enumeration.
  # Returns true iff there is such a next pair in the enumeration.
  def next?()
    raise NotImplementedError
  end

  # Attempts to read multiple entries from the enumeration, up to length of
  # _docs_. Document numbers are stored in _docs_, and term
  # frequencies are stored in _freqs_. The _freqs_ array must be as
  # long as the _docs_ array.
  #
  # Returns the number of entries read. Zero is only returned when the
  # stream has been exhausted.
  def read(docs, freqs)
    raise NotImplementedError
  end

  # Skips entries to the first beyond the current whose document number is
  # greater than or equal to _target_.
  #
  # Returns true iff there is such an entry.
  #
  # This default always advances at least once and then keeps calling
  # #next? until the document number reaches _target_. Some implementations
  # are considerably more efficient than that.
  def skip_to(target)
    # post-test loop: #doc is only valid after #next? has succeeded, and
    # the entry returned has to lie beyond the current one
    begin
      return false if not next?()
    end while (target > doc())
    return true
  end

  # Frees associated resources.
  def close()
    raise NotImplementedError
  end
end
55
+
56
+
57
# Enumerates the <document, frequency> pairs of a term by decoding the
# frequency stream of a single segment.
class SegmentTermDocEnum < TermDocEnum
  attr_accessor :parent, :freq_stream, :count, :df, :deleted_docs, :doc, :freq

  # parent:: the object supplying _freq_stream_, _deleted_docs_,
  #          _term_infos_ and _field_infos_ for the segment
  #
  # The frequency stream is cloned, so this enum owns its own read position.
  def initialize(parent)
    @parent = parent
    @freq_stream = parent.freq_stream.clone()
    @deleted_docs = parent.deleted_docs
    @skip_interval = parent.term_infos.skip_interval
    @skip_stream = nil
    @doc = 0
    # Start out as an exhausted enumeration so that #read and #skip_to
    # report "nothing found" instead of failing on nil when they are called
    # before the first #seek.
    @count = 0
    @doc_freq = 0
  end

  # Find the term, TermEnum or TermInfo in the doc
  #
  # t:: can be a Term, TermEnum or TermInfo object
  #
  # Raises ArgumentError for any other kind of object.
  def seek(t)
    if t.instance_of?(Term)
      ti = parent.term_infos[t]
    elsif t.is_a?(TermEnum)
      # use comparison of fieldinfos to verify that term enum (t) belongs to the
      # same segment as this SegmentTermDocEnum
      if (t.instance_of?(SegmentTermEnum) and t.field_infos == parent.field_infos)
        ti = t.term_info()
      else # punt case
        ti = parent.term_infos[t.term]
      end
    elsif t.is_a? TermInfo # this one is easy. That's exactly what we're looking for
      ti = t
    else
      raise ArgumentError, "Must pass a Term, TermEnum or TermInfo object, not a " +
                           t.class.to_s
    end
    do_seek(ti)
  end

  # Positions the enumeration at the start of the data described by _ti_.
  # A +nil+ _ti_ leaves the enumeration empty.
  def do_seek(ti)
    @count = 0
    if (ti == nil)
      @doc_freq = 0
    else
      @doc_freq = ti.doc_freq
      @doc = 0
      @skip_doc = 0
      @skip_count = 0
      @num_skips = @doc_freq / @skip_interval
      @freq_pointer = ti.freq_pointer
      @prox_pointer = ti.prox_pointer
      @skip_pointer = @freq_pointer + ti.skip_offset
      @freq_stream.seek(@freq_pointer)
      @have_skipped = false
    end
  end

  # Closes the frequency stream and, if one was opened, the skip stream.
  def close()
    @freq_stream.close()
    if (@skip_stream != nil)
      @skip_stream.close()
    end
    @parent = nil
  end

  # Hook called by #next? each time a deleted document is passed over.
  # Does nothing here.
  def skipping_doc()
  end

  # Moves to the next document that is not deleted. Returns false once all
  # entries of the term have been consumed.
  def next?()
    while (true)
      return false if @count == @doc_freq

      doc_code = @freq_stream.read_vint()
      @doc += doc_code >> 1                 # shift off low bit
      if ((doc_code & 1) != 0)              # if low bit is set
        @freq = 1                           # freq is one
      else
        @freq = @freq_stream.read_vint()    # else read freq
      end

      @count += 1

      break if (@deleted_docs == nil or not @deleted_docs[@doc])

      skipping_doc()
    end
    return true
  end

  # Optimized implementation. Fills _docs_ and _freqs_ beginning at index
  # _start_ and stops when _docs_ is full or the term is exhausted. Deleted
  # documents are left out.
  #
  # Returns the index one past the last slot written, which is the number
  # of entries read when _start_ is 0.
  def read(docs, freqs, start = 0)
    i = start
    needed = docs.length

    while (i < needed and @count < @doc_freq)

      # manually inlined call to next?() for speed
      doc_code = @freq_stream.read_vint()
      @doc += doc_code >> 1                 # shift off low bit
      if ((doc_code & 1) != 0)              # if low bit is set
        @freq = 1                           # freq is one
      else
        @freq = @freq_stream.read_vint()    # else read freq
      end
      @count += 1

      if (@deleted_docs == nil or not @deleted_docs[@doc])
        docs[i] = @doc
        freqs[i] = @freq
        i += 1
      end
    end
    return i
  end

  # Overridden by SegmentTermDocPosEnum to skip in prox stream.
  def skip_prox(prox_pointer)
  end

  # Optimized implementation. When the term has at least _skip_interval_
  # documents the skip data is used to jump ahead in the frequency stream
  # before scanning with #next?.
  #
  # Returns true iff an entry with a document number greater than or equal
  # to _target_ was found beyond the current one.
  def skip_to(target)
    if (@doc_freq >= @skip_interval)        # optimized case

      if (@skip_stream == nil)
        @skip_stream = @freq_stream.clone() # lazily clone
      end

      if (!@have_skipped)                   # lazily seek skip stream
        @skip_stream.seek(@skip_pointer)
        @have_skipped = true
      end

      # scan skip data
      last_skip_doc = @skip_doc
      last_freq_pointer = @freq_stream.pos()
      last_prox_pointer = -1
      num_skipped = -1 - (@count % @skip_interval)

      while (target > @skip_doc)
        last_skip_doc = @skip_doc
        last_freq_pointer = @freq_pointer
        last_prox_pointer = @prox_pointer

        if (@skip_doc != 0 and @skip_doc >= @doc)
          num_skipped += @skip_interval
        end

        if (@skip_count >= @num_skips)
          break
        end

        # each skip entry holds three deltas: doc, freq pointer, prox pointer
        @skip_doc += @skip_stream.read_vint()
        @freq_pointer += @skip_stream.read_vint()
        @prox_pointer += @skip_stream.read_vint()

        @skip_count += 1
      end

      # if we found something to skip, then skip it
      if (last_freq_pointer > @freq_stream.pos())
        @freq_stream.seek(last_freq_pointer)
        skip_prox(last_prox_pointer)

        @doc = last_skip_doc
        @count += num_skipped
      end

    end

    # done skipping, now just scan

    begin
      if not next?
        return false
      end
    end while (target > @doc)
    return true
  end
end
233
+
234
# A SegmentTermDocEnum that additionally reads the positions of the term
# within the current document from the segment's prox stream.
class SegmentTermDocPosEnum < SegmentTermDocEnum
  # p:: the parent object; besides what SegmentTermDocEnum needs it must
  #     supply a _prox_stream_, which is cloned for this enum
  def initialize(p)
    super
    @prox_stream = p.prox_stream.clone()
    # no positions are pending until #next? has found a document; without
    # these defaults #next? would fail on nil when called before #seek
    @prox_count = 0
    @position = 0
  end

  # Positions both the frequency stream (via super) and the prox stream at
  # the data described by _ti_.
  def do_seek(ti)
    super
    if (ti != nil)
      @prox_stream.seek(ti.prox_pointer)
    end
    @prox_count = 0
  end

  # Closes the streams of the superclass and the prox stream.
  def close()
    super
    @prox_stream.close()
  end

  # Returns the next position of the term within the current document.
  # Positions are stored as deltas, so each call adds the next vint of the
  # prox stream to the previous position.
  def next_position()
    @prox_count -= 1
    return @position += @prox_stream.read_vint()
  end

  # Called by the superclass for every deleted document it passes over;
  # consumes that document's positions from the prox stream.
  def skipping_doc()
    @freq.times { @prox_stream.read_vint() }
  end

  # Moves to the next document, first discarding any positions of the
  # current document that were not read.
  def next?()
    @prox_count.times { @prox_stream.read_vint() }

    if (super)
      @prox_count = @freq       # note frequency
      @position = 0             # reset position
      return true
    end
    return false
  end

  # Not supported by this class. Accepts the same arguments as
  # SegmentTermDocEnum#read so that every call raises NotImplementedError.
  def read(docs, freqs, start = 0)
    raise NotImplementedError, "TermDocPosEnum does not support processing multiple documents in one call. Use TermDocEnum instead."
  end

  # Called by super.skip_to().
  def skip_prox(prox_pointer)
    @prox_stream.seek(prox_pointer)
    @prox_count = 0
  end
end
283
+ end