ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,58 @@
1
+ module Ferret::Index
2
+ # Provides access to stored term vector of
3
+ # a document field.
4
class SegmentTermVector
  # term_frequencies:: Array of term frequencies. Locations of the array
  #                    correspond one to one to the terms in the array
  #                    obtained from the _terms_ method. Each location
  #                    contains the number of times this term occurs in the
  #                    document or the document field.
  # positions::        value passed to the constructor (nil by default)
  # offsets::          value passed to the constructor (nil by default)
  attr_reader :term_frequencies, :positions, :offsets

  # field:: the field this vector was built for
  # terms:: array of terms (may be nil)
  attr_reader :field, :terms

  # field::      the field the vector belongs to
  # terms::      array of terms, or nil
  # term_freqs:: array parallel to _terms_ holding each term's frequency
  # positions::  optional, stored as given
  # offsets::    optional, stored as given
  def initialize(field, terms, term_freqs, positions=nil, offsets=nil)
    @field = field
    @terms = terms
    @term_frequencies = term_freqs
    @positions = positions
    @offsets = offsets
  end

  # Builds "<field>: term/freq, term/freq..." followed by the literal
  # suffix 'end'.
  #
  # NOTE(review): the trailing 'end' looks like an artifact of a mechanical
  # conversion (a closing brace turned into 'end'); it is kept unchanged
  # because callers or tests may depend on the exact string. Confirm before
  # changing.
  def to_s()
    sb = @field.to_s + ": "
    if @terms
      @terms.each_with_index do |term, i|
        sb << ', ' if i > 0
        sb << term + '/' + @term_frequencies[i].to_s
      end
    end
    sb << 'end'

    return sb
  end

  # Returns the number of unique terms in the field (0 when there are no
  # terms at all).
  def size()
    return @terms.nil? ? 0 : @terms.size
  end

  # Returns the index in the _terms_ array at which _term_ appears. Returns
  # nil if the term does not appear in the array or if there are no terms.
  def index_of(term)
    return @terms ? @terms.index(term) : nil
  end

  # Just like _index_of_ but searches for a number of terms at the same
  # time. Returns an array with one slot per term searched for, each slot
  # containing the result of _index_of_ for that term (nil when not found).
  #
  # terms:: array containing terms to look for
  # start:: index in the array where the list of terms starts
  # len::   the number of terms in the list
  #
  # When _start_ lies beyond the end of _terms_ there is nothing to look up
  # and an empty array is returned (Array#[] yields nil in that case).
  def indexes_of(terms, start, len)
    wanted = terms[start, len] || []
    return wanted.map { |term| index_of(term) }
  end
end
58
+ end
@@ -0,0 +1,49 @@
1
+ module Ferret::Index
2
+ # A Term represents a word from text. This is the unit of search. It is
3
+ # composed of two elements, the text of the word, as a string, and the name of
4
+ # the field that the text occurred in, an interned string.
5
+ #
6
+ # Note that terms may represent more than words from text fields, but also
7
+ # things like dates, email addresses, urls, etc.
8
+ #
9
+ # A term contains two attributes;
10
+ # field:: The field indicates the part of a document which this term came from.
11
+ # text:: In the case of words, this is simply the text of the word. In the case
12
+ # of dates and other types, this is an encoding of the object as a string.
13
class Term
  include Comparable

  # field:: the part of a document this term came from
  attr_accessor :field
  # text:: the text of the word (or the string encoding of another value)
  attr_accessor :text

  # Constructs a Term with the given field and text
  def initialize(fld_name, txt)
    @field = fld_name
    @text = txt
  end

  # Combines the hash of the field and the text. The pair is hashed as an
  # ordered array so that swapping field and text gives a different value.
  def hash()
    return [@field, @text].hash
  end

  # Implements Comparable, giving us the methods >, >=, <, <=, == and
  # between?. Terms are ordered by field first and by text second.
  #
  # Returns nil when _other_ does not expose both _field_ and _text_ (for
  # example nil), so that Comparable#== answers false instead of raising
  # NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:field) && other.respond_to?(:text)

    if @field == other.field
      return @text <=> other.text
    else
      return @field <=> other.field
    end
  end
  alias :eql? :==

  # Resets the field and text of a Term.
  def set!(fld_name, txt)
    initialize(fld_name, txt)
  end

  # Returns "field:text". Interpolation is used so that a field given as a
  # Symbol is rendered as well as one given as a String.
  def to_s
    "#{@field}:#{@text}"
  end
end
49
+ end
@@ -0,0 +1,88 @@
1
+ module Ferret::Index
2
# Reusable buffer holding the field and text of a term. Only the first
# _text_length_ characters of _text_ are significant; see _text_str_.
class TermBuffer
  include Comparable

  attr_reader :text, :text_length, :field

  def initialize
    @text = String.new
    @text_length = -1
    @field = nil
  end

  # Hash over the significant text and the field. It is built from
  # _text_str_ rather than the raw buffer because _read_ can leave stale
  # characters beyond _text_length_, and _<=>_ ignores those as well.
  def hash()
    return [text_str, @field].hash
  end

  # Orders buffers by field first, then by their significant text.
  def <=>(other)
    if @field == other.field
      return text_str <=> other.text_str
    end
    @field <=> other.field
  end

  # Reads the next term from _input_.
  #
  # input::       object answering read_vint and read_chars(buffer, start, length)
  # field_infos:: indexable collection; the entry at the field number read
  #               from _input_ must answer _name_
  #
  # Reads a start offset and a length, has _input_ fill _length_ characters
  # into the text buffer starting at _start_, then reads the field number.
  def read(input, field_infos)
    @term = nil # invalidate cache
    start = input.read_vint()
    length = input.read_vint()
    @text_length = start + length
    input.read_chars(@text, start, length)
    @field = field_infos[input.read_vint()].name
  end

  # Loads the buffer from _term_, or resets it when _term_ is nil.
  def term=(term)
    # nil? is used instead of == so the check never dispatches into the
    # term's own comparison operator with a nil operand.
    if term.nil?
      reset()
      return
    end

    # copy text into the buffer
    @text_length = term.text.length
    @text = term.text.clone

    @field = term.field
    @term = term
  end

  # Copies the state of another TermBuffer into this one.
  def set!(other)
    @text_length = other.text_length
    @text = other.text.clone if other.text
    @field = other.field
    @term = other.term
  end

  # Clears field, text and the cached term.
  def reset()
    @field = nil
    @text = String.new
    @text_length = 0
    @term = nil
  end

  # Returns the buffered term as a Term, or nil when no field is set. The
  # Term is created on first use and cached until the buffer changes.
  def to_term()
    return nil if @field.nil? # unset

    if @term.nil?
      @term = Term.new(@field, @text[0, @text_length].to_s)
    end
    return @term
  end
  alias :term :to_term

  # Returns a new TermBuffer carrying a copy of this buffer's state.
  def clone()
    copy = TermBuffer.new()
    copy.set!(self)
    return copy
  end

  # The significant part of the text buffer.
  def text_str()
    @text[0, @text_length]
  end

  def to_s()
    to_term.to_s
  end
end
88
+ end
@@ -0,0 +1,283 @@
1
+ module Ferret::Index
2
+ # TermDocEnum provides an interface for enumerating <document,
3
+ # frequency> pairs for a term.
4
+ #
5
+ # The document portion names each document containing the term. Documents
6
+ # are indicated by number. The frequency portion gives the number of times
7
+ # the term occurred in each document.
8
+ #
9
+ # The pairs are ordered by document number.
10
+ #
11
+ # See IndexReader#term_docs
12
class TermDocEnum
  # Sets this to the data for a term.
  # The enumeration is reset to the start of the data for this term.
  def seek(term)
    raise NotImplementedError
  end

  # Returns the current document number.
  #
  # This is invalid until #next?() is called for the first time.
  def doc()
    raise NotImplementedError
  end

  # Returns the frequency of the term within the current document. This
  # is invalid until #next?() is called for the first time.
  def freq()
    raise NotImplementedError
  end

  # Moves to the next pair in the enumeration.
  # Returns true iff there is such a next pair in the enumeration.
  def next?()
    raise NotImplementedError
  end

  # Attempts to read multiple entries from the enumeration, up to length of
  # _docs_. Document numbers are stored in _docs_, and term
  # frequencies are stored in _freqs_. The _freqs_ array must be as
  # long as the _docs_ array.
  #
  # Returns the number of entries read. Zero is only returned when the
  # stream has been exhausted.
  def read(docs, freqs)
    raise NotImplementedError
  end

  # Skips entries to the first beyond the current whose document number is
  # greater than or equal to _target_.
  #
  # Returns true iff there is such an entry.
  #
  # This default always advances at least once before looking at #doc(),
  # because #doc() is invalid until #next?() has been called and because
  # the contract is to land on an entry beyond the current one.
  # Some implementations are considerably more efficient than this.
  def skip_to(target)
    begin
      return false unless next?()
    end while target > doc()
    return true
  end

  # Frees associated resources.
  def close()
    raise NotImplementedError
  end
end
55
+
56
+
57
# TermDocEnum over a single segment. Document numbers and frequencies are
# decoded from a clone of the parent's frequency stream.
class SegmentTermDocEnum < TermDocEnum
  attr_accessor :parent, :freq_stream, :count, :deleted_docs, :doc, :freq

  # parent:: object supplying freq_stream, deleted_docs, term_infos and
  #          field_infos
  def initialize(parent)
    @parent = parent
    @freq_stream = parent.freq_stream.clone()
    @deleted_docs = parent.deleted_docs
    @skip_interval = parent.term_infos.skip_interval
    @skip_stream = nil
    @doc = 0
    # Start out as an empty enumeration so that next?, read and skip_to are
    # safe to call before the first seek.
    @count = 0
    @doc_freq = 0
  end

  # Document frequency of the term last sought. The value is kept in
  # @doc_freq, so the accessor pair maps onto that variable.
  def df()
    @doc_freq
  end

  def df=(value)
    @doc_freq = value
  end

  # Find the term, TermEnum or TermInfo in the doc
  #
  # t:: can be a Term, TermEnum or TermInfo object
  #
  # Raises ArgumentError for any other kind of object.
  def seek(t)
    if t.instance_of?(Term)
      ti = parent.term_infos[t]
    elsif t.is_a?(TermEnum)
      # use comparison of fieldinfos to verify that term enum (t) belongs to the
      # same segment as this SegmentTermDocEnum
      if t.instance_of?(SegmentTermEnum) and t.field_infos == parent.field_infos
        ti = t.term_info()
      else # punt case
        ti = parent.term_infos[t.term]
      end
    elsif t.is_a?(TermInfo) # this one is easy. That's exactly what we're looking for
      ti = t
    else
      raise ArgumentError, "Must pass a Term, TermEnum or TermInfo object, not a " +
        t.class.to_s
    end
    do_seek(ti)
  end

  # Positions the enumeration at the start of the data described by _ti_.
  # A nil _ti_ leaves an enumeration with a document frequency of zero.
  def do_seek(ti)
    @count = 0
    if ti.nil?
      @doc_freq = 0
    else
      @doc_freq = ti.doc_freq
      @doc = 0
      @skip_doc = 0
      @skip_count = 0
      @num_skips = @doc_freq / @skip_interval
      @freq_pointer = ti.freq_pointer
      @prox_pointer = ti.prox_pointer
      @skip_pointer = @freq_pointer + ti.skip_offset
      @freq_stream.seek(@freq_pointer)
      @have_skipped = false
    end
  end

  # Closes the frequency stream and, if it was opened, the skip stream.
  def close()
    @freq_stream.close()
    @skip_stream.close() unless @skip_stream.nil?
    @parent = nil
  end

  # Hook called by next? for every deleted document it steps over. Does
  # nothing here.
  def skipping_doc()
  end

  # Advances to the next document that is not marked in deleted_docs.
  # Returns false once all entries of the term have been consumed.
  def next?()
    while true
      return false if @count == @doc_freq

      doc_code = @freq_stream.read_vint()
      @doc += doc_code >> 1                 # shift off low bit
      if (doc_code & 1) != 0                # if low bit is set
        @freq = 1                           # freq is one
      else
        @freq = @freq_stream.read_vint()    # else read freq
      end

      @count += 1

      break if @deleted_docs.nil? || !@deleted_docs[@doc]

      skipping_doc()
    end
    return true
  end

  # Optimized implementation.
  #
  # Fills _docs_ and _freqs_ from index _start_ up to the length of _docs_,
  # leaving out deleted documents. Returns the index one past the last slot
  # written.
  def read(docs, freqs, start = 0)
    i = start
    needed = docs.length

    while i < needed and @count < @doc_freq
      # manually inlined call to next?() for speed
      doc_code = @freq_stream.read_vint()
      @doc += doc_code >> 1                 # shift off low bit
      if (doc_code & 1) != 0                # if low bit is set
        @freq = 1                           # freq is one
      else
        @freq = @freq_stream.read_vint()    # else read freq
      end
      @count += 1

      if @deleted_docs.nil? || !@deleted_docs[@doc]
        docs[i] = @doc
        freqs[i] = @freq
        i += 1
      end
    end
    return i
  end

  # Overridden by SegmentTermDocPosEnum to skip in prox stream.
  def skip_prox(prox_pointer)
  end

  # Optimized implementation.
  #
  # Uses the skip data when the term has at least one skip interval worth
  # of documents, then scans entry by entry. Returns true iff an entry with
  # a document number greater than or equal to _target_ was reached.
  def skip_to(target)
    if @doc_freq >= @skip_interval # optimized case

      if @skip_stream.nil?
        @skip_stream = @freq_stream.clone() # lazily clone
      end

      if !@have_skipped # lazily seek skip stream
        @skip_stream.seek(@skip_pointer)
        @have_skipped = true
      end

      # scan skip data
      last_skip_doc = @skip_doc
      last_freq_pointer = @freq_stream.pos()
      last_prox_pointer = -1
      num_skipped = -1 - (@count % @skip_interval)

      while target > @skip_doc
        last_skip_doc = @skip_doc
        last_freq_pointer = @freq_pointer
        last_prox_pointer = @prox_pointer

        if @skip_doc != 0 and @skip_doc >= @doc
          num_skipped += @skip_interval
        end

        break if @skip_count >= @num_skips

        @skip_doc += @skip_stream.read_vint()
        @freq_pointer += @skip_stream.read_vint()
        @prox_pointer += @skip_stream.read_vint()

        @skip_count += 1
      end

      # if we found something to skip, then skip it
      if last_freq_pointer > @freq_stream.pos()
        @freq_stream.seek(last_freq_pointer)
        skip_prox(last_prox_pointer)

        @doc = last_skip_doc
        @count += num_skipped
      end

    end

    # done skipping, now just scan
    begin
      return false unless next?
    end while target > @doc
    return true
  end
end
233
+
234
# SegmentTermDocEnum that additionally walks a clone of the parent's
# proximity stream to yield term positions within the current document.
class SegmentTermDocPosEnum < SegmentTermDocEnum
  # p:: parent object; must also supply prox_stream
  def initialize(p)
    super
    @prox_stream = p.prox_stream.clone()
    # No positions are pending until a document has been read; this keeps
    # next? usable before the first seek.
    @prox_count = 0
    @position = 0
  end

  # Seeks the proximity stream along with the frequency stream.
  def do_seek(ti)
    super
    @prox_stream.seek(ti.prox_pointer) unless ti.nil?
    @prox_count = 0
  end

  def close()
    super
    @prox_stream.close()
  end

  # Returns the next position of the term in the current document. Values
  # in the proximity stream are added onto the previous position.
  def next_position()
    @prox_count -= 1
    return @position += @prox_stream.read_vint()
  end

  # Reads past the positions of a deleted document so the proximity stream
  # stays aligned with the frequency stream.
  def skipping_doc()
    @freq.times { @prox_stream.read_vint() }
  end

  def next?()
    # skip the positions of the current document that were not consumed
    @prox_count.times { @prox_stream.read_vint() }

    if super
      @prox_count = @freq # note frequency
      @position = 0       # reset position
      return true
    end
    return false
  end

  # Not supported. Accepts the same arguments as SegmentTermDocEnum#read so
  # that every caller gets the NotImplementedError below.
  def read(docs, freqs, start = 0)
    raise NotImplementedError, "TermDocPosEnum does not support processing multiple documents in one call. Use TermDocEnum instead."
  end

  # Called by SegmentTermDocEnum#skip_to.
  def skip_prox(prox_pointer)
    @prox_stream.seek(prox_pointer)
    @prox_count = 0
  end
end
283
+ end