ferret 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
@@ -0,0 +1,58 @@
|
|
1
|
+
module Ferret::Index
|
2
|
+
# Provides access to stored term vector of
|
3
|
+
# a document field.
|
4
|
+
class SegmentTermVector
  # term_frequencies:: Array of term frequencies. Slots correspond one to one
  #                    to the entries of _terms_; each slot holds the number
  #                    of times that term occurs in the document field.
  # positions::        stored exactly as passed to the constructor (may be nil)
  # offsets::          stored exactly as passed to the constructor (may be nil)
  attr_reader :term_frequencies, :positions, :offsets

  # field:: the field this vector was built for
  # terms:: array of the terms in the field (may be nil)
  attr_reader :field, :terms

  # field::      the field this term vector belongs to
  # terms::      array of terms, or nil
  # term_freqs:: array of frequencies, parallel to _terms_
  # positions::  optional position data (default nil)
  # offsets::    optional offset data (default nil)
  def initialize(field, terms, term_freqs, positions=nil, offsets=nil)
    @field = field
    @terms = terms
    @term_frequencies = term_freqs
    @positions = positions
    @offsets = offsets
  end

  # Returns a string of the form "{field: term/freq, term/freq}".
  #
  # The vector is wrapped in braces; earlier code emitted a stray literal
  # "end" where the closing brace belongs and had no opening brace.
  def to_s
    pairs = []
    if @terms
      @terms.each_with_index do |term, i|
        pairs << "#{term}/#{@term_frequencies[i]}"
      end
    end
    "{#{@field}: #{pairs.join(', ')}}"
  end

  # Returns the number of unique terms in the field (0 when there are none).
  def size
    @terms.nil? ? 0 : @terms.size
  end

  # Returns the index in the _terms_ array at which _term_ appears.
  # Returns nil if the term is not in the array or if there are no terms.
  def index_of(term)
    @terms ? @terms.index(term) : nil
  end

  # Just like _index_of_ but searches for a number of terms at the same
  # time. Returns an array with one slot per term searched for, each slot
  # containing the result of _index_of_ for that term.
  #
  # terms:: array containing terms to look for
  # start:: index in the array where the list of terms starts
  # len::   the number of terms in the list
  #
  # An out-of-range _start_ yields an empty array.
  def indexes_of(terms, start, len)
    # Array#[start, len] returns nil when start is past the end of the array.
    (terms[start, len] || []).map { |term| index_of(term) }
  end
end
|
58
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Ferret::Index
|
2
|
+
# A Term represents a word from text. This is the unit of search. It is
|
3
|
+
# composed of two elements, the text of the word, as a string, and the name of
|
4
|
+
# the field that the text occurred in, an interned string.
|
5
|
+
#
|
6
|
+
# Note that terms may represent more than words from text fields, but also
|
7
|
+
# things like dates, email addresses, urls, etc.
|
8
|
+
#
|
9
|
+
# A term contains two attributes;
|
10
|
+
# field:: The field indicates the part of a document which this term came from.
|
11
|
+
# text:: In the case of words, this is simply the text of the word. In the case
|
12
|
+
# of dates and other types, this is an encoding of the object as a string.
|
13
|
+
class Term
  # Comparable supplies >, >=, <, <=, == and between? on top of <=>.
  include Comparable

  # field:: the part of a document which this term came from
  attr_accessor :field
  # text:: the text of the term
  attr_accessor :text

  # Constructs a Term with the given field and text.
  def initialize(fld_name, txt)
    @field = fld_name
    @text = txt
  end

  # Combines the hash of the field and the hash of the text.
  def hash
    field.hash + text.hash
  end

  # Orders terms by field first and by text within the same field.
  #
  # Returns nil when _other_ does not expose both +field+ and +text+, so
  # that +==+ answers false (and the ordering operators raise
  # ArgumentError) instead of failing with NoMethodError.
  def <=>(other)
    return nil unless other.respond_to?(:field) && other.respond_to?(:text)

    if @field == other.field
      @text <=> other.text
    else
      @field <=> other.field
    end
  end
  # eql? shares Comparable#==, keeping it consistent with #hash for Hash keys.
  alias :eql? :==

  # Resets the field and text of a Term.
  def set!(fld_name, txt)
    initialize(fld_name, txt)
  end

  # Returns "field:text". Interpolation is used so that a non-String field
  # (for example a Symbol) does not raise.
  def to_s
    "#{@field}:#{@text}"
  end
end
|
49
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
module Ferret::Index
|
2
|
+
# A reusable, mutable holder for a term's field and text. Only the first
# +text_length+ characters of +text+ are significant (see #text_str).
class TermBuffer
  include Comparable

  attr_reader :text, :text_length, :field

  def initialize
    @text = String.new
    @text_length = -1
    @field = nil
    @term = nil # cached Term built lazily by #to_term
  end

  # Combines the hash of the significant text with the hash of the field.
  #
  # Uses #text_str rather than the raw buffer so that two buffers which
  # compare equal through <=> also hash equally, even when one of them
  # still carries characters beyond +text_length+.
  def hash
    text_str.hash + @field.hash
  end

  # Orders buffers by field first, then by significant text.
  # Returns nil when _other_ does not expose +field+ and +text_str+.
  def <=>(other)
    return nil unless other.respond_to?(:field) && other.respond_to?(:text_str)
    return text_str <=> other.text_str if @field == other.field

    @field <=> other.field
  end

  # Reads the next term from _input_ into this buffer.
  #
  # Reads a start offset and a length, asks _input_ to write _length_
  # characters into the text buffer beginning at _start_ (characters before
  # _start_ are kept from the previous term), then reads a field number and
  # resolves it to a name through _field_infos_.
  def read(input, field_infos)
    @term = nil # invalidate cache
    start = input.read_vint
    length = input.read_vint
    @text_length = start + length
    input.read_chars(@text, start, length)
    @field = field_infos[input.read_vint].name
  end

  # Loads _term_ into the buffer; passing nil resets the buffer.
  def term=(term)
    if term.nil?
      reset
      return
    end

    # copy text into the buffer
    @text_length = term.text.length
    @text = term.text.clone

    @field = term.field
    @term = term
  end

  # Copies the state of _other_ (another TermBuffer) into this buffer.
  def set!(other)
    @text_length = other.text_length
    @text = other.text.clone if other.text
    @field = other.field
    @term = other.term
  end

  # Clears the buffer back to an unset state.
  def reset
    @field = nil
    @text = String.new
    @text_length = 0
    @term = nil
  end

  # Returns the buffer contents as a Term, or nil when no field is set.
  # The Term is cached until the buffer is next modified.
  def to_term
    return nil if @field.nil? # unset

    @term = Term.new(@field, @text[0, @text_length].to_s) if @term.nil?
    @term
  end
  alias :term :to_term

  # Returns a new TermBuffer holding a copy of this buffer's state.
  def clone
    copy = TermBuffer.new
    copy.set!(self)
    copy
  end

  # Returns the significant part of the text: its first +text_length+
  # characters.
  def text_str
    @text[0, @text_length]
  end

  # Returns the string form of #to_term ("" when the buffer is unset).
  def to_s
    to_term.to_s
  end
end
|
88
|
+
end
|
@@ -0,0 +1,283 @@
|
|
1
|
+
module Ferret::Index
|
2
|
+
# TermDocEnum provides an interface for enumerating <document,
|
3
|
+
# frequency> pairs for a term.
|
4
|
+
#
|
5
|
+
# The document portion names each document containing the term. Documents
|
6
|
+
# are indicated by number. The frequency portion gives the number of times
|
7
|
+
# the term occurred in each document.
|
8
|
+
#
|
9
|
+
# The pairs are ordered by document number.
|
10
|
+
#
|
11
|
+
# See IndexReader#term_docs
|
12
|
+
class TermDocEnum
  # Sets this to the data for a term.
  # The enumeration is reset to the start of the data for this term.
  def seek(term)
    raise NotImplementedError
  end

  # Returns the current document number.
  #
  # This is invalid until #next? is called for the first time.
  def doc
    raise NotImplementedError
  end

  # Returns the frequency of the term within the current document.
  #
  # This is invalid until #next? is called for the first time.
  def freq
    raise NotImplementedError
  end

  # Moves to the next pair in the enumeration.
  # Returns true iff there is such a next pair in the enumeration.
  def next?
    raise NotImplementedError
  end

  # Attempts to read multiple entries from the enumeration, up to length of
  # _docs_. Document numbers are stored in _docs_, and term frequencies are
  # stored in _freqs_. The _freqs_ array must be as long as the _docs_
  # array.
  #
  # start:: optional index at which to begin filling the arrays. It is
  #         accepted here so that the abstract signature agrees with
  #         SegmentTermDocEnum#read and a call that passes it reaches
  #         NotImplementedError rather than failing with ArgumentError.
  #
  # Returns the number of entries read. Zero is only returned when the
  # stream has been exhausted.
  def read(docs, freqs, start = 0)
    raise NotImplementedError
  end

  # Skips entries to the first beyond the current whose document number is
  # greater than or equal to _target_.
  #
  # Returns true iff there is such an entry.
  #
  # This default simply calls #next? until #doc reaches _target_; some
  # implementations are considerably more efficient than that.
  def skip_to(target)
    while target > doc
      return false unless next?
    end
    true
  end

  # Frees associated resources.
  def close
    raise NotImplementedError
  end
end
|
55
|
+
|
56
|
+
|
57
|
+
# Enumerates the <document, frequency> pairs of a term inside one segment.
# Postings are read from a private clone of the parent's frequency stream;
# documents flagged in the parent's deleted_docs are passed over.
class SegmentTermDocEnum < TermDocEnum
  attr_accessor :parent, :freq_stream, :count, :deleted_docs, :doc, :freq

  # parent:: the owning reader; must respond to freq_stream, deleted_docs and
  #          term_infos (and field_infos when seeking with a TermEnum)
  def initialize(parent)
    @parent = parent
    @freq_stream = parent.freq_stream.clone
    @deleted_docs = parent.deleted_docs
    @skip_interval = parent.term_infos.skip_interval
    @skip_stream = nil
    @doc = 0
    # Start with an empty enumeration so next?, read and skip_to simply
    # report "nothing there" when called before the first seek.
    @count = 0
    @doc_freq = 0
  end

  # Document frequency of the current term, i.e. the number of postings the
  # enumeration holds. Backed by the same state the iteration uses.
  def df
    @doc_freq
  end

  def df=(doc_freq)
    @doc_freq = doc_freq
  end

  # Find the term, TermEnum or TermInfo in the doc
  #
  # t:: can be a Term, TermEnum or TermInfo object
  #
  # Raises ArgumentError for any other kind of object.
  def seek(t)
    if t.instance_of?(Term)
      ti = parent.term_infos[t]
    elsif t.is_a?(TermEnum)
      # use comparison of fieldinfos to verify that term enum (t) belongs to
      # the same segment as this SegmentTermDocEnum
      if t.instance_of?(SegmentTermEnum) && t.field_infos == parent.field_infos
        ti = t.term_info
      else # punt case
        ti = parent.term_infos[t.term]
      end
    elsif t.is_a?(TermInfo) # this one is easy. That's exactly what we're looking for
      ti = t
    else
      raise ArgumentError, "Must pass a Term, TermEnum or TermInfo object, not a " +
        t.class.to_s
    end
    do_seek(ti)
  end

  # Positions the enumeration at the postings described by _ti_.
  # A nil _ti_ leaves an empty enumeration (document frequency 0).
  def do_seek(ti)
    @count = 0
    if ti.nil?
      @doc_freq = 0
    else
      @doc_freq = ti.doc_freq
      @doc = 0
      @skip_doc = 0
      @skip_count = 0
      @num_skips = @doc_freq / @skip_interval
      @freq_pointer = ti.freq_pointer
      @prox_pointer = ti.prox_pointer
      @skip_pointer = @freq_pointer + ti.skip_offset
      @freq_stream.seek(@freq_pointer)
      @have_skipped = false
    end
  end

  # Closes the frequency stream and, if it was ever opened, the skip stream,
  # then drops the reference to the parent.
  def close
    @freq_stream.close
    @skip_stream.close unless @skip_stream.nil?
    @parent = nil
  end

  # Hook invoked by next? for every deleted document it passes over.
  # Does nothing here; SegmentTermDocPosEnum overrides it.
  def skipping_doc
  end

  # Moves to the next non-deleted posting.
  # Returns true iff there is one.
  def next?
    while true
      return false if @count == @doc_freq

      doc_code = @freq_stream.read_vint
      @doc += doc_code >> 1 # shift off low bit
      if (doc_code & 1) != 0 # if low bit is set
        @freq = 1 # freq is one
      else
        @freq = @freq_stream.read_vint # else read freq
      end

      @count += 1

      break if @deleted_docs.nil? || !@deleted_docs[@doc]

      skipping_doc
    end
    true
  end

  # Optimized implementation.
  #
  # Fills _docs_ and _freqs_ from index _start_ up to the length of _docs_,
  # leaving out deleted documents. Returns the index one past the last slot
  # written (equal to the number of entries read when _start_ is 0).
  def read(docs, freqs, start = 0)
    i = start
    needed = docs.length

    while i < needed && @count < @doc_freq
      # manually inlined call to next?() for speed
      doc_code = @freq_stream.read_vint
      @doc += doc_code >> 1 # shift off low bit
      if (doc_code & 1) != 0 # if low bit is set
        @freq = 1 # freq is one
      else
        @freq = @freq_stream.read_vint # else read freq
      end
      @count += 1

      if @deleted_docs.nil? || !@deleted_docs[@doc]
        docs[i] = @doc
        freqs[i] = @freq
        i += 1
      end
    end
    i
  end

  # Overridden by SegmentTermDocPosEnum to skip in prox stream.
  def skip_prox(prox_pointer)
  end

  # Optimized implementation.
  #
  # When the term has at least skip_interval postings, the skip data is
  # scanned first to jump the frequency stream close to _target_; the
  # remaining distance is covered with plain next? calls.
  #
  # Returns true iff an entry with document number >= _target_ exists.
  def skip_to(target)
    if @doc_freq >= @skip_interval # optimized case

      @skip_stream = @freq_stream.clone if @skip_stream.nil? # lazily clone

      unless @have_skipped # lazily seek skip stream
        @skip_stream.seek(@skip_pointer)
        @have_skipped = true
      end

      # scan skip data
      last_skip_doc = @skip_doc
      last_freq_pointer = @freq_stream.pos
      last_prox_pointer = -1
      num_skipped = -1 - (@count % @skip_interval)

      while target > @skip_doc
        last_skip_doc = @skip_doc
        last_freq_pointer = @freq_pointer
        last_prox_pointer = @prox_pointer

        if @skip_doc != 0 && @skip_doc >= @doc
          num_skipped += @skip_interval
        end

        break if @skip_count >= @num_skips

        # each skip entry holds three deltas: doc, freq pointer, prox pointer
        @skip_doc += @skip_stream.read_vint
        @freq_pointer += @skip_stream.read_vint
        @prox_pointer += @skip_stream.read_vint

        @skip_count += 1
      end

      # if we found something to skip, then skip it
      if last_freq_pointer > @freq_stream.pos
        @freq_stream.seek(last_freq_pointer)
        skip_prox(last_prox_pointer)

        @doc = last_skip_doc
        @count += num_skipped
      end

    end

    # done skipping, now just scan
    begin
      return false unless next?
    end while target > @doc
    true
  end
end
|
233
|
+
|
234
|
+
# A SegmentTermDocEnum that additionally exposes the positions of the term
# inside the current document, read from a private clone of the parent's
# prox stream.
class SegmentTermDocPosEnum < SegmentTermDocEnum
  # p:: the owning reader; in addition to what SegmentTermDocEnum needs it
  #     must respond to prox_stream
  def initialize(p)
    super
    @prox_stream = p.prox_stream.clone
    # No unread positions yet; keeps next? usable before the first seek.
    @prox_count = 0
    @position = 0
  end

  # Seeks the doc/freq data as the superclass does and, when _ti_ is given,
  # moves the prox stream to the term's prox pointer.
  def do_seek(ti)
    super
    @prox_stream.seek(ti.prox_pointer) unless ti.nil?
    @prox_count = 0
  end

  # Closes the streams held by the superclass and then the prox stream.
  def close
    super
    @prox_stream.close
  end

  # Returns the next position of the term in the current document.
  # Positions are stored as deltas, so each value read is added to the
  # previous position.
  def next_position
    @prox_count -= 1
    @position += @prox_stream.read_vint
  end

  # Called by next? for each deleted document passed over: discards that
  # document's positions (one per occurrence) from the prox stream.
  def skipping_doc
    @freq.times { @prox_stream.read_vint }
  end

  # Moves to the next document. Any positions of the current document that
  # were not consumed through next_position are discarded first.
  # Returns true iff there is a next document.
  def next?
    @prox_count.times { @prox_stream.read_vint }

    if super
      @prox_count = @freq # note frequency
      @position = 0 # reset position
      return true
    end
    false
  end

  # Not supported by this enumeration; always raises NotImplementedError.
  # _start_ is accepted so the signature matches SegmentTermDocEnum#read and
  # a call that passes it gets this error rather than an ArgumentError.
  def read(docs, freqs, start = 0)
    raise NotImplementedError, "TermDocPosEnum does not support processing multiple documents in one call. Use TermDocEnum instead."
  end

  # Called by the superclass's skip_to.
  def skip_prox(prox_pointer)
    @prox_stream.seek(prox_pointer)
    @prox_count = 0
  end
end
|
283
|
+
end
|