ferret 0.9.6 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,52 +0,0 @@
|
|
1
|
-
module Ferret
|
2
|
-
module Index
|
3
|
-
# Abstract class for enumerating terms.
|
4
|
-
#
|
5
|
-
# Term enumerations are always ordered by Term.<=>. Each term in
|
6
|
-
# the enumeration is greater than all that precede it.
|
7
|
-
class TermEnum
|
8
|
-
# Increments the enumeration to the next element. True if one exists.
|
9
|
-
def next?
|
10
|
-
raise NotImplementedError
|
11
|
-
end
|
12
|
-
|
13
|
-
# Returns the current Term in the enumeration.
|
14
|
-
def term
|
15
|
-
raise NotImplementedError
|
16
|
-
end
|
17
|
-
|
18
|
-
# Returns the doc_freq of the current Term in the enumeration.
|
19
|
-
def doc_freq
|
20
|
-
raise NotImplementedError
|
21
|
-
end
|
22
|
-
|
23
|
-
# Closes the enumeration to further activity, freeing resources.
|
24
|
-
def close
|
25
|
-
raise NotImplementedError
|
26
|
-
end
|
27
|
-
|
28
|
-
# Term Vector support
|
29
|
-
# Skips terms to the first beyond the current whose value is
|
30
|
-
# greater or equal to _target_.
|
31
|
-
#
|
32
|
-
# Returns true iff there is such a term.
|
33
|
-
#
|
34
|
-
# Behaves as if written:
|
35
|
-
#
|
36
|
-
# def skip_to(target)
|
37
|
-
# while (target > term)
|
38
|
-
# if (!next()) return false
|
39
|
-
# end
|
40
|
-
# return true
|
41
|
-
# end
|
42
|
-
#
|
43
|
-
# Some implementations are considerably more efficient than that.
|
44
|
-
def skip_to(target)
|
45
|
-
while (target > term)
|
46
|
-
return false if not next?
|
47
|
-
end
|
48
|
-
return true
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
@@ -1,37 +0,0 @@
|
|
1
|
-
module Ferret::Index
|
2
|
-
# A TermInfo is the record of information stored for a term.
|
3
|
-
class TermInfo
|
4
|
-
attr_accessor :doc_freq, :freq_pointer, :prox_pointer, :skip_offset
|
5
|
-
|
6
|
-
def initialize(df=0, fp=0, pp=0, so=0)
|
7
|
-
set_values!(df, fp, pp, so)
|
8
|
-
end
|
9
|
-
|
10
|
-
def set!(ti)
|
11
|
-
@doc_freq = ti.doc_freq
|
12
|
-
@freq_pointer = ti.freq_pointer
|
13
|
-
@prox_pointer = ti.prox_pointer
|
14
|
-
@skip_offset = ti.skip_offset
|
15
|
-
end
|
16
|
-
|
17
|
-
def set_values!(df=0, fp=0, pp=0, so=0)
|
18
|
-
@doc_freq = df
|
19
|
-
@freq_pointer = fp
|
20
|
-
@prox_pointer = pp
|
21
|
-
@skip_offset = so
|
22
|
-
end
|
23
|
-
|
24
|
-
def ==(o)
|
25
|
-
return false if !o.instance_of?(TermInfo)
|
26
|
-
@doc_freq == o.doc_freq &&
|
27
|
-
@freq_pointer == o.freq_pointer &&
|
28
|
-
@prox_pointer == o.prox_pointer &&
|
29
|
-
@skip_offset == o.skip_offset
|
30
|
-
end
|
31
|
-
alias eql? ==
|
32
|
-
|
33
|
-
def to_s()
|
34
|
-
"TermInfo:df=#{doc_freq}:fp=#{freq_pointer}:pp=#{prox_pointer}:so=#{skip_offset}"
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
@@ -1,321 +0,0 @@
|
|
1
|
-
require 'monitor'
|
2
|
-
module Ferret::Index
|
3
|
-
|
4
|
-
# This stores a monotonically increasing set of <Term, TermInfo> pairs in a
|
5
|
-
# Directory. A TermInfos can be written once, in order.
|
6
|
-
class TermInfosWriter
|
7
|
-
attr_reader :index_interval, :skip_interval, :out
|
8
|
-
attr_writer :other
|
9
|
-
# The file format version, a negative number.
|
10
|
-
FORMAT = -2
|
11
|
-
|
12
|
-
|
13
|
-
# TODO: the default values for these two parameters should be settable
|
14
|
-
# from IndexWriter. However, once that's done, folks will start setting
|
15
|
-
# them to ridiculous values and complaining that things don't work well,
|
16
|
-
# as with mergeFactor. So, let's wait until a number of folks find that
|
17
|
-
# alternate values work better. Note that both of these values are
|
18
|
-
# stored in the segment, so that it's safe to change these w/o
|
19
|
-
# rebuilding all indexes.
|
20
|
-
|
21
|
-
# Expert: The fraction of terms in the "dictionary" which should be
|
22
|
-
# stored in RAM. Smaller values use more memory, but make searching
|
23
|
-
# slightly faster, while larger values use less memory and make
|
24
|
-
# searching slightly slower. Searching is typically not dominated by
|
25
|
-
# dictionary lookup, so tweaking this is rarely useful.
|
26
|
-
#
|
27
|
-
# Expert: The fraction of TermDocEnum entries stored in skip
|
28
|
-
# tables, used to accellerate TermDocEnum#skipTo(int). Larger
|
29
|
-
# values result in smaller indexes, greater acceleration, but fewer
|
30
|
-
# accelerable cases, while smaller values result in bigger indexes, less
|
31
|
-
# acceleration and more accelerable cases. More detailed experiments
|
32
|
-
# would be useful here.
|
33
|
-
def initialize(dir, segment, fis, interval, is_index = false)
|
34
|
-
@index_interval = interval
|
35
|
-
@skip_interval = 16
|
36
|
-
@last_index_pointer = 0
|
37
|
-
@last_term = Term.new("", "")
|
38
|
-
@last_term_info = TermInfo.new()
|
39
|
-
@size = 0
|
40
|
-
@is_index = is_index
|
41
|
-
@field_infos = fis
|
42
|
-
@out = dir.create_output(segment + (@is_index ? ".tii" : ".tis"))
|
43
|
-
@out.write_int(FORMAT) # write format
|
44
|
-
@out.write_long(0) # leave space for size
|
45
|
-
@out.write_int(@index_interval) # write @index_interval
|
46
|
-
@out.write_int(@skip_interval) # write @skip_interval
|
47
|
-
unless is_index
|
48
|
-
@other = TermInfosWriter.new(dir, segment, fis, interval, true)
|
49
|
-
@other.other = self
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
# Adds a new <Term, TermInfo> pair to the set.
|
54
|
-
# Term must be lexicographically greater than all previous Terms added.
|
55
|
-
# TermInfo pointers must be positive and greater than all previous.
|
56
|
-
def add(term, term_info)
|
57
|
-
if (not @is_index and @last_term > term)
|
58
|
-
raise IOError, "term out of order #{term.text} < #{@last_term.text}"
|
59
|
-
end
|
60
|
-
if (term_info.freq_pointer < @last_term_info.freq_pointer)
|
61
|
-
raise IOError, "freq pointer out of order"
|
62
|
-
end
|
63
|
-
if (term_info.prox_pointer < @last_term_info.prox_pointer)
|
64
|
-
raise IOError, "prox pointer out of order"
|
65
|
-
end
|
66
|
-
|
67
|
-
if (not @is_index and @size % @index_interval == 0)
|
68
|
-
@other.add(@last_term, @last_term_info) # add an index term
|
69
|
-
end
|
70
|
-
|
71
|
-
write_term(term) # write term
|
72
|
-
@out.write_vint(term_info.doc_freq) # write doc freq
|
73
|
-
@out.write_vlong(term_info.freq_pointer - @last_term_info.freq_pointer)
|
74
|
-
@out.write_vlong(term_info.prox_pointer - @last_term_info.prox_pointer)
|
75
|
-
@out.write_vint(term_info.skip_offset) if (term_info.doc_freq >= @skip_interval)
|
76
|
-
|
77
|
-
if (@is_index)
|
78
|
-
@out.write_vlong(@other.out.pos() - @last_index_pointer)
|
79
|
-
@last_index_pointer = @other.out.pos() # write pointer
|
80
|
-
end
|
81
|
-
|
82
|
-
@last_term_info.set!(term_info)
|
83
|
-
@size += 1
|
84
|
-
end
|
85
|
-
|
86
|
-
# Called to complete TermInfos creation.
|
87
|
-
def close()
|
88
|
-
@out.seek(4) # write @size after format
|
89
|
-
@out.write_long(@size)
|
90
|
-
@out.close()
|
91
|
-
|
92
|
-
@other.close() unless @is_index
|
93
|
-
end
|
94
|
-
|
95
|
-
private
|
96
|
-
def write_term(term)
|
97
|
-
start = Ferret::Utils::StringHelper.string_difference(@last_term.text, term.text)
|
98
|
-
length = term.text.length() - start
|
99
|
-
|
100
|
-
@out.write_vint(start) # write shared prefix length
|
101
|
-
@out.write_vint(length) # write delta length
|
102
|
-
@out.write_chars(term.text, start, length) # write delta chars
|
103
|
-
@out.write_vint(@field_infos.field_number(term.field)) # write field num
|
104
|
-
@last_term = term
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
|
109
|
-
# This stores a monotonically increasing set of <Term, TermInfo> pairs in a
|
110
|
-
# Directory. Pairs are accessed either by Term or by ordinal position the
|
111
|
-
# set.
|
112
|
-
class TermInfosReader
|
113
|
-
include MonitorMixin
|
114
|
-
|
115
|
-
def initialize(dir, seg, fis)
|
116
|
-
super()
|
117
|
-
|
118
|
-
@directory = dir
|
119
|
-
@segment = seg
|
120
|
-
@field_infos = fis
|
121
|
-
|
122
|
-
@orig_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tis"),
|
123
|
-
@field_infos, false)
|
124
|
-
@size = @orig_enum.size
|
125
|
-
@skip_interval = @orig_enum.skip_interval
|
126
|
-
@index_enum = SegmentTermEnum.new(@directory.open_input(@segment + ".tii"),
|
127
|
-
@field_infos, true)
|
128
|
-
@index_terms = nil
|
129
|
-
@index_infos = nil
|
130
|
-
@index_pointers = nil
|
131
|
-
end
|
132
|
-
|
133
|
-
def close()
|
134
|
-
# clear this threads cache
|
135
|
-
@orig_enum.close() if (@orig_enum != nil)
|
136
|
-
@index_enum.close() if (@index_enum != nil)
|
137
|
-
end
|
138
|
-
|
139
|
-
# Returns the number of term/value pairs in the set.
|
140
|
-
attr_reader :size
|
141
|
-
# The skip interval for the original enumerator
|
142
|
-
attr_reader :skip_interval
|
143
|
-
|
144
|
-
|
145
|
-
# Returns the TermInfo for a Term in the set, or nil.
|
146
|
-
def get_term_info(term)
|
147
|
-
return nil if (@size == 0)
|
148
|
-
|
149
|
-
ensure_index_is_read()
|
150
|
-
|
151
|
-
# optimize sequential access: first try scanning cached enum w/o seeking
|
152
|
-
e = enum()
|
153
|
-
if e.term and term >= e.term
|
154
|
-
enum_offset = (e.position / e.index_interval).to_i + 1
|
155
|
-
if (@index_terms.length == enum_offset or
|
156
|
-
term < @index_terms[enum_offset]) # but before end of block
|
157
|
-
return scan_for_term_info(term) # no need to seek
|
158
|
-
end
|
159
|
-
end
|
160
|
-
|
161
|
-
# random-access: must seek
|
162
|
-
seek_enum(get_index_offset(term))
|
163
|
-
return scan_for_term_info(term)
|
164
|
-
end
|
165
|
-
alias :[] :get_term_info
|
166
|
-
|
167
|
-
# Returns the nth term in the set.
|
168
|
-
def get_term(position)
|
169
|
-
return nil if (@size == 0)
|
170
|
-
|
171
|
-
e = enum()
|
172
|
-
if (e != nil and
|
173
|
-
e.term != nil and
|
174
|
-
position >= e.position and
|
175
|
-
position < (e.position + e.index_interval))
|
176
|
-
return scan_for_term(position) # can avoid seek
|
177
|
-
end
|
178
|
-
|
179
|
-
seek_enum((position / e.index_interval).to_i) # must seek
|
180
|
-
return scan_for_term(position)
|
181
|
-
end
|
182
|
-
|
183
|
-
def get_terms_position(term)
|
184
|
-
return nil if (@size == 0)
|
185
|
-
ensure_index_is_read
|
186
|
-
seek_enum(get_index_offset(term))
|
187
|
-
|
188
|
-
e = enum()
|
189
|
-
|
190
|
-
while term > e.term and e.next?
|
191
|
-
end
|
192
|
-
|
193
|
-
return term == e.term ? e.position : -1
|
194
|
-
end
|
195
|
-
|
196
|
-
# Returns an enumeration of all the Terms and TermInfos in the set.
|
197
|
-
def terms()
|
198
|
-
return @orig_enum.clone()
|
199
|
-
end
|
200
|
-
|
201
|
-
# Returns an enumeration of terms starting at or after the named term.
|
202
|
-
def terms_from(term)
|
203
|
-
get_term_info(term)
|
204
|
-
return enum().clone()
|
205
|
-
end
|
206
|
-
|
207
|
-
private
|
208
|
-
|
209
|
-
def enum()
|
210
|
-
#te_cache = Thread.current["term_enum"]
|
211
|
-
#if (te_cache == nil)
|
212
|
-
# te_cache = Thread.current["term_enum"] = Ferret::Utils::WeakKeyHash.new
|
213
|
-
#end
|
214
|
-
#te_cache.synchronize do
|
215
|
-
# term_enum = te_cache[self]
|
216
|
-
# if term_enum == nil
|
217
|
-
# term_enum = terms()
|
218
|
-
# te_cache[self] = term_enum
|
219
|
-
# end
|
220
|
-
# return term_enum
|
221
|
-
#end
|
222
|
-
term_enum = Thread.current.get_local(self)
|
223
|
-
if term_enum.nil?
|
224
|
-
Thread.current.set_local(self, term_enum = terms())
|
225
|
-
end
|
226
|
-
return term_enum
|
227
|
-
end
|
228
|
-
|
229
|
-
def ensure_index_is_read()
|
230
|
-
synchronize() do
|
231
|
-
return if @index_terms
|
232
|
-
begin
|
233
|
-
index_size = @index_enum.size
|
234
|
-
|
235
|
-
@index_terms = Array.new(index_size)
|
236
|
-
@index_infos = Array.new(index_size)
|
237
|
-
@index_pointers = Array.new(index_size)
|
238
|
-
|
239
|
-
i = 0
|
240
|
-
while @index_enum.next?
|
241
|
-
@index_terms[i] = @index_enum.term
|
242
|
-
@index_infos[i] = @index_enum.term_info
|
243
|
-
@index_pointers[i] = @index_enum.index_pointer
|
244
|
-
i += 1
|
245
|
-
end
|
246
|
-
ensure
|
247
|
-
@index_enum.close()
|
248
|
-
@index_enum = nil
|
249
|
-
end
|
250
|
-
end
|
251
|
-
end
|
252
|
-
|
253
|
-
# Returns the offset of the greatest index entry which is less than or
|
254
|
-
# equal to term.
|
255
|
-
#
|
256
|
-
# This method is rewritten in the C extension.
|
257
|
-
def get_index_offset(term)
|
258
|
-
lo = 0 # binary search @index_terms[]
|
259
|
-
hi = @index_terms.length - 1
|
260
|
-
|
261
|
-
while (hi >= lo)
|
262
|
-
mid = (lo + hi) >> 1
|
263
|
-
delta = term <=> @index_terms[mid]
|
264
|
-
if (delta < 0)
|
265
|
-
hi = mid - 1
|
266
|
-
elsif (delta > 0)
|
267
|
-
lo = mid + 1
|
268
|
-
else
|
269
|
-
return mid
|
270
|
-
end
|
271
|
-
end
|
272
|
-
return hi
|
273
|
-
end
|
274
|
-
|
275
|
-
def seek_enum(ind_offset)
|
276
|
-
enum().seek(@index_pointers[ind_offset],
|
277
|
-
(ind_offset * enum().index_interval) - 1,
|
278
|
-
@index_terms[ind_offset],
|
279
|
-
@index_infos[ind_offset])
|
280
|
-
end
|
281
|
-
|
282
|
-
# Scans within block for matching term.
|
283
|
-
def scan_for_term_info(term)
|
284
|
-
e = enum()
|
285
|
-
e.scan_to(term)
|
286
|
-
if e.term != nil and term == e.term
|
287
|
-
return e.term_info()
|
288
|
-
else
|
289
|
-
return nil
|
290
|
-
end
|
291
|
-
end
|
292
|
-
|
293
|
-
def scan_for_term(position)
|
294
|
-
e = enum()
|
295
|
-
while (e.position < position)
|
296
|
-
return nil if not e.next?
|
297
|
-
end
|
298
|
-
|
299
|
-
return e.term
|
300
|
-
end
|
301
|
-
|
302
|
-
# Returns the position of a Term in the set or -1.
|
303
|
-
def get_position(term)
|
304
|
-
return -1 if (@size == 0)
|
305
|
-
|
306
|
-
ind_offset = get_index_offset(term)
|
307
|
-
seek_enum(ind_offset)
|
308
|
-
|
309
|
-
e = enum()
|
310
|
-
while (term > e.term and e.next?)
|
311
|
-
end
|
312
|
-
|
313
|
-
if (term == e.term())
|
314
|
-
return e.position
|
315
|
-
else
|
316
|
-
return -1
|
317
|
-
end
|
318
|
-
end
|
319
|
-
|
320
|
-
end
|
321
|
-
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
module Ferret::Index
|
2
|
-
class TermVectorOffsetInfo
|
3
|
-
attr_accessor :start, :end
|
4
|
-
|
5
|
-
def initialize(start, endd)
|
6
|
-
@end = endd
|
7
|
-
@start = start
|
8
|
-
end
|
9
|
-
|
10
|
-
def eql?(o)
|
11
|
-
return false if !o.instance_of?(TermVectorOffsetInfo)
|
12
|
-
@end == o.end and @start == o.start
|
13
|
-
end
|
14
|
-
alias :== :eql?
|
15
|
-
|
16
|
-
def hash()
|
17
|
-
29 * @start + @end
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|
@@ -1,553 +0,0 @@
|
|
1
|
-
module Ferret::Index
|
2
|
-
# Writer works by opening a document and then opening the fields within
|
3
|
-
# the document and then writing out the vectors for each field.
|
4
|
-
#
|
5
|
-
# Rough usage:
|
6
|
-
#
|
7
|
-
# for each document
|
8
|
-
#
|
9
|
-
# writer.open_document()
|
10
|
-
# for each field on the document
|
11
|
-
#
|
12
|
-
# writer.open_field(field)
|
13
|
-
# for all of the @terms
|
14
|
-
#
|
15
|
-
# writer.add_term(...)
|
16
|
-
# end
|
17
|
-
# writer.close_field
|
18
|
-
# end
|
19
|
-
# writer.close_document()
|
20
|
-
# end
|
21
|
-
#
|
22
|
-
#
|
23
|
-
class TermVectorsWriter
|
24
|
-
STORE_POSITIONS_WITH_TERMVECTOR = 0x1
|
25
|
-
STORE_OFFSET_WITH_TERMVECTOR = 0x2
|
26
|
-
|
27
|
-
FORMAT_VERSION = 2
|
28
|
-
|
29
|
-
# The size in bytes that the FORMAT_VERSION will take up at the beginning
|
30
|
-
# of each file
|
31
|
-
FORMAT_SIZE = 4
|
32
|
-
|
33
|
-
TVX_EXTENSION = ".tvx"
|
34
|
-
TVD_EXTENSION = ".tvd"
|
35
|
-
TVF_EXTENSION = ".tvf"
|
36
|
-
|
37
|
-
def initialize(directory, segment, field_infos)
|
38
|
-
@current_field = nil
|
39
|
-
@current_doc_pointer = -1
|
40
|
-
|
41
|
-
# Open files for TermVector storage
|
42
|
-
@tvx = directory.create_output(segment + TVX_EXTENSION)
|
43
|
-
@tvx.write_int(FORMAT_VERSION)
|
44
|
-
@tvd = directory.create_output(segment + TVD_EXTENSION)
|
45
|
-
@tvd.write_int(FORMAT_VERSION)
|
46
|
-
@tvf = directory.create_output(segment + TVF_EXTENSION)
|
47
|
-
@tvf.write_int(FORMAT_VERSION)
|
48
|
-
|
49
|
-
@field_infos = field_infos
|
50
|
-
@fields = []
|
51
|
-
@terms = []
|
52
|
-
end
|
53
|
-
|
54
|
-
|
55
|
-
def open_document()
|
56
|
-
close_document()
|
57
|
-
@current_doc_pointer = @tvd.pos()
|
58
|
-
end
|
59
|
-
|
60
|
-
|
61
|
-
def close_document()
|
62
|
-
|
63
|
-
if (document_open?())
|
64
|
-
close_field()
|
65
|
-
write_doc()
|
66
|
-
@fields.clear()
|
67
|
-
@current_doc_pointer = -1
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
|
72
|
-
def document_open?()
|
73
|
-
return @current_doc_pointer != -1
|
74
|
-
end
|
75
|
-
|
76
|
-
|
77
|
-
# Start processing a field. This can be followed by a number of calls to
|
78
|
-
# add_term, and a final call to close_field to indicate the end of
|
79
|
-
# processing of this field. If a field was previously open, it is closed
|
80
|
-
# automatically.
|
81
|
-
def open_field(field)
|
82
|
-
field_info = @field_infos[field]
|
83
|
-
create_field(field_info.number,
|
84
|
-
field_info.store_positions?,
|
85
|
-
field_info.store_offsets?)
|
86
|
-
end
|
87
|
-
|
88
|
-
# Finished processing current field. This should be followed by a call
|
89
|
-
# to open_field before future calls to add_term.
|
90
|
-
def close_field()
|
91
|
-
if field_open?
|
92
|
-
#puts("close_field()")
|
93
|
-
|
94
|
-
# save field and @terms
|
95
|
-
write_field()
|
96
|
-
@fields << @current_field
|
97
|
-
@terms.clear()
|
98
|
-
@current_field = nil
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
|
-
# Return true if a field is currently open.
|
103
|
-
def field_open?()
|
104
|
-
return @current_field != nil
|
105
|
-
end
|
106
|
-
|
107
|
-
# Add term to the field's term vector. Field must already be open.
|
108
|
-
#
|
109
|
-
# Terms should be added in increasing order of @terms, one call per
|
110
|
-
# unique termNum. ProxPointer is a pointer into the TermPosition file
|
111
|
-
# (prx). Freq is the number of times this term appears in this field, in
|
112
|
-
# this document. raises:: IllegalStateException if document or field is
|
113
|
-
# not open
|
114
|
-
def add_term(term_text, freq, positions = nil, offsets = nil)
|
115
|
-
if not document_open?
|
116
|
-
raise IllegalStateError, "Cannot add terms when document is not open"
|
117
|
-
end
|
118
|
-
if not field_open?
|
119
|
-
raise IllegalStateError, "Cannot add terms when field is not open"
|
120
|
-
end
|
121
|
-
|
122
|
-
add_term_internal(term_text, freq, positions, offsets)
|
123
|
-
end
|
124
|
-
|
125
|
-
def add_term_internal(term_text, freq, positions, offsets)
|
126
|
-
@terms << TVTerm.new(term_text, freq, positions, offsets)
|
127
|
-
end
|
128
|
-
|
129
|
-
# Add a complete document specified by all its term vectors. If document has no
|
130
|
-
# term vectors, add value for @tvx.
|
131
|
-
#
|
132
|
-
# vectors:: The documents to have their term vectors added
|
133
|
-
# raises:: IOException
|
134
|
-
def add_all_doc_vectors(vectors)
|
135
|
-
|
136
|
-
open_document()
|
137
|
-
|
138
|
-
if vectors != nil
|
139
|
-
vectors.each do |vector|
|
140
|
-
store_positions = (vector.size > 0 and vector.positions != nil)
|
141
|
-
store_offsets = (vector.size > 0 and vector.offsets != nil)
|
142
|
-
|
143
|
-
create_field(@field_infos.field_number(vector.field),
|
144
|
-
store_positions, store_offsets)
|
145
|
-
|
146
|
-
vector.size.times do |j|
|
147
|
-
add_term_internal(vector.terms[j],
|
148
|
-
vector.freqs[j],
|
149
|
-
store_positions ? vector.positions[j] : nil,
|
150
|
-
store_offsets ? vector.offsets[j] : nil)
|
151
|
-
end
|
152
|
-
close_field()
|
153
|
-
end
|
154
|
-
end
|
155
|
-
close_document()
|
156
|
-
end
|
157
|
-
|
158
|
-
# Flush the pending document (if any) and close all streams.
def close
  close_document()
ensure
  # make an effort to close every stream we can, but remember and
  # re-raise the last exception encountered in the process
  last_error = nil
  [@tvx, @tvd, @tvf].compact.each do |stream|
    begin
      stream.close()
    rescue IOError => e
      last_error = e
    end
  end
  raise last_error if last_error
end
|
176
|
-
|
177
|
-
# Per-field bookkeeping record for the term vectors writer: the field's
# number, where its data begins in the tvf stream, and whether positions
# and offsets are stored for it.
class TVField
  attr_accessor :number, :tvf_pointer, :store_positions, :store_offsets

  def initialize(number, store_pos, store_off)
    @number = number
    @tvf_pointer = 0 # filled in when the field is written
    @store_positions = store_pos
    @store_offsets = store_off
  end
end
|
186
|
-
|
187
|
-
# A single buffered term entry: its text, frequency, and optional
# per-occurrence positions and offsets.
class TVTerm
  attr_accessor :term_text, :freq, :positions, :offsets

  def initialize(term_text = nil, freq = nil, positions = nil, offsets = nil)
    @term_text, @freq = term_text, freq
    @positions, @offsets = positions, offsets
  end
end
|
197
|
-
|
198
|
-
private
|
199
|
-
|
200
|
-
# Serialize the buffered @terms of the current field into the tvf
# stream: the term count, a flags byte recording whether positions and
# offsets are stored, then each term as prefix-compressed text followed
# by its frequency and (optionally) delta-encoded positions and offsets.
# The field's start position in the stream is saved on @current_field so
# write_doc can index it.
#
# raises:: IllegalStateError if a term is missing positions/offsets that
#          the field claims to store
def write_field()
  # remember where this field is written
  @current_field.tvf_pointer = @tvf.pos

  size = @terms.size
  @tvf.write_vint(size)

  store_positions = @current_field.store_positions
  store_offsets = @current_field.store_offsets
  bits = 0x0
  if (store_positions)
    bits |= STORE_POSITIONS_WITH_TERMVECTOR
  end
  if (store_offsets)
    bits |= STORE_OFFSET_WITH_TERMVECTOR
  end
  @tvf.write_byte(bits)

  # term text is prefix-compressed against the previous term
  last_term_text = ""
  @terms.each do |term|
    start = Ferret::Utils::StringHelper.string_difference(last_term_text,
                                                          term.term_text)
    length = term.term_text.length() - start
    @tvf.write_vint(start)  # write shared prefix length
    @tvf.write_vint(length) # write delta length
    @tvf.write_chars(term.term_text, start, length) # write delta chars
    @tvf.write_vint(term.freq)
    last_term_text = term.term_text

    if (store_positions)
      if (term.positions == nil)
        raise IllegalStateError, "Trying to write positions that are nil!"
      end

      # use delta encoding for positions
      position = 0
      term.freq.times do |j|
        @tvf.write_vint(term.positions[j] - position)
        position = term.positions[j]
      end
    end

    if (store_offsets)
      if(term.offsets == nil)
        raise IllegalStateError, "Trying to write offsets that are nil!"
      end

      # use delta encoding for offsets: start is delta from the previous
      # end; end is stored as the diff from its own start
      position = 0
      term.freq.times do |j|
        @tvf.write_vint(term.offsets[j].start - position)
        # Save the diff between the two.
        @tvf.write_vint(term.offsets[j].end -
                        term.offsets[j].start)
        position = term.offsets[j].end()
      end
    end
  end
end
|
259
|
-
|
260
|
-
# Write the current document's records: its index entry in @tvx and its
# data record in @tvd (field count, each field's number, then
# delta-encoded pointers into the tvf stream).
#
# raises:: IllegalStateError if a field is still open
def write_doc
  if field_open?
    raise IllegalStateError, "Field is still open while writing document"
  end

  # document index record: where this document's data begins in @tvd
  @tvx.write_long(@current_doc_pointer)

  # document data record: the number of fields, then the field numbers
  @tvd.write_vint(@fields.size)
  @fields.each { |f| @tvd.write_vint(f.number) }

  # field pointers, each delta-encoded against the previous one
  previous_pointer = 0
  @fields.each do |f|
    @tvd.write_vlong(f.tvf_pointer - previous_pointer)
    previous_pointer = f.tvf_pointer
  end
end
|
285
|
-
|
286
|
-
# Start a new field record for the current document, first closing any
# field that is still open.
#
# raises:: IllegalStateError if no document is open
def create_field(field_number, store_position, store_offset)
  unless document_open?
    raise IllegalStateError, "Cannot open field when no document is open."
  end
  close_field()
  @current_field = TVField.new(field_number, store_position, store_offset)
end
|
293
|
-
end
|
294
|
-
|
295
|
-
# Reads term vectors written by TermVectorsWriter from a segment's three
# files: the document index (tvx), the document/field records (tvd) and
# the per-field term data (tvf).
class TermVectorsReader
  attr_reader :size

  # accessors for clone method
  attr_accessor :tvx, :tvd, :tvf
  protected :tvx, :tvx=, :tvd, :tvd=, :tvf, :tvf=


  # d:: the directory holding the segment files
  # segment:: the segment name whose term vector files are opened
  # field_infos:: used to map between field names and field numbers
  #
  # If the segment has no tvx file, all three streams are left nil and
  # every lookup method returns nil.
  def initialize(d, segment, field_infos)

    if (d.exists?(segment + TermVectorsWriter::TVX_EXTENSION))
      @tvx = d.open_input(segment + TermVectorsWriter::TVX_EXTENSION)
      check_valid_format(@tvx)
      @tvd = d.open_input(segment + TermVectorsWriter::TVD_EXTENSION)
      @tvd_format = check_valid_format(@tvd)
      @tvf = d.open_input(segment + TermVectorsWriter::TVF_EXTENSION)
      @tvf_format = check_valid_format(@tvf)
      # the index holds one 8-byte pointer per document
      @size = @tvx.length / 8
    else
      @tvx = nil
      @tvd = nil
      @tvf = nil
    end

    @field_infos = field_infos
  end

  # Close all streams, making an effort to close every one even if some
  # raise; the last IOError encountered is re-raised.
  def close()
    keep = nil
    [@tvx, @tvd, @tvf].compact.each do |os|
      begin
        os.close()
      rescue IOError => e
        keep = e
      end
    end
    raise keep if (keep != nil)
  end

  # Retrieve the term vector for the given document and field
  # doc_num:: The document number to retrieve the vector for
  # field:: The field within the document to retrieve
  # returns:: The TermFreqVector for the document and field or nil if there
  #           is no termVector for this field.
  # raises:: IOException if there is an error reading the term vector files
  def get_field_tv(doc_num, field)
    # Check if no term vectors are available for this segment at all
    field = field.to_s
    field_number = @field_infos.field_number(field)
    result = nil
    if (@tvx != nil)
      # We need to account for the FORMAT_SIZE when seeking in @tvx. We
      # don't need to do this in other seeks because we already have the
      # file pointer that was written in another file.
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()
      # There are only a few fields per document. We opt for a full scan
      # rather then requiring that they be ordered. We need to read through
      # all of the fields anyway to get to the tvf pointers.
      number = 0
      found = -1
      field_count.times do |i|
        if @tvd_format == TermVectorsWriter::FORMAT_VERSION
          number = @tvd.read_vint()
        else
          number += @tvd.read_vint()
        end
        if (number == field_number)
          found = i
        end
      end

      # This field, although valid in the segment, may not be present in
      # this document
      if (found != -1)
        # Compute position in the @tvf file by summing the delta-encoded
        # pointers up to and including the found field
        position = 0
        (found + 1).times do
          position += @tvd.read_vlong()
        end

        result = read_term_vector(field, position)
      end
    end
    return result
  end

  # Return all term vectors stored for this document or nil if it could
  # not be read in.
  #
  # doc_num:: The document number to retrieve the vector for
  # returns:: All term frequency vectors
  # raises:: IOException if there is an error reading the term vector files
  def get_tv(doc_num)
    result = nil
    # Check if no term vectors are available for this segment at all
    if (@tvx != nil)
      # We need to offset by the FORMAT_SIZE header in @tvx
      @tvx.seek((doc_num * 8) + TermVectorsWriter::FORMAT_SIZE)
      position = @tvx.read_long()

      @tvd.seek(position)
      field_count = @tvd.read_vint()

      # No fields are vectorized for this document when field_count == 0
      if (field_count != 0)
        number = 0
        fields = Array.new(field_count)

        field_count.times do |i|
          if @tvd_format == TermVectorsWriter::FORMAT_VERSION
            number = @tvd.read_vint()
          else
            number += @tvd.read_vint()
          end

          fields[i] = @field_infos[number].name
        end

        # Compute each field's position in the @tvf file from the
        # delta-encoded pointers
        position = 0
        tvf_pointers = Array.new(field_count)
        field_count.times do |i|
          position += @tvd.read_vlong()
          tvf_pointers[i] = position
        end

        result = read_term_vectors(fields, tvf_pointers)
      end
    end
    return result
  end

  # Return an independent copy of this reader with its own clones of the
  # three streams, or nil if this reader was opened on a segment without
  # term vectors.
  def clone()

    if (@tvx == nil or @tvd == nil or @tvf == nil)
      return nil
    end

    # FIX: the original code did `clone = self`, which overwrote this
    # reader's own streams and returned self rather than a copy. Use
    # Object#clone (via super) to get a shallow copy first, then give it
    # its own stream clones through the protected writers.
    copy = super()
    copy.tvx = @tvx.clone()
    copy.tvd = @tvd.clone()
    copy.tvf = @tvf.clone()

    return copy
  end

  private

  # Read one term vector per field at the corresponding tvf pointer.
  def read_term_vectors(fields, tvf_pointers)
    res = Array.new(fields.length)
    fields.length.times do |i|
      res[i] = read_term_vector(fields[i], tvf_pointers[i])
    end
    return res
  end

  # field:: The field to read in
  # tvf_pointer:: The pointer within the @tvf file where we should start
  #               reading
  # returns:: The TermVector located at that position
  # raises:: IOException
  def read_term_vector(field, tvf_pointer)
    # Now read the data from the specified position. We don't need to
    # offset by the FORMAT header here since the pointer already includes
    # the offset.
    @tvf.seek(tvf_pointer)

    num_terms = @tvf.read_vint()
    # If no terms - return a constant empty termvector. However, this should
    # never occur!
    if (num_terms == 0)
      return SegmentTermVector.new(field, nil, nil)
    end

    if(@tvf_format == TermVectorsWriter::FORMAT_VERSION)
      bits = @tvf.read_byte()
      store_positions = (bits & TermVectorsWriter::STORE_POSITIONS_WITH_TERMVECTOR) != 0
      store_offsets = (bits & TermVectorsWriter::STORE_OFFSET_WITH_TERMVECTOR) != 0
    else
      # pre-FORMAT_VERSION streams have a vint here and never store
      # positions or offsets
      @tvf.read_vint()
      store_positions = false
      store_offsets = false
    end

    terms = Array.new(num_terms)
    term_freqs = Array.new(num_terms)
    positions = store_positions ? Array.new(num_terms) : nil
    offsets = store_offsets ? Array.new(num_terms) : nil

    # term text is prefix-compressed, so the buffer must carry the
    # previous term's characters between iterations
    buffer = ""
    num_terms.times do |i|
      start = @tvf.read_vint()        # shared prefix length
      delta_length = @tvf.read_vint() # length of the new suffix
      total_length = start + delta_length
      @tvf.read_chars(buffer, start, delta_length)
      terms[i] = buffer[0, total_length].to_s
      freq = @tvf.read_vint()
      term_freqs[i] = freq

      if (store_positions) # read in the delta-encoded positions
        pos = Array.new(freq)
        positions[i] = pos
        prev_position = 0
        freq.times do |j|
          pos[j] = prev_position + @tvf.read_vint()
          prev_position = pos[j]
        end
      end

      if (store_offsets) # read in the delta-encoded offsets
        offs = Array.new(freq)
        offsets[i] = offs
        prev_offset = 0
        freq.times do |j|
          start_offset = prev_offset + @tvf.read_vint()
          end_offset = start_offset + @tvf.read_vint()
          offs[j] = TermVectorOffsetInfo.new(start_offset, end_offset)
          prev_offset = end_offset
        end
      end
    end

    SegmentTermVector.new(field, terms, term_freqs, positions, offsets)
  end

  # Read and validate a stream's leading format header.
  # returns:: the format version found in the stream
  # raises:: IOError if the stream was written by a newer format version
  def check_valid_format(istream)
    format = istream.read_int()
    if (format > TermVectorsWriter::FORMAT_VERSION)
      raise IOError, "Incompatible format version: #{format} expected #{TermVectorsWriter::FORMAT_VERSION} or less"
    end
    return format
  end

end
|
553
|
-
end
|