ferret 0.9.6 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -8,342 +8,217 @@ module IndexReaderCommon
|
|
8
8
|
def test_index_reader
|
9
9
|
do_test_get_field_names()
|
10
10
|
|
11
|
+
do_test_term_enum()
|
12
|
+
|
11
13
|
do_test_term_doc_enum()
|
12
|
-
|
14
|
+
|
13
15
|
do_test_term_vectors()
|
14
16
|
|
15
|
-
do_test_changing_field()
|
16
|
-
|
17
17
|
do_test_get_doc()
|
18
|
-
|
19
|
-
do_test_term_enum()
|
20
18
|
end
|
21
19
|
|
22
20
|
def do_test_get_field_names()
|
23
|
-
field_names = @ir.
|
24
|
-
|
25
|
-
assert(field_names.include?(
|
26
|
-
assert(field_names.include?(
|
27
|
-
assert(field_names.include?(
|
28
|
-
assert(field_names.include?(
|
29
|
-
assert(field_names.include?(
|
30
|
-
assert(field_names.include?(
|
21
|
+
field_names = @ir.field_names
|
22
|
+
|
23
|
+
assert(field_names.include?(:body))
|
24
|
+
assert(field_names.include?(:changing_field))
|
25
|
+
assert(field_names.include?(:author))
|
26
|
+
assert(field_names.include?(:title))
|
27
|
+
assert(field_names.include?(:text))
|
28
|
+
assert(field_names.include?(:year))
|
31
29
|
end
|
32
30
|
|
33
31
|
def do_test_term_enum()
|
34
|
-
te = @ir.terms
|
32
|
+
te = @ir.terms(:author)
|
35
33
|
|
36
34
|
assert(te.next?)
|
37
|
-
assert_equal(
|
35
|
+
assert_equal("Leo", te.term)
|
38
36
|
assert_equal(1, te.doc_freq)
|
39
37
|
assert(te.next?)
|
40
|
-
assert_equal(
|
38
|
+
assert_equal("Tolstoy", te.term)
|
41
39
|
assert_equal(1, te.doc_freq)
|
40
|
+
assert(! te.next?)
|
41
|
+
|
42
|
+
te.field = :body
|
42
43
|
assert(te.next?)
|
43
|
-
assert_equal(
|
44
|
+
assert_equal("And", te.term)
|
44
45
|
assert_equal(1, te.doc_freq)
|
45
46
|
|
46
|
-
|
47
|
-
|
48
|
-
assert_equal(Term.new("body", "Not"), te.term)
|
47
|
+
assert(te.skip_to("Not"))
|
48
|
+
assert_equal("Not", te.term)
|
49
49
|
assert_equal(1, te.doc_freq)
|
50
50
|
assert(te.next?)
|
51
|
-
assert_equal(
|
51
|
+
assert_equal("Random", te.term)
|
52
52
|
assert_equal(16, te.doc_freq)
|
53
53
|
|
54
|
-
|
55
|
-
assert(
|
54
|
+
te.field = :text
|
55
|
+
assert(te.skip_to("which"))
|
56
|
+
assert("which", te.term)
|
56
57
|
assert_equal(1, te.doc_freq)
|
58
|
+
assert(! te.next?)
|
59
|
+
|
60
|
+
te.field = :title
|
57
61
|
assert(te.next?)
|
58
|
-
assert_equal(
|
62
|
+
assert_equal("War And Peace", te.term)
|
59
63
|
assert_equal(1, te.doc_freq)
|
60
64
|
assert(!te.next?)
|
61
65
|
|
62
|
-
te.
|
63
|
-
|
64
|
-
te = @ir.terms_from(Term.new("body", "Not"))
|
65
|
-
assert_equal(Term.new("body", "Not"), te.term)
|
66
|
+
te = @ir.terms_from(:body, "Not")
|
67
|
+
assert_equal("Not", te.term)
|
66
68
|
assert_equal(1, te.doc_freq)
|
67
69
|
assert(te.next?)
|
68
|
-
assert_equal(
|
70
|
+
assert_equal("Random", te.term)
|
69
71
|
assert_equal(16, te.doc_freq)
|
70
|
-
te.close
|
71
72
|
end
|
72
73
|
|
73
74
|
def do_test_term_doc_enum()
|
74
75
|
|
75
|
-
assert_equal(IndexTestHelper::
|
76
|
-
assert_equal(IndexTestHelper::
|
77
|
-
|
78
|
-
term = Term.new("body", "Wally")
|
79
|
-
assert_equal(4, @ir.doc_freq(term))
|
80
|
-
|
81
|
-
tde = @ir.term_docs_for(term)
|
82
|
-
|
83
|
-
assert(tde.next?)
|
84
|
-
assert_equal(0, tde.doc())
|
85
|
-
assert_equal(1, tde.freq())
|
86
|
-
assert(tde.next?)
|
87
|
-
assert_equal(5, tde.doc())
|
88
|
-
assert_equal(1, tde.freq())
|
89
|
-
assert(tde.next?)
|
90
|
-
assert_equal(18, tde.doc())
|
91
|
-
assert_equal(3, tde.freq())
|
92
|
-
assert(tde.next?)
|
93
|
-
assert_equal(20, tde.doc())
|
94
|
-
assert_equal(6, tde.freq())
|
95
|
-
assert_equal(false, tde.next?)
|
96
|
-
|
97
|
-
# test fast read. Use a small array to exercise repeat read
|
98
|
-
docs = Array.new(3)
|
99
|
-
freqs = Array.new(3)
|
100
|
-
|
101
|
-
term = Term.new("body", "read")
|
102
|
-
tde.seek(term)
|
103
|
-
assert_equal(3, tde.read(docs, freqs))
|
104
|
-
assert_equal([1,2,6], docs)
|
105
|
-
assert_equal([1,2,4], freqs)
|
76
|
+
assert_equal(IndexTestHelper::INDEX_TEST_DOCS.size, @ir.num_docs())
|
77
|
+
assert_equal(IndexTestHelper::INDEX_TEST_DOCS.size, @ir.max_doc())
|
106
78
|
|
107
|
-
assert_equal(
|
108
|
-
assert_equal([9, 10, 15], docs)
|
109
|
-
assert_equal([3, 1, 1], freqs)
|
79
|
+
assert_equal(4, @ir.doc_freq(:body, "Wally"))
|
110
80
|
|
111
|
-
|
112
|
-
assert_equal([16, 17, 20], docs)
|
113
|
-
assert_equal([2, 1, 1], freqs)
|
81
|
+
tde = @ir.term_docs_for(:body, "Wally")
|
114
82
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
83
|
+
[
|
84
|
+
[ 0, 1],
|
85
|
+
[ 5, 1],
|
86
|
+
[18, 3],
|
87
|
+
[20, 6]
|
88
|
+
].each do |doc, freq|
|
89
|
+
assert(tde.next?)
|
90
|
+
assert_equal(doc, tde.doc())
|
91
|
+
assert_equal(freq, tde.freq())
|
92
|
+
end
|
93
|
+
assert(! tde.next?)
|
120
94
|
|
121
95
|
do_test_term_docpos_enum_skip_to(tde)
|
122
|
-
tde.close()
|
123
96
|
|
124
97
|
# test term positions
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
assert_equal(3, tde.freq())
|
147
|
-
assert_equal(0, tde.next_position())
|
148
|
-
assert_equal(4, tde.next_position())
|
149
|
-
|
150
|
-
assert(tde.skip_to(16))
|
151
|
-
assert_equal(16, tde.doc())
|
152
|
-
assert_equal(2, tde.freq())
|
153
|
-
assert_equal(2, tde.next_position())
|
154
|
-
|
155
|
-
assert(tde.skip_to(21))
|
156
|
-
assert_equal(21, tde.doc())
|
157
|
-
assert_equal(6, tde.freq())
|
158
|
-
assert_equal(3, tde.next_position())
|
159
|
-
assert_equal(4, tde.next_position())
|
160
|
-
assert_equal(5, tde.next_position())
|
161
|
-
assert_equal(8, tde.next_position())
|
162
|
-
assert_equal(9, tde.next_position())
|
163
|
-
assert_equal(10, tde.next_position())
|
164
|
-
|
165
|
-
assert_equal(false, tde.next?)
|
98
|
+
tde = @ir.term_positions_for(:body, "read")
|
99
|
+
[
|
100
|
+
[false, 1, 1, [3]],
|
101
|
+
[false, 2, 2, [1, 4]],
|
102
|
+
[false, 6, 4, [3, 4]],
|
103
|
+
[false, 9, 3, [0, 4]],
|
104
|
+
[ true, 16, 2, [2]],
|
105
|
+
[ true, 21, 6, [3, 4, 5, 8, 9, 10]]
|
106
|
+
].each do |skip, doc, freq, positions|
|
107
|
+
if skip
|
108
|
+
assert(tde.skip_to(doc))
|
109
|
+
else
|
110
|
+
assert(tde.next?)
|
111
|
+
end
|
112
|
+
assert_equal(doc, tde.doc())
|
113
|
+
assert_equal(freq, tde.freq())
|
114
|
+
positions.each {|pos| assert_equal(pos, tde.next_position())}
|
115
|
+
end
|
116
|
+
|
117
|
+
assert_nil(tde.next_position())
|
118
|
+
assert(! tde.next?)
|
166
119
|
|
167
120
|
do_test_term_docpos_enum_skip_to(tde)
|
168
|
-
tde.close()
|
169
121
|
end
|
170
122
|
|
171
123
|
def do_test_term_docpos_enum_skip_to(tde)
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
124
|
+
tde.seek(:text, "skip")
|
125
|
+
|
126
|
+
[
|
127
|
+
[10, 22],
|
128
|
+
[44, 44],
|
129
|
+
[60, 60],
|
130
|
+
[62, 62],
|
131
|
+
[63, 63],
|
132
|
+
].each do |skip_doc, doc_and_freq|
|
133
|
+
assert(tde.skip_to(skip_doc))
|
134
|
+
assert_equal(doc_and_freq, tde.doc())
|
135
|
+
assert_equal(doc_and_freq, tde.freq())
|
136
|
+
end
|
182
137
|
|
183
|
-
tde.seek(term)
|
184
|
-
assert(tde.skip_to(45))
|
185
|
-
assert_equal(45, tde.doc())
|
186
|
-
assert_equal(45, tde.freq())
|
187
138
|
|
188
|
-
assert(tde.skip_to(
|
189
|
-
|
190
|
-
|
139
|
+
assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT))
|
140
|
+
assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT))
|
141
|
+
assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT + 100))
|
191
142
|
|
192
|
-
|
193
|
-
|
194
|
-
|
143
|
+
tde.seek(:text, "skip")
|
144
|
+
assert(! tde.skip_to(IndexTestHelper::INDEX_TEST_DOC_COUNT))
|
145
|
+
end
|
195
146
|
|
196
|
-
|
147
|
+
def do_test_term_vectors()
|
148
|
+
expected_tv = TermVector.new(:body,
|
149
|
+
[
|
150
|
+
TVTerm.new("word1", [2, 4, 7]),
|
151
|
+
TVTerm.new("word2", [3]),
|
152
|
+
TVTerm.new("word3", [0, 5, 8, 9]),
|
153
|
+
TVTerm.new("word4", [1, 6])
|
154
|
+
],
|
155
|
+
[*(0...10)].collect {|i| TVOffsets.new(i*6, (i+1)*6 - 1)})
|
197
156
|
|
198
|
-
|
199
|
-
assert_equal(false, tde.skip_to(64))
|
200
|
-
end
|
157
|
+
tv = @ir.term_vector(3, :body)
|
201
158
|
|
202
|
-
|
203
|
-
TermVectorOffsetInfo.new(start_offset, end_offset)
|
204
|
-
end
|
159
|
+
assert_equal(expected_tv, tv)
|
205
160
|
|
206
|
-
|
207
|
-
tv = @ir.get_term_vector(3, :body)
|
208
|
-
|
209
|
-
assert_equal("body", tv.field)
|
210
|
-
assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
|
211
|
-
assert_equal([3, 1, 4, 2], tv.freqs)
|
212
|
-
assert_equal([[2, 4, 7], [3], [0, 5, 8, 9], [1,6]], tv.positions)
|
213
|
-
assert_equal([[t(12,17), t(24,29), t(42,47)],
|
214
|
-
[t(18,23)],
|
215
|
-
[t(0,5), t(30,35), t(48,53), t(54,59)],
|
216
|
-
[t(6,11), t(36,41)]], tv.offsets)
|
217
|
-
tv = nil
|
218
|
-
|
219
|
-
tvs = @ir.get_term_vectors(3)
|
161
|
+
tvs = @ir.term_vectors(3)
|
220
162
|
assert_equal(3, tvs.size)
|
221
|
-
tv = tvs[0]
|
222
|
-
assert_equal("author", tv.field)
|
223
|
-
assert_equal(["Leo", "Tolstoy"], tv.terms)
|
224
|
-
assert(tv.offsets.nil?)
|
225
|
-
tv = tvs[1]
|
226
|
-
assert_equal("body", tv.field)
|
227
|
-
assert_equal(["word1", "word2", "word3", "word4"], tv.terms)
|
228
|
-
tv = tvs[2]
|
229
|
-
assert_equal("title", tv.field)
|
230
|
-
assert_equal(["War And Peace"], tv.terms)
|
231
|
-
assert(tv.positions.nil?)
|
232
|
-
assert_equal(t(0, 13), tv.offsets[0][0])
|
233
|
-
end
|
234
|
-
|
235
|
-
def do_test_changing_field()
|
236
|
-
tv = @ir.get_term_vector(0, "changing_field")
|
237
|
-
assert(tv.nil?)
|
238
163
|
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
assert(tv.positions)
|
164
|
+
assert_equal(expected_tv, tvs[:body])
|
165
|
+
|
166
|
+
tv = tvs[:author]
|
167
|
+
assert_equal(:author, tv.field)
|
168
|
+
assert_equal([TVTerm.new("Leo", [0]), TVTerm.new("Tolstoy", [1])], tv.terms)
|
245
169
|
assert(tv.offsets.nil?)
|
246
170
|
|
247
|
-
tv = @ir.get_term_vector(19, "changing_field")
|
248
|
-
assert(tv.positions.nil?)
|
249
|
-
assert(tv.offsets)
|
250
171
|
|
251
|
-
tv =
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
tv = @ir.get_term_vector(21, "changing_field")
|
256
|
-
assert(tv.nil?)
|
172
|
+
tv = tvs[:title]
|
173
|
+
assert_equal(:title, tv.field)
|
174
|
+
assert_equal([TVTerm.new("War And Peace", nil)], tv.terms)
|
175
|
+
assert_equal([TVOffsets.new(0, 13)], tv.offsets)
|
257
176
|
end
|
258
|
-
|
177
|
+
|
259
178
|
def do_test_get_doc()
|
260
179
|
doc = @ir.get_document(3)
|
261
|
-
assert_equal(4, doc.
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
assert_equal("Leo Tolstoy",
|
266
|
-
assert_equal(
|
267
|
-
|
268
|
-
assert_equal(
|
269
|
-
assert_equal(
|
270
|
-
|
271
|
-
|
272
|
-
assert_equal(
|
273
|
-
|
274
|
-
assert_equal(false, df.binary?)
|
275
|
-
|
276
|
-
df = doc.field("body")
|
277
|
-
assert_equal("body", df.name)
|
278
|
-
assert_equal("word3 word4 word1 word2 word1 word3 word4 word1 word3 word3", df.data)
|
279
|
-
assert_equal(df.boost, 1.0)
|
280
|
-
assert_equal(true, df.stored?)
|
281
|
-
assert_equal(false, df.compressed?)
|
282
|
-
assert_equal(true, df.indexed?)
|
283
|
-
assert_equal(true, df.tokenized?)
|
284
|
-
assert_equal(true, df.store_term_vector?)
|
285
|
-
assert_equal(true, df.store_positions?)
|
286
|
-
assert_equal(true, df.store_offsets?)
|
287
|
-
assert_equal(false, df.binary?)
|
288
|
-
|
289
|
-
df = doc.field("title")
|
290
|
-
assert_equal("title", df.name)
|
291
|
-
assert_equal("War And Peace", df.data)
|
292
|
-
assert_equal(df.boost, 1.0)
|
293
|
-
assert_equal(true, df.stored?)
|
294
|
-
assert_equal(false, df.compressed?)
|
295
|
-
assert_equal(true, df.indexed?)
|
296
|
-
assert_equal(false, df.tokenized?)
|
297
|
-
assert_equal(true, df.store_term_vector?)
|
298
|
-
assert_equal(false, df.store_positions?)
|
299
|
-
assert_equal(true, df.store_offsets?)
|
300
|
-
assert_equal(false, df.binary?)
|
301
|
-
|
302
|
-
df = doc.field("year")
|
303
|
-
assert_equal("year", df.name)
|
304
|
-
assert_equal("1865", df.data)
|
305
|
-
assert_equal(df.boost, 1.0)
|
306
|
-
assert_equal(true, df.stored?)
|
307
|
-
assert_equal(false, df.compressed?)
|
308
|
-
assert_equal(false, df.indexed?)
|
309
|
-
assert_equal(false, df.tokenized?)
|
310
|
-
assert_equal(false, df.store_term_vector?)
|
311
|
-
assert_equal(false, df.store_positions?)
|
312
|
-
assert_equal(false, df.store_offsets?)
|
313
|
-
assert_equal(false, df.binary?)
|
314
|
-
|
315
|
-
|
316
|
-
df = doc.field("text")
|
317
|
-
assert(df.nil?) # "text" is not stored
|
180
|
+
assert_equal(4, doc.fields.size)
|
181
|
+
assert_equal(0, doc.size)
|
182
|
+
assert_equal([], doc.keys)
|
183
|
+
|
184
|
+
assert_equal("Leo Tolstoy", doc[:author])
|
185
|
+
assert_equal("word3 word4 word1 word2 word1 word3 word4 word1 word3 word3",
|
186
|
+
doc[:body])
|
187
|
+
assert_equal("War And Peace", doc[:title])
|
188
|
+
assert_equal("1865", doc[:year])
|
189
|
+
assert_nil(doc[:text])
|
190
|
+
|
191
|
+
assert_equal(4, doc.size)
|
192
|
+
[:author, :body, :title, :year].each {|fn| assert(doc.keys.include?(fn))}
|
318
193
|
end
|
319
194
|
|
320
195
|
def test_ir_norms()
|
321
|
-
@ir.set_norm(3,
|
322
|
-
@ir.set_norm(3,
|
323
|
-
@ir.set_norm(3,
|
324
|
-
@ir.set_norm(3,
|
325
|
-
@ir.set_norm(3,
|
326
|
-
@ir.set_norm(25,
|
327
|
-
@ir.set_norm(50,
|
328
|
-
@ir.set_norm(63,
|
329
|
-
|
330
|
-
norms = @ir.
|
331
|
-
|
332
|
-
assert_equal(202, norms[3])
|
333
|
-
assert_equal(20, norms[25])
|
196
|
+
@ir.set_norm(3, :title, 1)
|
197
|
+
@ir.set_norm(3, :body, 12)
|
198
|
+
@ir.set_norm(3, :author, 145)
|
199
|
+
@ir.set_norm(3, :year, 31)
|
200
|
+
@ir.set_norm(3, :text, 202)
|
201
|
+
@ir.set_norm(25, :text, 20)
|
202
|
+
@ir.set_norm(50, :text, 200)
|
203
|
+
@ir.set_norm(63, :text, 155)
|
204
|
+
|
205
|
+
norms = @ir.norms(:text)
|
206
|
+
|
207
|
+
assert_equal(202, norms[ 3])
|
208
|
+
assert_equal( 20, norms[25])
|
334
209
|
assert_equal(200, norms[50])
|
335
210
|
assert_equal(155, norms[63])
|
336
211
|
|
337
|
-
norms = @ir.
|
212
|
+
norms = @ir.norms(:title)
|
338
213
|
assert_equal(1, norms[3])
|
339
214
|
|
340
|
-
norms = @ir.
|
215
|
+
norms = @ir.norms(:body)
|
341
216
|
assert_equal(12, norms[3])
|
342
217
|
|
343
|
-
norms = @ir.
|
218
|
+
norms = @ir.norms(:author)
|
344
219
|
assert_equal(145, norms[3])
|
345
220
|
|
346
|
-
norms = @ir.
|
221
|
+
norms = @ir.norms(:year)
|
347
222
|
# TODO: this returns two possible results depending on whether it is
|
348
223
|
# a multi reader or a segment reader. If it is a multi reader it will
|
349
224
|
# always return an empty set of norms, otherwise it will return nil.
|
@@ -351,117 +226,99 @@ module IndexReaderCommon
|
|
351
226
|
#assert(norms.nil?)
|
352
227
|
|
353
228
|
norms = " " * 164
|
354
|
-
@ir.get_norms_into(
|
229
|
+
@ir.get_norms_into(:text, norms, 100)
|
355
230
|
assert_equal(202, norms[103])
|
356
|
-
assert_equal(20, norms[125])
|
231
|
+
assert_equal( 20, norms[125])
|
357
232
|
assert_equal(200, norms[150])
|
358
233
|
assert_equal(155, norms[163])
|
359
234
|
|
360
235
|
@ir.commit()
|
361
236
|
|
362
|
-
|
363
|
-
iw.optimize()
|
364
|
-
iw.close()
|
237
|
+
iw_optimize()
|
365
238
|
|
366
|
-
ir2 =
|
239
|
+
ir2 = ir_new()
|
367
240
|
|
368
241
|
norms = " " * 164
|
369
|
-
ir2.get_norms_into(
|
242
|
+
ir2.get_norms_into(:text, norms, 100)
|
370
243
|
assert_equal(202, norms[103])
|
371
|
-
assert_equal(20, norms[125])
|
244
|
+
assert_equal( 20, norms[125])
|
372
245
|
assert_equal(200, norms[150])
|
373
246
|
assert_equal(155, norms[163])
|
374
247
|
ir2.close()
|
375
248
|
end
|
376
249
|
|
377
250
|
def test_ir_delete()
|
378
|
-
doc_count = IndexTestHelper::
|
379
|
-
|
251
|
+
doc_count = IndexTestHelper::INDEX_TEST_DOCS.size
|
252
|
+
@ir.delete(1000) # non existant doc_num
|
253
|
+
assert(! @ir.has_deletions?())
|
380
254
|
assert_equal(doc_count, @ir.max_doc())
|
381
255
|
assert_equal(doc_count, @ir.num_docs())
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
256
|
+
assert(! @ir.deleted?(10))
|
257
|
+
|
258
|
+
[
|
259
|
+
[10, doc_count - 1],
|
260
|
+
[10, doc_count - 1],
|
261
|
+
[doc_count - 1, doc_count - 2],
|
262
|
+
[doc_count - 2, doc_count - 3],
|
263
|
+
].each do |del_num, num_docs|
|
264
|
+
@ir.delete(del_num)
|
265
|
+
assert(@ir.has_deletions?())
|
266
|
+
assert_equal(doc_count, @ir.max_doc())
|
267
|
+
assert_equal(num_docs, @ir.num_docs())
|
268
|
+
assert(@ir.deleted?(del_num))
|
269
|
+
end
|
389
270
|
|
390
|
-
@ir.
|
391
|
-
|
271
|
+
@ir.undelete_all()
|
272
|
+
assert(! @ir.has_deletions?())
|
392
273
|
assert_equal(doc_count, @ir.max_doc())
|
393
|
-
assert_equal(doc_count
|
394
|
-
|
274
|
+
assert_equal(doc_count, @ir.num_docs())
|
275
|
+
assert(! @ir.deleted?(10))
|
276
|
+
assert(! @ir.deleted?(doc_count - 2))
|
277
|
+
assert(! @ir.deleted?(doc_count - 1))
|
395
278
|
|
396
|
-
|
397
|
-
assert_equal(true, @ir.has_deletions?())
|
398
|
-
assert_equal(doc_count, @ir.max_doc())
|
399
|
-
assert_equal(doc_count - 2, @ir.num_docs())
|
400
|
-
assert_equal(true, @ir.deleted?(doc_count - 1))
|
279
|
+
del_list = [10, 20, 30, 40, 50, doc_count - 1]
|
401
280
|
|
402
|
-
@ir.delete(
|
403
|
-
|
281
|
+
del_list.each {|doc_num| @ir.delete(doc_num)}
|
282
|
+
assert(@ir.has_deletions?())
|
404
283
|
assert_equal(doc_count, @ir.max_doc())
|
405
|
-
assert_equal(doc_count -
|
406
|
-
|
284
|
+
assert_equal(doc_count - del_list.size, @ir.num_docs())
|
285
|
+
del_list.each {|doc_num| assert(@ir.deleted?(doc_num))}
|
407
286
|
|
408
|
-
|
409
|
-
|
410
|
-
assert_equal(doc_count,
|
411
|
-
assert_equal(doc_count,
|
412
|
-
assert_equal(false, @ir.deleted?(10))
|
413
|
-
assert_equal(false, @ir.deleted?(doc_count - 2))
|
414
|
-
assert_equal(false, @ir.deleted?(doc_count - 1))
|
415
|
-
|
416
|
-
@ir.delete(10)
|
417
|
-
@ir.delete(20)
|
418
|
-
@ir.delete(30)
|
419
|
-
@ir.delete(40)
|
420
|
-
@ir.delete(50)
|
421
|
-
@ir.delete(doc_count - 1)
|
422
|
-
assert_equal(true, @ir.has_deletions?())
|
423
|
-
assert_equal(doc_count, @ir.max_doc())
|
424
|
-
assert_equal(doc_count - 6, @ir.num_docs())
|
287
|
+
ir2 = ir_new()
|
288
|
+
assert(! ir2.has_deletions?())
|
289
|
+
assert_equal(doc_count, ir2.max_doc())
|
290
|
+
assert_equal(doc_count, ir2.num_docs())
|
425
291
|
|
426
292
|
@ir.commit()
|
427
293
|
|
428
|
-
ir2
|
294
|
+
assert(! ir2.has_deletions?())
|
295
|
+
assert_equal(doc_count, ir2.max_doc())
|
296
|
+
assert_equal(doc_count, ir2.num_docs())
|
429
297
|
|
430
|
-
|
298
|
+
ir2 = ir_new()
|
299
|
+
assert(ir2.has_deletions?())
|
431
300
|
assert_equal(doc_count, ir2.max_doc())
|
432
301
|
assert_equal(doc_count - 6, ir2.num_docs())
|
433
|
-
|
434
|
-
assert_equal(true, ir2.deleted?(20))
|
435
|
-
assert_equal(true, ir2.deleted?(30))
|
436
|
-
assert_equal(true, ir2.deleted?(40))
|
437
|
-
assert_equal(true, ir2.deleted?(50))
|
438
|
-
assert_equal(true, ir2.deleted?(doc_count - 1))
|
302
|
+
del_list.each {|doc_num| assert(ir2.deleted?(doc_num))}
|
439
303
|
|
440
304
|
ir2.undelete_all()
|
441
|
-
|
305
|
+
assert(! ir2.has_deletions?())
|
442
306
|
assert_equal(doc_count, ir2.max_doc())
|
443
307
|
assert_equal(doc_count, ir2.num_docs())
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
assert_equal(false, ir2.deleted?(40))
|
448
|
-
assert_equal(false, ir2.deleted?(50))
|
449
|
-
assert_equal(false, ir2.deleted?(doc_count - 1))
|
450
|
-
|
451
|
-
ir2.delete(10)
|
452
|
-
ir2.delete(20)
|
453
|
-
ir2.delete(30)
|
454
|
-
ir2.delete(40)
|
455
|
-
ir2.delete(50)
|
456
|
-
ir2.delete(doc_count - 1)
|
308
|
+
del_list.each {|doc_num| assert(! ir2.deleted?(doc_num))}
|
309
|
+
|
310
|
+
del_list.each {|doc_num| assert(@ir.deleted?(doc_num))}
|
457
311
|
|
458
312
|
ir2.commit()
|
459
313
|
|
460
|
-
|
461
|
-
|
462
|
-
|
314
|
+
del_list.each {|doc_num| assert(@ir.deleted?(doc_num))}
|
315
|
+
|
316
|
+
del_list.each {|doc_num| ir2.delete(doc_num)}
|
317
|
+
ir2.commit()
|
463
318
|
|
464
|
-
|
319
|
+
iw_optimize()
|
320
|
+
|
321
|
+
ir3 = ir_new()
|
465
322
|
|
466
323
|
assert(!ir3.has_deletions?())
|
467
324
|
assert_equal(doc_count - 6, ir3.max_doc())
|
@@ -469,24 +326,35 @@ module IndexReaderCommon
|
|
469
326
|
|
470
327
|
ir3.close()
|
471
328
|
end
|
472
|
-
|
473
329
|
end
|
474
330
|
|
475
|
-
class
|
331
|
+
class MultiReaderTest < Test::Unit::TestCase
|
476
332
|
include IndexReaderCommon
|
477
333
|
|
478
|
-
def
|
479
|
-
|
480
|
-
|
481
|
-
docs = IndexTestHelper.prepare_ir_test_docs()
|
482
|
-
IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
|
483
|
-
iw << docs[i]
|
484
|
-
end
|
334
|
+
def ir_new
|
335
|
+
IndexReader.new(@dir)
|
336
|
+
end
|
485
337
|
|
486
|
-
|
338
|
+
def iw_optimize
|
339
|
+
iw = IndexWriter.new(:dir => @dir, :analyzer => WhiteSpaceAnalyzer.new())
|
487
340
|
iw.optimize()
|
488
341
|
iw.close()
|
489
|
-
|
342
|
+
end
|
343
|
+
|
344
|
+
def setup
|
345
|
+
@dir = Ferret::Store::RAMDirectory.new()
|
346
|
+
|
347
|
+
iw = IndexWriter.new(:dir => @dir,
|
348
|
+
:analyzer => WhiteSpaceAnalyzer.new(),
|
349
|
+
:create => true,
|
350
|
+
:field_infos => IndexTestHelper::INDEX_TEST_FIS,
|
351
|
+
:max_buffered_docs => 15)
|
352
|
+
IndexTestHelper::INDEX_TEST_DOCS.each {|doc| iw << doc}
|
353
|
+
|
354
|
+
# we mustn't optimize here so that MultiReader is used.
|
355
|
+
#iw.optimize() unless self.class == MultiReaderTest
|
356
|
+
iw.close()
|
357
|
+
@ir = ir_new()
|
490
358
|
end
|
491
359
|
|
492
360
|
def tear_down()
|
@@ -495,21 +363,46 @@ class SegmentReaderTest < Test::Unit::TestCase
|
|
495
363
|
end
|
496
364
|
end
|
497
365
|
|
498
|
-
class
|
366
|
+
class SegmentReaderTest < MultiReaderTest
|
367
|
+
end
|
368
|
+
|
369
|
+
class MultiExternalReaderTest < Test::Unit::TestCase
|
499
370
|
include IndexReaderCommon
|
500
371
|
|
501
|
-
def
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
372
|
+
def ir_new
|
373
|
+
readers = @dirs.collect {|dir| IndexReader.new(dir) }
|
374
|
+
IndexReader.new(readers)
|
375
|
+
end
|
376
|
+
|
377
|
+
def iw_optimize
|
378
|
+
@dirs.each do |dir|
|
379
|
+
iw = IndexWriter.new(:dir => dir, :analyzer => WhiteSpaceAnalyzer.new())
|
380
|
+
iw.optimize()
|
381
|
+
iw.close()
|
507
382
|
end
|
383
|
+
end
|
508
384
|
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
385
|
+
def setup()
|
386
|
+
@dirs = []
|
387
|
+
|
388
|
+
[
|
389
|
+
[0, 10],
|
390
|
+
[10, 30],
|
391
|
+
[30, IndexTestHelper::INDEX_TEST_DOCS.size]
|
392
|
+
].each do |start, finish|
|
393
|
+
dir = Ferret::Store::RAMDirectory.new()
|
394
|
+
@dirs << dir
|
395
|
+
|
396
|
+
iw = IndexWriter.new(:dir => dir,
|
397
|
+
:analyzer => WhiteSpaceAnalyzer.new(),
|
398
|
+
:create => true,
|
399
|
+
:field_infos => IndexTestHelper::INDEX_TEST_FIS)
|
400
|
+
(start...finish).each do |doc_id|
|
401
|
+
iw << IndexTestHelper::INDEX_TEST_DOCS[doc_id]
|
402
|
+
end
|
403
|
+
iw.close()
|
404
|
+
end
|
405
|
+
@ir = ir_new
|
513
406
|
end
|
514
407
|
|
515
408
|
def tear_down()
|
@@ -521,7 +414,6 @@ end
|
|
521
414
|
class IndexReaderTest < Test::Unit::TestCase
|
522
415
|
include Ferret::Index
|
523
416
|
include Ferret::Analysis
|
524
|
-
include Ferret::Document
|
525
417
|
|
526
418
|
def setup()
|
527
419
|
@dir = Ferret::Store::RAMDirectory.new()
|
@@ -536,113 +428,69 @@ class IndexReaderTest < Test::Unit::TestCase
|
|
536
428
|
'../../temp/fsdir'))
|
537
429
|
@fs_dir = Ferret::Store::FSDirectory.new(@fs_dpath, true)
|
538
430
|
|
539
|
-
iw = IndexWriter.new(
|
540
|
-
|
541
|
-
|
542
|
-
doc
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
#fis = FieldInfos.new()
|
550
|
-
#fis << doc
|
551
|
-
#assert_equal(4, fis.size)
|
552
|
-
|
553
|
-
#fi = fis["tag"]
|
554
|
-
#assert_equal(true, fi.indexed?)
|
555
|
-
#assert_equal(true, fi.store_term_vector?)
|
556
|
-
#assert_equal(true, fi.store_positions?)
|
557
|
-
#assert_equal(true, fi.store_offsets?)
|
558
|
-
|
431
|
+
iw = IndexWriter.new(:dir => @fs_dir,
|
432
|
+
:analyzer => WhiteSpaceAnalyzer.new(),
|
433
|
+
:create => true)
|
434
|
+
doc = {
|
435
|
+
:tag => ["Ruby", "C", "Lucene", "Ferret"],
|
436
|
+
:body => "this is the body Document Field",
|
437
|
+
:title => "this is the title DocField",
|
438
|
+
:author => "this is the author field"
|
439
|
+
}
|
559
440
|
iw << doc
|
560
|
-
iw.close()
|
561
|
-
|
562
|
-
@dir = Ferret::Store::RAMDirectory.new(@fs_dir, true)
|
563
|
-
ir = IndexReader.open(@dir, false)
|
564
|
-
|
565
|
-
doc = ir.get_document(0)
|
566
|
-
assert_equal(4, doc.field_count)
|
567
|
-
assert_equal(7, doc.entry_count)
|
568
|
-
entries = doc.fields("tag")
|
569
|
-
assert_equal(4, entries.size)
|
570
|
-
assert_equal("Ruby", entries[0].data)
|
571
|
-
assert_equal("C", entries[1].data)
|
572
|
-
assert_equal("Lucene", entries[2].data)
|
573
|
-
assert_equal("Ferret", entries[3].data)
|
574
|
-
|
575
|
-
doc.remove_field("tag")
|
576
|
-
assert_equal(4, doc.field_count)
|
577
|
-
assert_equal(6, doc.entry_count)
|
578
|
-
assert_equal("C", doc.field("tag").data)
|
579
|
-
|
580
|
-
doc.remove_fields("tag")
|
581
|
-
assert_equal(3, doc.field_count)
|
582
|
-
assert_equal(3, doc.entry_count)
|
583
|
-
|
584
|
-
ir.delete(0)
|
585
|
-
ir.close()
|
586
441
|
|
587
|
-
iw = IndexWriter.new(@dir, :analyzer => WhiteSpaceAnalyzer.new())
|
588
|
-
iw << doc
|
589
|
-
iw.optimize()
|
590
442
|
iw.close()
|
591
|
-
doc = nil
|
592
|
-
|
593
|
-
ir = IndexReader.open(@dir, false)
|
594
|
-
doc = ir.get_document(0)
|
595
|
-
assert_equal(3, doc.field_count)
|
596
|
-
assert_equal(3, doc.entry_count)
|
597
443
|
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
def t(start_offset, end_offset)
|
602
|
-
TermVectorOffsetInfo.new(start_offset, end_offset)
|
444
|
+
@dir = Ferret::Store::RAMDirectory.new(@fs_dir)
|
445
|
+
ir = IndexReader.new(@dir)
|
446
|
+
assert_equal(doc, ir.get_document(0).load)
|
603
447
|
end
|
604
448
|
|
605
449
|
def do_test_term_vectors(ir)
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
450
|
+
expected_tv = TermVector.new(:body,
|
451
|
+
[
|
452
|
+
TVTerm.new("word1", [2, 4, 7]),
|
453
|
+
TVTerm.new("word2", [3]),
|
454
|
+
TVTerm.new("word3", [0, 5, 8, 9]),
|
455
|
+
TVTerm.new("word4", [1, 6])
|
456
|
+
],
|
457
|
+
[*(0...10)].collect {|i| TVOffsets.new(i*6, (i+1)*6 - 1)})
|
458
|
+
|
459
|
+
tv = ir.term_vector(3, :body)
|
460
|
+
|
461
|
+
assert_equal(expected_tv, tv)
|
462
|
+
|
463
|
+
tvs = ir.term_vectors(3)
|
619
464
|
assert_equal(3, tvs.size)
|
620
|
-
|
621
|
-
assert_equal(
|
622
|
-
|
465
|
+
|
466
|
+
assert_equal(expected_tv, tvs[:body])
|
467
|
+
|
468
|
+
tv = tvs[:author]
|
469
|
+
assert_equal(:author, tv.field)
|
470
|
+
assert_equal([TVTerm.new("Leo", [0]), TVTerm.new("Tolstoy", [1])], tv.terms)
|
623
471
|
assert(tv.offsets.nil?)
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
tv
|
628
|
-
assert_equal("
|
629
|
-
assert_equal([
|
630
|
-
assert(tv.positions.nil?)
|
631
|
-
assert_equal(t(0, 13), tv.offsets[0][0])
|
472
|
+
|
473
|
+
|
474
|
+
tv = tvs[:title]
|
475
|
+
assert_equal(:title, tv.field)
|
476
|
+
assert_equal([TVTerm.new("War And Peace", nil)], tv.terms)
|
477
|
+
assert_equal([TVOffsets.new(0, 13)], tv.offsets)
|
632
478
|
end
|
633
479
|
|
634
|
-
def
|
635
|
-
iw = IndexWriter.new(
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
480
|
+
def do_test_ir_read_while_optimizing(dir)
|
481
|
+
iw = IndexWriter.new(:dir => dir,
|
482
|
+
:analyzer => WhiteSpaceAnalyzer.new(),
|
483
|
+
:create => true,
|
484
|
+
:field_infos => IndexTestHelper::INDEX_TEST_FIS)
|
485
|
+
|
486
|
+
IndexTestHelper::INDEX_TEST_DOCS.each {|doc| iw << doc}
|
487
|
+
|
640
488
|
iw.close()
|
641
489
|
|
642
|
-
ir = IndexReader.
|
490
|
+
ir = IndexReader.new(dir)
|
643
491
|
do_test_term_vectors(ir)
|
644
492
|
|
645
|
-
iw = IndexWriter.new(
|
493
|
+
iw = IndexWriter.new(:dir => dir, :analyzer => WhiteSpaceAnalyzer.new())
|
646
494
|
iw.optimize()
|
647
495
|
iw.close()
|
648
496
|
|
@@ -651,28 +499,15 @@ class IndexReaderTest < Test::Unit::TestCase
|
|
651
499
|
ir.close()
|
652
500
|
end
|
653
501
|
|
502
|
+
def test_ir_read_while_optimizing()
|
503
|
+
do_test_ir_read_while_optimizing(@dir)
|
504
|
+
end
|
505
|
+
|
654
506
|
def test_ir_read_while_optimizing_on_disk()
|
655
507
|
dpath = File.expand_path(File.join(File.dirname(__FILE__),
|
656
508
|
'../../temp/fsdir'))
|
657
509
|
fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
|
658
|
-
|
659
|
-
iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
|
660
|
-
docs = IndexTestHelper.prepare_ir_test_docs()
|
661
|
-
IndexTestHelper::IR_TEST_DOC_CNT.times do |i|
|
662
|
-
iw << docs[i]
|
663
|
-
end
|
664
|
-
iw.close()
|
665
|
-
|
666
|
-
ir = IndexReader.open(fs_dir, false)
|
667
|
-
do_test_term_vectors(ir)
|
668
|
-
|
669
|
-
iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
|
670
|
-
iw.optimize()
|
671
|
-
iw.close()
|
672
|
-
|
673
|
-
do_test_term_vectors(ir)
|
674
|
-
|
675
|
-
ir.close()
|
510
|
+
do_test_ir_read_while_optimizing(fs_dir)
|
676
511
|
fs_dir.close()
|
677
512
|
end
|
678
513
|
|
@@ -681,25 +516,23 @@ class IndexReaderTest < Test::Unit::TestCase
|
|
681
516
|
'../../temp/fsdir'))
|
682
517
|
fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
|
683
518
|
|
684
|
-
iw = IndexWriter.new(
|
685
|
-
|
686
|
-
|
687
|
-
iw <<
|
519
|
+
iw = IndexWriter.new(:dir => fs_dir,
|
520
|
+
:analyzer => WhiteSpaceAnalyzer.new(),
|
521
|
+
:create => true)
|
522
|
+
iw << {:field => "content"}
|
688
523
|
iw.close()
|
689
524
|
|
690
|
-
ir = IndexReader.
|
525
|
+
ir = IndexReader.new(fs_dir)
|
691
526
|
assert(ir.latest?)
|
692
527
|
|
693
|
-
iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
|
694
|
-
|
695
|
-
doc << Field.new("field", "content2", Field::Store::YES, Field::Index::TOKENIZED)
|
696
|
-
iw << doc
|
528
|
+
iw = IndexWriter.new(:dir => fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
|
529
|
+
iw << {:field => "content2"}
|
697
530
|
iw.close()
|
698
531
|
|
699
532
|
assert(!ir.latest?)
|
700
533
|
|
701
534
|
ir.close()
|
702
|
-
ir = IndexReader.
|
535
|
+
ir = IndexReader.new(fs_dir)
|
703
536
|
assert(ir.latest?)
|
704
537
|
ir.close()
|
705
538
|
end
|