ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,183 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# Expert: A +Scorer+ for documents matching a +Term+.
#
# Document numbers and term frequencies are pulled from the term-doc
# enumerator in batches via +read(@docs, @freqs)+ and walked with
# +@pointer+. The product tf(freq) * weight is precomputed for every
# frequency below SCORE_CACHE_SIZE.
class TermScorer < Scorer
  SCORE_CACHE_SIZE = 32

  # Returns the current document number matching the query.
  # Initially invalid, until #next? is called the first time.
  attr_reader :doc

  # Construct a +TermScorer+.
  #
  # weight::     The weight of the +Term+ in the query.
  # td::         An iterator over the documents matching the +Term+.
  # similarity:: The +Similarity+ implementation to be used for score
  #              computations.
  # norms::      The field norms of the document fields for the +Term+.
  def initialize(weight, td, similarity, norms)
    super(similarity)

    @doc = 0
    @docs = Array.new(SCORE_CACHE_SIZE, 0)  # buffered doc numbers
    @freqs = Array.new(SCORE_CACHE_SIZE, 0) # buffered term freqs
    @pointer = @pointer_max = 0

    @weight = weight
    @term_docs = td
    @norms = norms
    @weight_value = weight.value

    # The parentheses matter: they call the inherited #similarity reader
    # rather than reading the +similarity+ parameter, as the original did.
    @score_cache = Array.new(SCORE_CACHE_SIZE) do |freq|
      similarity().tf(freq) * @weight_value
    end
  end

  # Expert: Iterates over all matching documents, yielding the document
  # number and the score of each one.
  def each_hit # :yields: doc, score
    sim = similarity() # cache sim in local
    while next?
      score = tf_weight(sim, @freqs[@pointer])
      score *= sim.decode_norm(@norms[@doc]) # normalize for field
      yield(@doc, score)
    end
  end

  # Expert: Iterates over matching documents in a range.
  #
  # NOTE: #next? needs to be called first.
  #
  # max::     Do not score documents past this. Default will search all
  #           documents available.
  # returns:: false once the underlying enumerator is exhausted, true if
  #           the loop stopped because the current document reached +max+
  #           (so more matching documents may remain).
  def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
    sim = similarity() # cache sim in local
    while @doc < max   # for docs in window
      score = tf_weight(sim, @freqs[@pointer])
      score *= sim.decode_norm(@norms[@doc]) # normalize for field
      yield(@doc, score)
      return false unless next?
    end
    true
  end

  # Advances to the next document matching the query.
  #
  # The iterator over the matching documents is buffered: when the buffer
  # is used up it is refilled with +@term_docs.read(@docs, @freqs)+.
  #
  # returns:: true iff there is another document matching the query.
  def next?
    @pointer += 1
    if @pointer >= @pointer_max
      @pointer_max = @term_docs.read(@docs, @freqs) # refill buffer
      if @pointer_max.zero?
        @term_docs.close # close stream
        @doc = MAX_DOCS  # set to sentinel value
        return false
      end
      @pointer = 0
    end
    @doc = @docs[@pointer]
    true
  end

  # Returns the score of the current document.
  def score
    raw = tf_weight(similarity(), @freqs[@pointer])
    # NOTE(review): this uses the class-level Similarity.decode_norm while
    # the each_hit* methods use the instance's decode_norm. Kept as in the
    # original; confirm the two are meant to be interchangeable.
    raw * Similarity.decode_norm(@norms[@doc]) # normalize for field
  end

  # Skips to the first match beyond the current whose document number is
  # greater than or equal to a given target.
  #
  # The buffered entries are scanned first; if none qualifies the request
  # is forwarded to +@term_docs.skip_to+.
  #
  # target::  The target document number.
  # returns:: true iff there is such a match.
  def skip_to(target)
    # first scan in cache
    while (@pointer += 1) < @pointer_max
      if @docs[@pointer] >= target
        @doc = @docs[@pointer]
        return true
      end
    end

    # not found in cache, seek underlying stream
    result = @term_docs.skip_to(target)
    if result
      # Rebuild the buffer as a single entry holding the found document.
      @pointer_max = 1
      @pointer = 0
      @docs[@pointer] = @doc = @term_docs.doc
      @freqs[@pointer] = @term_docs.freq
    else
      @doc = MAX_DOCS
    end
    result
  end

  # Returns an explanation of the score for a document.
  #
  # When this method is used, the #next? method and the #score method
  # should not be used: the buffer and the underlying enumerator are
  # consumed and the enumerator is closed.
  #
  # doc:: The document number for the explanation.
  # TODO: Modify to make use of TermDocEnum#skip_to(int).
  def explain(doc)
    query = @weight.query
    tf_explanation = Explanation.new
    tf = 0

    # Look through what is left of the buffer first ...
    while @pointer < @pointer_max
      tf = @freqs[@pointer] if @docs[@pointer] == doc
      @pointer += 1
    end

    # ... then through the rest of the enumerator.
    if tf == 0
      while @term_docs.next?
        tf = @term_docs.freq if @term_docs.doc == doc
      end
    end

    @term_docs.close
    tf_explanation.value = similarity().tf(tf)
    tf_explanation.description = "tf(term_freq(#{query.term})=#{tf})"

    tf_explanation
  end

  # Returns a string representation of this +TermScorer+.
  #
  # Uses interpolation so the weight is converted with #to_s; the previous
  # String#+ concatenation raised TypeError for any non-String weight.
  def to_s
    "scorer(#{@weight})"
  end

  private

  # Returns tf(freq) * weight, served from the precomputed cache when the
  # frequency is small enough.
  def tf_weight(sim, freq)
    if freq < SCORE_CACHE_SIZE
      @score_cache[freq]            # cache hit
    else
      sim.tf(freq) * @weight_value  # cache miss
    end
  end
end
|
183
|
-
end
|
@@ -1,36 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# Expert: Returned by low-level search implementations.
# See Searcher#search
class TopDocs
  # score_docs:: The top hits for the query.
  # total_hits:: Expert: The total number of hits for the query
  #              (also available as #size).
  # fields::     The sort criteria the hits are ordered by.
  attr_accessor :score_docs, :total_hits, :fields
  alias :size :total_hits

  # Iterate through each of the score docs, yielding the document number
  # and the score. eg:
  #
  #   top_docs.each do |doc, score|
  #     puts "Doc number #{doc} found with score of #{score}"
  #   end
  #
  def each
    score_docs.each { |sd| yield(sd.doc, sd.score) }
  end

  # Expert: Constructs a TopDocs.
  def initialize(total_hits, score_docs, fields = SortField::FIELD_SCORE)
    @total_hits = total_hits
    @score_docs = score_docs
    @fields = fields
  end

  # Returns a multi-line summary: a header naming the hit count and the
  # sort fields, followed by one tab-indented line per score doc.
  def to_s
    buffer = "#{total_hits} hits sorted by <"
    # +fields+ may be a single sort field or an array of them. The block
    # variable must be interpolated here; the original interpolated the
    # undefined ivar @field, so the list was always empty.
    buffer << [fields].flatten.map { |field| field.to_s }.join(", ")
    buffer << ">:\n"
    score_docs.each { |sd| buffer << "\t#{sd}\n" }
    buffer
  end
end
|
36
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# Expert: Returned by low-level sorted search implementations.
class TopFieldDocs < TopDocs
  # The fields which were used to sort results by.
  attr_accessor :fields

  # Creates one of these objects.
  #
  # total_hits:: Total number of hits for the query.
  # score_docs:: The top hits for the query.
  # fields::     The sort criteria used to find the top hits.
  def initialize(total_hits, score_docs, fields)
    # Hand +fields+ straight to TopDocs, which stores it in @fields.
    # Omitting it made the parent evaluate its SortField::FIELD_SCORE
    # default only for the value to be overwritten immediately.
    super(total_hits, score_docs, fields)
  end
end
|
17
|
-
end
|
data/lib/ferret/search/weight.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
module Ferret
  module Search
    # Expert: Calculate query weights and build query scorers.
    #
    # The purpose of Weight is to make it so that searching does not modify
    # a Query, so that a Query instance can be reused.
    #
    # Searcher dependent state of the query should reside in the Weight.
    #
    # IndexReader dependent state should reside in the Scorer.
    #
    # A +Weight+ is used in the following way:
    #
    # 1. A +Weight+ is constructed by a top-level query, given a +Searcher+
    #    (See Query#create_weight).
    # 2. The #sum_of_squared_weights() method is called on the +Weight+ to
    #    compute the query normalization factor Similarity#query_norm(float)
    #    of the query clauses contained in the query.
    # 3. The query normalization factor is passed to #normalize().
    #    At this point the weighting is complete.
    # 4. A +Scorer+ is constructed by #scorer()
    #
    # Every method here is abstract: it raises NotImplementedError, with a
    # message naming the concrete class and the missing method, until a
    # subclass overrides it.
    class Weight
      # The query that this concerns.
      def query()
        raise NotImplementedError, "#{self.class}#query must be implemented by a subclass"
      end

      # The weight for this query.
      def value()
        raise NotImplementedError, "#{self.class}#value must be implemented by a subclass"
      end

      # The sum of squared weights of contained query clauses.
      def sum_of_squared_weights()
        raise NotImplementedError, "#{self.class}#sum_of_squared_weights must be implemented by a subclass"
      end

      # Assigns the query normalization factor to this.
      def normalize(norm)
        raise NotImplementedError, "#{self.class}#normalize must be implemented by a subclass"
      end

      # Constructs a scorer for this.
      def scorer(reader)
        raise NotImplementedError, "#{self.class}#scorer must be implemented by a subclass"
      end

      # An explanation of the score computation for the named document.
      def explain(reader, doc)
        raise NotImplementedError, "#{self.class}#explain must be implemented by a subclass"
      end
    end
  end
end
|
@@ -1,26 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# Implements the wildcard search query. Supported wildcards are +*+, which
# matches any character sequence (including the empty one), and +?+, which
# matches any single character. Note this query can be slow, as it needs to
# iterate over many terms. In order to prevent extremely slow
# WildcardQueries, a Wildcard term should not start with one of the
# wildcards +*+ or +?+.
#
# See WildcardTermEnum
class WildcardQuery < MultiTermQuery
  # term:: the term holding the wildcard pattern; passed on to the parent.
  def initialize(term)
    super(term)
  end

  # Builds the enumerator over the terms of +reader+ matching this query.
  def get_term_enum(reader)
    WildcardTermEnum.new(reader, @term)
  end

  # Only another WildcardQuery (exactly this class) can be eql?; the rest
  # of the comparison is delegated to the parent class.
  def eql?(o)
    return false unless o.instance_of?(WildcardQuery)

    super(o)
  end
end
|
26
|
-
end
|
@@ -1,61 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# Subclass of FilteredTermEnum for enumerating all terms that match the
# specified wildcard filter term.
#
# Term enumerations are always ordered by Term.compareTo(). Each term in
# the enumeration is greater than all that precede it.
#
class WildcardTermEnum < FilteredTermEnum
  include Ferret::Index

  # True once #term_compare has seen a term outside the field/prefix.
  attr_reader :end_enum

  WILDCARD_STRING = '*' # matches any character sequence, including none
  WILDCARD_CHAR = '?'   # matches exactly one character

  # Creates a new +WildcardTermEnum+ over the terms of +reader+ matching
  # the wildcard +term+.
  #
  # The text before the first wildcard is kept as a literal prefix; the
  # rest is compiled into a regular expression that is matched against the
  # remainder of each candidate term.
  #
  # The enumeration is started at the first term of the field that is
  # greater than or equal to the prefix (via +reader.terms_from+).
  def initialize(reader, term)
    super()
    @end_enum = false
    @search_term = term
    @field = @search_term.field
    text = @search_term.text

    # Position of the first wildcard, or the text length if there is none.
    idx = [text.index(WILDCARD_STRING), text.index(WILDCARD_CHAR), text.length].compact.min

    @pre = text[0, idx]
    @pre_len = idx

    # Escape everything, then turn the (now escaped) wildcards into their
    # regexp equivalents: '*' => '.*' and '?' => '.'. The original mapped
    # '?' to '.?', which also matched zero characters and so contradicted
    # the "any single character" contract. \A and \z anchor the whole
    # remainder rather than a single line of it.
    tail = Regexp.escape(text[idx..-1]).gsub(/\\([?*])/) do
      $1 == WILDCARD_STRING ? '.*' : '.'
    end
    @pattern = /\A#{tail}\z/

    self.enum = reader.terms_from(Term.new(@search_term.field, @pre))
  end

  # Returns a truthy value when +term+ is in the searched field, starts
  # with the literal prefix and its remainder matches the wildcard pattern.
  #
  # A term from another field or with a different prefix sets #end_enum
  # and returns false.
  def term_compare(term)
    if @field == term.field
      search_text = term.text
      if search_text[0, @pre_len] == @pre
        return (search_text[@pre_len..-1] =~ @pattern)
      end
    end
    @end_enum = true
    false
  end

  # Wildcard matches are not ranked against each other; always 1.0.
  def difference()
    1.0
  end

  # Closes the enumeration and drops the compiled pattern and field.
  def close()
    super()
    @pattern = nil
    @field = nil
  end
end
|
61
|
-
end
|
data/lib/ferret/stemmers.rb
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
require 'ferret/stemmers/porter_stemmer'
|
@@ -1,218 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# $Id: PorterStemmer.rb,v 1.1.1.1 2004/04/17 13:55:20 pragdave Exp $
|
4
|
-
#
|
5
|
-
# See example usage at the end of this file.
|
6
|
-
#
|
7
|
-
|
8
|
-
#
# Porter stemmer in Ruby.
#
# This is the Porter stemming algorithm, ported to Ruby from the
# version coded up in Perl.  It's easy to follow against the rules
# in the original paper in:
#
#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
#   no. 3, pp 130-137,
#
# See also http://www.tartarus.org/~martin/PorterStemmer
#
# Send comments to raypereda@hotmail.com
#
# Usable either as a mixin for String-like objects (#stem / #stem_porter)
# or as a module function: Stemmable.stem_porter(word).
module Stemmable

  # Cache of already stemmed words: original word => frozen stem.
  STEMMED = {}

  # Step 2 suffix => replacement.
  STEP_2_LIST = {
    'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
    'izer'=>'ize', 'bli'=>'ble',
    'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
    'ization'=>'ize', 'ation'=>'ate',
    'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
    'ousness'=>'ous', 'aliti'=>'al',
    'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
  }

  # Step 3 suffix => replacement.
  STEP_3_LIST = {
    'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
    'ical'=>'ic', 'ful'=>'', 'ness'=>''
  }

  # Suffixes handled by step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
                    ational  |
                    tional   |
                    enci     |
                    anci     |
                    izer     |
                    bli      |
                    alli     |
                    entli    |
                    eli      |
                    ousli    |
                    ization  |
                    ation    |
                    ator     |
                    alism    |
                    iveness  |
                    fulness  |
                    ousness  |
                    aliti    |
                    iviti    |
                    biliti   |
                    logi)$/x

  # Suffixes removed by step 4.
  SUFFIX_2_REGEXP = /(
                      al       |
                      ance     |
                      ence     |
                      er       |
                      ic       |
                      able     |
                      ible     |
                      ant      |
                      ement    |
                      ment     |
                      ent      |
                      ou       |
                      ism      |
                      ate      |
                      iti      |
                      ous      |
                      ive      |
                      ize)$/x

  C = "[^aeiou]"            # consonant
  V = "[aeiouy]"            # vowel
  CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
  VV = "#{V}(?>[aeiou]*)"   # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o             # vowel in stem

  # Returns the Porter stem of +w+ (by default, of the receiver).
  #
  # Words shorter than three characters are returned unchanged (the very
  # object that was passed in). Otherwise a new String is returned; the
  # argument is never modified and the cached stem is never handed out
  # directly, so callers may freely mutate what they get back.
  def stem_porter(w = self.to_str.dup)
    return w if w.length < 3

    cached = STEMMED[w]
    return cached.dup if cached

    # Work on a private copy. Several steps below mutate the string in
    # place (w[0] = ..., chop!, <<); done on the caller's object that both
    # changed the caller's string and stored the result under the mutated
    # text instead of the original word.
    original_word = w
    w = w.dup

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w.start_with?('y')

    # Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end

    # Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end

    # Step 1c: terminal y becomes i when the stem contains a vowel
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
    end

    # Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
    end

    # Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      w = stem if stem =~ MGR1
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      w = stem if stem =~ MGR1
    end

    #  Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end

    w.chop! if w =~ /ll$/ && w =~ MGR1

    # and turn initial Y back to y
    w[0] = 'y' if w.start_with?('Y')

    # Cache a frozen copy so neither this return value nor later cache
    # hits can be used to alter the stored stem.
    STEMMED[original_word] = w.dup.freeze

    w
  end


  module_function :stem_porter
  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter
  public :stem

end
|
201
|
-
|
202
|
-
|
203
|
-
#
# When this file is run directly as a script, read words from stdin, one
# per line, and write the stemmed version of each to stdout.
#
if $PROGRAM_NAME == __FILE__
  # Mixing Stemmable into String teaches the String class, and any
  # subclasses of it you might have, how to stem things.
  class String
    include Stemmable
  end

  $stdin.each_line { |line| puts line.strip.stem }
end
|