ferret 0.9.6 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/search/scorer.rb
DELETED
@@ -1,91 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# Expert: Common scoring functionality for different types of queries.
|
3
|
-
#
|
4
|
-
# A +Scorer+ either iterates over documents matching a query, or provides an
|
5
|
-
# explanation of the score for a query for a given document.
|
6
|
-
#
|
7
|
-
# Document scores are computed using a given +Similarity+ implementation.
|
8
|
-
class Scorer
|
9
|
-
attr_reader :similarity
|
10
|
-
MAX_DOCS = 0x7FFFFFFF
|
11
|
-
|
12
|
-
# Constructs a Scorer.
|
13
|
-
# similarity:: The +Similarity+ implementation used by this scorer.
|
14
|
-
def initialize(similarity)
|
15
|
-
@similarity = similarity
|
16
|
-
end
|
17
|
-
|
18
|
-
# Expert: Iterates over all matching documents, yielding the document
|
19
|
-
# number and the score.
|
20
|
-
#
|
21
|
-
# returns:: true if more matching documents may remain.
|
22
|
-
def each_hit() # :yields: doc, score
|
23
|
-
while next?
|
24
|
-
yield(doc(), score())
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
# Expert: Iterates over matching documents in a range.
|
29
|
-
#
|
30
|
-
# max:: Do not score documents past this. Default will search all documents
|
31
|
-
# available.
|
32
|
-
# returns:: true if more matching documents may remain.
|
33
|
-
def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
|
34
|
-
while (next? and doc() < max)
|
35
|
-
yield(doc(), score())
|
36
|
-
end
|
37
|
-
return doc() < max
|
38
|
-
end
|
39
|
-
|
40
|
-
# Advances to the next document matching the query.
|
41
|
-
# returns:: true iff there is another document matching the query.
|
42
|
-
# When this method is used the #explain(int) method should not be used.
|
43
|
-
def next?()
|
44
|
-
raise NotImplementedError
|
45
|
-
end
|
46
|
-
|
47
|
-
# Returns the current document number matching the query.
|
48
|
-
# Initially invalid, until #next?() is called the first time.
|
49
|
-
def doc()
|
50
|
-
raise NotImplementedError
|
51
|
-
end
|
52
|
-
|
53
|
-
# Returns the score for the current document matching the query.
|
54
|
-
# Initially invalid, until #next?() is called the first time.
|
55
|
-
def score()
|
56
|
-
raise NotImplementedError
|
57
|
-
end
|
58
|
-
|
59
|
-
# Skips to the first match beyond the current whose document number is
|
60
|
-
# greater than or equal to a given target.
|
61
|
-
#
|
62
|
-
# When this method is used the #explain(int) method should not be used.
|
63
|
-
#
|
64
|
-
# target:: The target document number.
|
65
|
-
# returns:: true iff there is such a match.
|
66
|
-
#
|
67
|
-
# Behaves as if written:
|
68
|
-
#
|
69
|
-
# def skip_to(target)
|
70
|
-
# begin
|
71
|
-
# return false if not next?()
|
72
|
-
# end while (target > doc())
|
73
|
-
# return true
|
74
|
-
# end
|
75
|
-
#
|
76
|
-
# Most implementations are considerably more efficient than that.
|
77
|
-
def skip_to(target)
|
78
|
-
raise NotImplementedError
|
79
|
-
end
|
80
|
-
|
81
|
-
# Returns an explanation of the score for a document.
|
82
|
-
#
|
83
|
-
# When this method is used, the #next?(), #skip_to(int) and
|
84
|
-
# #score(HitCollector) methods should not be used.
|
85
|
-
#
|
86
|
-
# doc:: The document number for the explanation.
|
87
|
-
def explain(doc)
|
88
|
-
raise NotImplementedError
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
@@ -1,278 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# Expert: Scoring API.
|
3
|
-
# Subclasses implement search scoring.
|
4
|
-
#
|
5
|
-
# The score of query *q* for document *d* is defined
|
6
|
-
# in terms of these methods as follows:
|
7
|
-
#
|
8
|
-
# <table cellpadding="0" cellspacing="0" border="0">
|
9
|
-
# <tr>
|
10
|
-
# <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
|
11
|
-
# <td valign="middle" align="center">
|
12
|
-
# <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
|
13
|
-
# <td valign="middle"><small>
|
14
|
-
# #tf(int) tf(t in d)#
|
15
|
-
# #idf_term(Term,Searcher) idf(t)#
|
16
|
-
# Field#getBoost getBoost(t.field in d)#
|
17
|
-
# #length_norm(String,int) length_norm(t.field in d)
|
18
|
-
# </small></td>
|
19
|
-
# <td valign="middle" rowspan="2"> *
|
20
|
-
# #coord(int,int) coord(q,d)#
|
21
|
-
# #query_norm(float) query_norm(q)
|
22
|
-
# </td>
|
23
|
-
# </tr>
|
24
|
-
# <tr>
|
25
|
-
# <td valign="top" align="right">
|
26
|
-
# <small>t in q</small>
|
27
|
-
# </td>
|
28
|
-
# </tr>
|
29
|
-
# </table>
|
30
|
-
#
|
31
|
-
# See #set_default
|
32
|
-
# See IndexWriter#set_similarity
|
33
|
-
# See Searcher#set_similarity
|
34
|
-
class Similarity
|
35
|
-
|
36
|
-
def Similarity.byte_to_float(b)
|
37
|
-
if (b == 0)
|
38
|
-
return 0.0
|
39
|
-
end
|
40
|
-
mantissa = b & 0x07 # 0x07 = 7 = 0b00000111
|
41
|
-
exponent = (b >> 3) & 0x1F # 0x1f = 31 = 0b00011111
|
42
|
-
return [0,0,(mantissa << 5),(exponent+48)].pack("cccc").unpack("e")[0]
|
43
|
-
end
|
44
|
-
|
45
|
-
def Similarity.float_to_byte(f)
|
46
|
-
if (f <= 0.0) then return 0 end
|
47
|
-
|
48
|
-
bits = [f].pack("e").unpack("cccc")
|
49
|
-
mantissa = (bits[2] & 0xEf) >> 5
|
50
|
-
exponent = (bits[3] - 48)
|
51
|
-
|
52
|
-
if (exponent > 0x1f)
|
53
|
-
exponent = 0x1f # 0x1f = 31 = 0b00011111
|
54
|
-
mantissa = 0x07 # 0x07 = 7 = 0b00000111
|
55
|
-
end
|
56
|
-
|
57
|
-
if (exponent < 0)
|
58
|
-
exponent = 0
|
59
|
-
mantissa = 1
|
60
|
-
end
|
61
|
-
|
62
|
-
return ((exponent<<3) | mantissa)
|
63
|
-
end
|
64
|
-
|
65
|
-
# Cache of decoded bytes
|
66
|
-
NORM_TABLE = Array.new(256) { |i| Similarity.byte_to_float(i) }
|
67
|
-
|
68
|
-
# Decodes a normalization factor stored in an index.
|
69
|
-
# See Similarity#encode_norm(float)
|
70
|
-
def Similarity.decode_norm(b)
|
71
|
-
return NORM_TABLE[b & 0xFF]
|
72
|
-
end
|
73
|
-
|
74
|
-
# Decodes a normalization factor stored in an index.
|
75
|
-
# See Similarity#encode_norm(float)
|
76
|
-
def decode_norm(b)
|
77
|
-
return self.class.decode_norm(b)
|
78
|
-
end
|
79
|
-
|
80
|
-
# Computes the normalization value for a field given the total number of
|
81
|
-
# terms contained in a field. These values, together with field boosts, are
|
82
|
-
# stored in an index and multiplied into scores for hits on each field by the
|
83
|
-
# search code.
|
84
|
-
#
|
85
|
-
# Matches in longer fields are less precise, so implementations of this
|
86
|
-
# method usually return smaller values when *num_tokens* is large,
|
87
|
-
# and larger values when *num_tokens* is small.
|
88
|
-
#
|
89
|
-
# Note that these values are computed under
|
90
|
-
# IndexWriter#add_document and then stored using
|
91
|
-
# #encode_norm(float). Thus they have limited precision, and documents
|
92
|
-
# must be re-indexed if this method is altered.
|
93
|
-
#
|
94
|
-
# field:: the name of the field
|
95
|
-
# num_tokens:: the total number of tokens contained in fields named
|
96
|
-
# _field_ of _doc_.
|
97
|
-
#
|
98
|
-
# See Field#set_boost
|
99
|
-
def length_norm
|
100
|
-
raise NotImplementedError
|
101
|
-
end
|
102
|
-
|
103
|
-
# Computes the normalization value for a query given the sum of the squared
|
104
|
-
# weights of each of the query terms. This value is then multiplied into the
|
105
|
-
# weight of each query term.
|
106
|
-
#
|
107
|
-
# This does not affect ranking, but rather just attempts to make scores
|
108
|
-
# from different queries comparable.
|
109
|
-
#
|
110
|
-
# sum_of_squared_weights:: the sum of the squares of query term weights
|
111
|
-
# Return:: a normalization factor for query weights
|
112
|
-
def query_norm
|
113
|
-
raise NotImplementedError
|
114
|
-
end
|
115
|
-
|
116
|
-
# Encodes a normalization factor for storage in an index.
|
117
|
-
#
|
118
|
-
# The encoding uses a five-bit exponent and three-bit mantissa, thus
|
119
|
-
# representing values from around 7x10^9 to 2x10^-9 with about one
|
120
|
-
# significant decimal digit of accuracy. Zero is also represented.
|
121
|
-
# Negative numbers are rounded up to zero. Values too large to represent
|
122
|
-
# are rounded down to the largest representable value. Positive values too
|
123
|
-
# small to represent are rounded up to the smallest positive representable
|
124
|
-
# value.
|
125
|
-
#
|
126
|
-
# See Field#boost=
|
127
|
-
def Similarity.encode_norm(f)
|
128
|
-
return Similarity.float_to_byte(f)
|
129
|
-
end
|
130
|
-
|
131
|
-
def encode_norm(f)
|
132
|
-
return self.class.float_to_byte(f)
|
133
|
-
end
|
134
|
-
|
135
|
-
# Computes a score factor based on a term or phrase's frequency in a
|
136
|
-
# document. This value is multiplied by the #idf_term(Term, Searcher)
|
137
|
-
# factor for each term in the query and these products are then summed to
|
138
|
-
# form the initial score for a document.
|
139
|
-
#
|
140
|
-
# Terms and phrases repeated in a document indicate the topic of the
|
141
|
-
# document, so implementations of this method usually return larger values
|
142
|
-
# when _freq_ is large, and smaller values when _freq_
|
143
|
-
# is small.
|
144
|
-
#
|
145
|
-
# The default implementation calls #tf(float)
|
146
|
-
#
|
147
|
-
# freq:: the frequency of a term within a document
|
148
|
-
# Return:: a score factor based on a term's within-document frequency
|
149
|
-
def tf
|
150
|
-
raise NotImplementedError
|
151
|
-
end
|
152
|
-
|
153
|
-
# Computes the amount of a sloppy phrase match, based on an edit distance.
|
154
|
-
# This value is summed for each sloppy phrase match in a document to form
|
155
|
-
# the frequency that is passed to #tf(float).
|
156
|
-
#
|
157
|
-
# A phrase match with a small edit distance to a document passage more
|
158
|
-
# closely matches the document, so implementations of this method usually
|
159
|
-
# return larger values when the edit distance is small and smaller values
|
160
|
-
# when it is large.
|
161
|
-
#
|
162
|
-
# See PhraseQuery#slop(int)
|
163
|
-
# distance:: the edit distance of this sloppy phrase match
|
164
|
-
# Return:: the frequency increment for this match
|
165
|
-
def sloppy_freq
|
166
|
-
raise NotImplementedError
|
167
|
-
end
|
168
|
-
|
169
|
-
# Computes a score factor for a simple term.
|
170
|
-
#
|
171
|
-
# The default implementation is:
|
172
|
-
# return idf(searcher.doc_freq(term), searcher.max_doc())
|
173
|
-
#
|
174
|
-
# Note that Searcher#max_doc() is used instead of
|
175
|
-
# IndexReader#num_docs() because it is proportional to
|
176
|
-
# Searcher#doc_freq(Term) , i.e., when one is inaccurate,
|
177
|
-
# so is the other, and in the same direction.
|
178
|
-
#
|
179
|
-
# term:: the term in question
|
180
|
-
# searcher:: the document collection being searched
|
181
|
-
# Return:: a score factor for the term
|
182
|
-
def idf_term(term, searcher)
|
183
|
-
return idf(searcher.doc_freq(term), searcher.max_doc())
|
184
|
-
end
|
185
|
-
|
186
|
-
# Computes a score factor for a phrase.
|
187
|
-
#
|
188
|
-
# The default implementation sums the #idf_term(Term,Searcher) factor
|
189
|
-
# for each term in the phrase.
|
190
|
-
#
|
191
|
-
# terms:: the terms in the phrase
|
192
|
-
# searcher:: the document collection being searched
|
193
|
-
# Return:: a score factor for the phrase
|
194
|
-
def idf_phrase(terms, searcher)
|
195
|
-
idf = 0.0
|
196
|
-
terms.each { |term| idf += idf_term(term, searcher) }
|
197
|
-
return idf
|
198
|
-
end
|
199
|
-
|
200
|
-
# Computes a score factor based on a term's document frequency (the number
|
201
|
-
# of documents which contain the term). This value is multiplied by the
|
202
|
-
# #tf(int) factor for each term in the query and these products are
|
203
|
-
# then summed to form the initial score for a document.
|
204
|
-
#
|
205
|
-
# Terms that occur in fewer documents are better indicators of topic, so
|
206
|
-
# implementations of this method usually return larger values for rare terms,
|
207
|
-
# and smaller values for common terms.
|
208
|
-
#
|
209
|
-
# doc_freq:: the number of documents which contain the term
|
210
|
-
# num_docs:: the total number of documents in the collection
|
211
|
-
# Return:: a score factor based on the term's document frequency
|
212
|
-
def idf
|
213
|
-
raise NotImplementedError
|
214
|
-
end
|
215
|
-
|
216
|
-
# Computes a score factor based on the fraction of all query terms that a
|
217
|
-
# document contains. This value is multiplied into scores.
|
218
|
-
#
|
219
|
-
# The presence of a large portion of the query terms indicates a better
|
220
|
-
# match with the query, so implementations of this method usually return
|
221
|
-
# larger values when the ratio between these parameters is large and smaller
|
222
|
-
# values when the ratio between them is small.
|
223
|
-
#
|
224
|
-
# overlap:: the number of query terms matched in the document
|
225
|
-
# max_overlap:: the total number of terms in the query
|
226
|
-
# Return:: a score factor based on term overlap with the query
|
227
|
-
def coord
|
228
|
-
raise NotImplementedError
|
229
|
-
end
|
230
|
-
end
|
231
|
-
|
232
|
-
# Expert: Default scoring implementation.
|
233
|
-
class DefaultSimilarity < Similarity
|
234
|
-
# See source
|
235
|
-
def length_norm(field, num_terms)
|
236
|
-
return 1.0 / Math.sqrt(num_terms)
|
237
|
-
end
|
238
|
-
|
239
|
-
# See source
|
240
|
-
def query_norm(sum_of_squared_weights)
|
241
|
-
return 1.0 / Math.sqrt(sum_of_squared_weights)
|
242
|
-
end
|
243
|
-
|
244
|
-
# See source
|
245
|
-
def tf(freq)
|
246
|
-
return Math.sqrt(freq)
|
247
|
-
end
|
248
|
-
|
249
|
-
# See source
|
250
|
-
def sloppy_freq(distance)
|
251
|
-
return 1.0 / (distance + 1)
|
252
|
-
end
|
253
|
-
|
254
|
-
# See source
|
255
|
-
def idf(doc_freq, num_docs)
|
256
|
-
return 0.0 if num_docs == 0
|
257
|
-
return Math.log(num_docs.to_f/(doc_freq+1)) + 1.0
|
258
|
-
end
|
259
|
-
|
260
|
-
# See source
|
261
|
-
def coord(overlap, max_overlap)
|
262
|
-
return overlap.to_f / max_overlap
|
263
|
-
end
|
264
|
-
end
|
265
|
-
|
266
|
-
class Similarity
|
267
|
-
# The Similarity implementation used by default.
|
268
|
-
@@default = DefaultSimilarity.new()
|
269
|
-
|
270
|
-
def Similarity.default
|
271
|
-
return @@default
|
272
|
-
end
|
273
|
-
|
274
|
-
def Similarity.default=(default)
|
275
|
-
@@default = default
|
276
|
-
end
|
277
|
-
end
|
278
|
-
end
|
@@ -1,47 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
class SloppyPhraseScorer < PhraseScorer
|
3
|
-
|
4
|
-
def initialize(weight, tps, positions, similarity, slop, norms)
|
5
|
-
super(weight, tps, positions, similarity, norms)
|
6
|
-
@slop = slop
|
7
|
-
end
|
8
|
-
|
9
|
-
def phrase_freq()
|
10
|
-
@pq.clear()
|
11
|
-
last_pos = 0
|
12
|
-
each do |pp|
|
13
|
-
pp.first_position()
|
14
|
-
last_pos = pp.position if (pp.position > last_pos)
|
15
|
-
@pq.push(pp) # build pq from list
|
16
|
-
end
|
17
|
-
|
18
|
-
freq = 0.0
|
19
|
-
done = false
|
20
|
-
begin
|
21
|
-
pp = @pq.pop()
|
22
|
-
pos = start = pp.position
|
23
|
-
next_pos = @pq.top().position
|
24
|
-
while pos <= next_pos
|
25
|
-
start = pos # advance pp to min window
|
26
|
-
if not pp.next_position()
|
27
|
-
done = true # ran out of a term -- done
|
28
|
-
break
|
29
|
-
end
|
30
|
-
pos = pp.position
|
31
|
-
end
|
32
|
-
|
33
|
-
match_length = last_pos - start
|
34
|
-
if (match_length <= @slop)
|
35
|
-
freq += @similarity.sloppy_freq(match_length) # score match
|
36
|
-
end
|
37
|
-
|
38
|
-
if (pp.position > last_pos)
|
39
|
-
last_pos = pp.position
|
40
|
-
end
|
41
|
-
@pq.push(pp) # restore pq
|
42
|
-
end while (!done)
|
43
|
-
|
44
|
-
return freq
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
data/lib/ferret/search/sort.rb
DELETED
@@ -1,112 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# Encapsulates sort criteria for returned hits.
|
3
|
-
#
|
4
|
-
# The fields used to determine sort order must be carefully chosen.
|
5
|
-
# Documents must contain a single term in such a field, and the value of the
|
6
|
-
# term should indicate the document's relative position in a given sort
|
7
|
-
# order. The field must be indexed, but should not be tokenized, and does
|
8
|
-
# not need to be stored (unless you happen to want it back with the rest of
|
9
|
-
# your document data). In other words:
|
10
|
-
#
|
11
|
-
# document << Field.new("by_number",
|
12
|
-
# x.to_s,
|
13
|
-
# Field::Store::NO,
|
14
|
-
# Field::Index::UN_TOKENIZED)
|
15
|
-
#
|
16
|
-
#
|
17
|
-
# === Valid Types of Values
|
18
|
-
#
|
19
|
-
# There are three possible kinds of term values which may be put into
|
20
|
-
# sorting fields: Integers, Floats, or Strings. Unless SortField objects
|
21
|
-
# are specified, the type of value in the field is determined by parsing the
|
22
|
-
# first term in the field.
|
23
|
-
#
|
24
|
-
# Integer term values should contain only digits and an optional preceding
|
25
|
-
# negative sign. Values must be base 10. Documents which should appear
|
26
|
-
# first in the sort should have low value integers, later documents high
|
27
|
-
# values (i.e. the documents should be numbered +1..n+ where +1+ is the
|
28
|
-
# first and +n+ the last).
|
29
|
-
#
|
30
|
-
# Float term values should conform to values accepted by String#to_f.
|
31
|
-
# Documents which should appear first in the sort should have low values,
|
32
|
-
# later documents high values.
|
33
|
-
#
|
34
|
-
# String term values can contain any valid String, but should not be
|
35
|
-
# tokenized. The values are sorted according to their Comparable natural
|
36
|
-
# order. Note that using this type of term value has higher memory
|
37
|
-
# requirements than the other two types.
|
38
|
-
#
|
39
|
-
# === Object Reuse
|
40
|
-
#
|
41
|
-
# One of these objects can be used multiple times and the sort order changed
|
42
|
-
# between usages.
|
43
|
-
#
|
44
|
-
# This class is thread safe.
|
45
|
-
#
|
46
|
-
# === Memory Usage
|
47
|
-
#
|
48
|
-
# Sorting uses caches of term values maintained by the internal HitQueue(s).
|
49
|
-
# The cache is static and contains an integer or float array of length
|
50
|
-
# +IndexReader#max_doc+ for each field name for which a sort is performed.
|
51
|
-
# In other words, the size of the cache in bytes is:
|
52
|
-
#
|
53
|
-
# 4 * IndexReader#max_doc * (# of different fields actually used to sort)
|
54
|
-
#
|
55
|
-
# For String fields, the cache is larger: in addition to the above array,
|
56
|
-
# the value of every term in the field is kept in memory. If there are many
|
57
|
-
# unique terms in the field, this could be quite large.
|
58
|
-
#
|
59
|
-
# Note that the size of the cache is not affected by how many fields are in
|
60
|
-
# the index and _might_ be used to sort - only by the ones actually used to
|
61
|
-
# sort a result set.
|
62
|
-
#
|
63
|
-
# The cache is cleared each time a new +IndexReader+ is passed in, or if the
|
64
|
-
# value returned by +max_doc()+ changes for the current IndexReader. This
|
65
|
-
# class is not set up to be able to efficiently sort hits from more than one
|
66
|
-
# index simultaneously.
|
67
|
-
class Sort
|
68
|
-
|
69
|
-
attr_accessor :fields
|
70
|
-
|
71
|
-
# Sorts by computed relevance. You can pass a string representing the name
|
72
|
-
# of the field you want to sort on, a SortField, or an array of either
|
73
|
-
# (but not a mixed array). If you pass a string or an array of strings
|
74
|
-
# you can also pass a reverse flag. If you pass a SortField the reverse is
|
75
|
-
# handled by it.
|
76
|
-
#
|
77
|
-
# fields:: The fields you want to sort on. See also SortField
|
78
|
-
# reverse:: pass true if you want the sort order to be reversed. Only
|
79
|
-
# works if you pass the field names.
|
80
|
-
def initialize(fields = [SortField::FIELD_SCORE, SortField::FIELD_DOC],
|
81
|
-
reverse = false)
|
82
|
-
fields = [fields] unless fields.is_a?(Array)
|
83
|
-
@fields = fields
|
84
|
-
fields = fields.map {|field| field.is_a?(Symbol) ? field.to_s : field}
|
85
|
-
if fields[0].is_a?(String)
|
86
|
-
@fields = fields.map do |field|
|
87
|
-
if (field.is_a?(String))
|
88
|
-
next SortField.new(field, {:sort_type => SortField::SortType::AUTO,
|
89
|
-
:reverse => reverse})
|
90
|
-
else
|
91
|
-
next field
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
doc_sort_added = false
|
96
|
-
@fields.each {|f| doc_sort_added = true if f == SortField::FIELD_DOC }
|
97
|
-
@fields << SortField::FIELD_DOC if not doc_sort_added
|
98
|
-
end
|
99
|
-
|
100
|
-
# Represents sorting by computed relevance. Using this sort criteria returns
|
101
|
-
# the same results as calling Searcher#search(Query)
|
102
|
-
# without a sort criteria, only with slightly more overhead.
|
103
|
-
RELEVANCE = Sort.new()
|
104
|
-
|
105
|
-
# Represents sorting by index order.
|
106
|
-
INDEX_ORDER = Sort.new(SortField::FIELD_DOC)
|
107
|
-
|
108
|
-
def to_s()
|
109
|
-
return "Sort[" + @fields.map {|field| "#{field}"}.join(", ") + "]"
|
110
|
-
end
|
111
|
-
end
|
112
|
-
end
|