ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/search/filter.rb
DELETED
@@ -1,11 +0,0 @@
|
|
1
|
-
module Ferret::Search
  # Base class for search filters, i.e. objects that limit a search to a
  # subset of the documents in an index.
  #
  # The class is abstract: it only fixes the interface. Calling #bits on a
  # Filter that does not override it raises NotImplementedError.
  class Filter
    # Subclasses return a bit set for +reader+ in which a set bit means the
    # document may appear in search results and a clear bit means it must be
    # left out.
    #
    # reader:: the index reader the bits are computed for
    # raises:: NotImplementedError unless overridden by a subclass
    def bits(reader)
      raise(NotImplementedError)
    end
  end
end
|
@@ -1,130 +0,0 @@
|
|
1
|
-
module Ferret::Search
  # A query that applies a filter to the results of another query.
  #
  # Note: the bits are retrieved from the filter each time this query is used
  # in a search (FilteredWeight#scorer calls +filter.bits+ for every scorer
  # it builds). Use a CachingWrapperFilter to avoid regenerating the bits
  # every time.
  class FilteredQuery < Query
    # The wrapped query whose results are being filtered.
    attr_accessor :sub_query

    # The filter applied to the results of +sub_query+.
    attr_reader :filter

    # Constructs a new query which applies a filter to the results of the
    # original query.
    #
    # Filter.bits() will be called every time this query is used in a search.
    #
    # query::  Query to be filtered, cannot be +nil+.
    # filter:: Filter to apply to query results, cannot be +nil+.
    def initialize(query, filter)
      super()
      @sub_query = query
      @filter = filter
    end

    # Returns a Weight that applies the filter to the enclosed query's Weight.
    # This is accomplished by overriding the Scorer returned by the Weight.
    #
    # searcher:: the searcher the weight is created for
    def create_weight(searcher)
      sub_weight = @sub_query.create_weight(searcher)
      similarity = @sub_query.similarity(searcher)
      FilteredWeight.new(self, sub_weight, similarity)
    end

    # Scorer that delegates iteration to the wrapped scorer and zeroes the
    # score of every document whose bit is not set in +bits+. Filtered
    # documents are therefore still visited; they simply score 0.0.
    class FilteredScorer < Scorer
      # sub_scorer:: the scorer of the wrapped query
      # bits::       object answering +get(doc)+; truthy means "allowed"
      # similarity:: passed straight on to Scorer#initialize
      def initialize(sub_scorer, bits, similarity)
        super(similarity)
        @sub_scorer = sub_scorer
        @bits = bits
      end

      # The following three methods are passed through to the enclosed scorer.

      def next?
        @sub_scorer.next?
      end

      def doc
        @sub_scorer.doc
      end

      def skip_to(i)
        @sub_scorer.skip_to(i)
      end

      # Returns the enclosed scorer's score for the current document, or 0.0
      # if the document has been filtered out.
      def score
        @bits.get(@sub_scorer.doc) ? @sub_scorer.score : 0.0
      end

      # Returns the enclosed scorer's explanation for document +i+ with its
      # description prefixed by whether the filter allowed or removed it.
      def explain(i)
        exp = @sub_scorer.explain(i)
        verdict = @bits.get(i) ? "allowed by filter" : "removed by filter"
        exp.description = "#{verdict}: #{exp.description}"
        exp
      end
    end

    # Weight that forwards everything to the wrapped query's weight except
    # #scorer, which wraps the sub-weight's scorer in a FilteredScorer.
    class FilteredWeight < Weight
      # The FilteredQuery this weight was created for.
      attr_reader :query

      # query::      the owning FilteredQuery (its filter is read in #scorer)
      # sub_weight:: the weight of the wrapped query
      # similarity:: handed to every FilteredScorer this weight creates
      def initialize(query, sub_weight, similarity)
        @query = query
        @sub_weight = sub_weight
        @similarity = similarity
      end

      # The following four methods are passed through to the enclosed
      # query's weight.

      def value
        @sub_weight.value
      end

      def sum_of_squared_weights
        @sub_weight.sum_of_squared_weights
      end

      def normalize(v)
        @sub_weight.normalize(v)
      end

      def explain(ir, i)
        @sub_weight.explain(ir, i)
      end

      # Returns a scorer that overrides the enclosed query's score if the
      # given hit has been filtered out. The filter's bits are fetched anew
      # on every call.
      def scorer(reader)
        sub_scorer = @sub_weight.scorer(reader)
        bits = @query.filter.bits(reader)
        FilteredScorer.new(sub_scorer, bits, @similarity)
      end
    end

    # Rewrites the wrapped query. Returns +self+ when the sub-query rewrites
    # to something equal to itself; otherwise returns a shallow copy of this
    # query (sharing the same filter) that wraps the rewritten sub-query.
    def rewrite(reader)
      rewritten = @sub_query.rewrite(reader)
      return self if rewritten == @sub_query

      copy = clone
      # The writer is called sub_query=; there is no query= on this class, so
      # assigning through +query=+ would raise NoMethodError.
      copy.sub_query = rewritten
      copy
    end

    # Collects the terms of the wrapped query into +terms+.
    def extract_terms(terms)
      @sub_query.extract_terms(terms)
    end

    # Prints a user-readable version of this query.
    #
    # f:: forwarded to the wrapped query's +to_s+
    def to_s(f = nil)
      "filtered(#{@sub_query.to_s(f)})->#{@filter}"
    end

    # Returns true iff +o+ is exactly a FilteredQuery with an equal sub-query
    # and an equal filter.
    def eql?(o)
      o.instance_of?(FilteredQuery) &&
        (@sub_query == o.sub_query) && (@filter == o.filter)
    end
    alias :== :eql?

    # Returns a hash code value for this object, consistent with #eql?.
    def hash
      @sub_query.hash ^ @filter.hash
    end
  end
end
|
@@ -1,79 +0,0 @@
|
|
1
|
-
module Ferret::Search

  # Abstract class for enumerating a subset of all terms.
  #
  # Term enumerations are always ordered by Term.<=>(). Each term in
  # the enumeration is greater than all that precede it.
  #
  # Subclasses supply #term_compare, #difference and #end_enum and hand the
  # underlying enumeration over through #enum=.
  class FilteredTermEnum < Ferret::Index::TermEnum

    # Returns the current Term in the enumeration.
    # Returns nil if no Term matches or all terms have been enumerated.
    attr_reader :term

    def initialize()
      @term = nil
      @enum = nil
      @reader = nil
    end

    # Equality compare on the term. Must return a true value when +term+
    # belongs to the filtered enumeration.
    def term_compare(term)
      raise NotImplementedError
    end

    # Equality measure on the term
    def difference()
      raise NotImplementedError
    end

    # Indicates the end of the enumeration has been reached
    def end_enum()
      raise NotImplementedError
    end

    # Installs the underlying enumeration and positions this enumeration on
    # the first term accepted by #term_compare, if there is one.
    def enum=(enum)
      @enum = enum
      # Find the first term that matches
      first = @enum.term()
      if !first.nil? && term_compare(first)
        @term = first
      else
        next?
      end
    end

    # Returns the doc_freq of the current Term in the enumeration.
    # Returns -1 if no underlying enumeration has been set (or it has been
    # closed).
    def doc_freq()
      return -1 if @enum.nil?
      @enum.doc_freq()
    end

    # Increments the enumeration to the next element. True if one exists.
    #
    # Terms rejected by #term_compare are skipped. The scan stops, returning
    # false with #term left at nil, as soon as #end_enum reports true or the
    # underlying enumeration is exhausted.
    def next?()
      return false if @enum.nil? # enum not initialized
      @term = nil
      until end_enum() || !@enum.next?
        candidate = @enum.term()
        if term_compare(candidate)
          @term = candidate
          return true
        end
      end
      false
    end

    # Closes the enumeration to further activity, freeing resources.
    #
    # Safe to call when no enumeration was ever set or when the enumeration
    # has already been closed, matching the nil handling of #doc_freq and
    # #next?.
    def close()
      @enum.close() unless @enum.nil?
      @term = nil
      @enum = nil
    end
  end
end
|
@@ -1,154 +0,0 @@
|
|
1
|
-
module Ferret::Search
  # Implements the fuzzy search query. The similarity measurement
  # is based on the Levenshtein (distance) algorithm.
  class FuzzyQuery < MultiTermQuery
    @@default_min_similarity = 0.5
    @@default_prefix_length = 0

    # Returns the minimum similarity used when none is passed to ::new.
    def self.default_min_similarity()
      return @@default_min_similarity
    end

    # Sets the minimum similarity used when none is passed to ::new.
    #
    # raises:: ArgumentError unless 0.0 <= minimum_similarity < 1.0
    def self.default_min_similarity=(minimum_similarity)
      if minimum_similarity >= 1.0
        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
      elsif minimum_similarity < 0.0
        raise ArgumentError, "minimum_similarity cannot be less than 0"
      end
      @@default_min_similarity = minimum_similarity
    end

    # Returns the prefix length used when none is passed to ::new.
    def self.default_prefix_length()
      return @@default_prefix_length
    end

    # Sets the prefix length used when none is passed to ::new.
    #
    # raises:: ArgumentError if prefix_length < 0
    def self.default_prefix_length=(prefix_length)
      if prefix_length < 0
        raise ArgumentError, "prefix_length cannot be less than 0"
      end
      @@default_prefix_length = prefix_length
    end

    attr_reader :prefix_length, :minimum_similarity

    # Create a new FuzzyQuery that will match terms with a similarity
    # of at least +minimum_similarity+ to +term+.
    # If a +prefix_length+ > 0 is specified, a common prefix
    # of that length is also required.
    #
    # term::               the term to search for
    # minimum_similarity:: a value between 0 and 1 to set the required
    #                      similarity between the query term and the matching
    #                      terms. For example, for a +minimum_similarity+ of
    #                      <tt>0.5</tt> a term of the same length as the query
    #                      term is considered similar to the query term if the
    #                      edit distance between both terms is less than
    #                      <tt>length(term)*0.5</tt>
    # prefix_length::      length of common (non-fuzzy) prefix. This is the
    #                      number of characters at the start of a term that
    #                      must be identical (not fuzzy) to the query term if
    #                      the query is to match that term.
    # raises::             ArgumentError if minimum_similarity is >= 1 or < 0
    #                      or if prefix_length < 0
    def initialize(term,
                   minimum_similarity = @@default_min_similarity,
                   prefix_length = @@default_prefix_length)
      super(term)

      if minimum_similarity >= 1.0
        raise ArgumentError, "minimum_similarity >= 1"
      elsif minimum_similarity < 0.0
        raise ArgumentError, "minimum_similarity < 0"
      end

      if prefix_length < 0
        raise ArgumentError, "prefix_length < 0"
      end

      @minimum_similarity = minimum_similarity
      @prefix_length = prefix_length
    end

    # Returns the FuzzyTermEnum that enumerates the terms of +reader+ matching
    # this query's term, minimum similarity and prefix length.
    def get_term_enum(reader)
      return FuzzyTermEnum.new(reader, @term, @minimum_similarity, @prefix_length)
    end

    # Expands this query into a BooleanQuery of optional TermQuery clauses,
    # one per matching term, each boosted by this query's boost times the
    # term's fuzzy score. At most BooleanQuery.max_clause_count terms are
    # kept; ScoreTermQueue decides which.
    def rewrite(reader)
      fuzzy_enum = get_term_enum(reader)
      max_clause_count = BooleanQuery.max_clause_count
      st_queue = ScoreTermQueue.new(max_clause_count)

      # Lowest score currently held in the queue. It has to outlive a single
      # loop iteration: resetting it on every pass would make the
      # "score > min_score" test below compare against 0.0 every time.
      min_score = 0.0

      begin
        # The enumeration already points at its first term (if any), so the
        # current term is examined before advancing.
        loop do
          t = fuzzy_enum.term()
          if t
            score = fuzzy_enum.difference()

            # terms come in alphabetical order, therefore if queue is full and
            # score not bigger than min_score, we can skip
            if st_queue.size < max_clause_count || score > min_score
              st_queue.insert(ScoreTerm.new(t, score))
              min_score = st_queue.top.score # maintain min_score
            end
          end
          break unless fuzzy_enum.next?
        end
      ensure
        fuzzy_enum.close()
      end

      bq = BooleanQuery.new(true)
      st_queue.size.times do
        st = st_queue.pop()
        tq = TermQuery.new(st.term)                    # found a match
        tq.boost = boost() * st.score                  # set the boost
        bq.add_query(tq, BooleanClause::Occur::SHOULD) # add to query
      end

      return bq
    end

    # Prints a user-readable version of this query: the field (unless it is
    # +field+), the term text followed by "~", the minimum similarity when it
    # is not 0.5, and "^boost" when the boost is not 1.0.
    def to_s(field = nil)
      buffer = ""
      buffer << "#{@term.field}:" if @term.field != field
      buffer << "#{@term.text}~"
      buffer << minimum_similarity.to_s if minimum_similarity != 0.5
      buffer << "^#{boost()}" if boost() != 1.0
      return buffer
    end

    # A matching term together with its fuzzy score.
    class ScoreTerm
      attr_accessor :term, :score

      def initialize(term, score)
        @term = term
        @score = score
      end
    end

    # Priority queue of ScoreTerm objects used by FuzzyQuery#rewrite.
    class ScoreTermQueue < Ferret::Utils::PriorityQueue

      # See PriorityQueue#less_than(o1, o2)
      #
      # Orders primarily by score. Two entries with the same score are
      # ordered by term, the greater term counting as "less".
      def less_than(st1, st2)
        if st1.score == st2.score
          return st1.term > st2.term
        else
          return st1.score < st2.score
        end
      end
    end

    # Returns true iff +o+ is exactly a FuzzyQuery that the superclass
    # considers equal and that has the same minimum similarity and prefix
    # length.
    def eql?(o)
      return (o.instance_of?(FuzzyQuery) && super(o) &&
              (@minimum_similarity == o.minimum_similarity) &&
              (@prefix_length == o.prefix_length))
    end
    alias :== :eql?

    # Returns a hash code value for this object, consistent with #eql?.
    def hash()
      return super() ^ @minimum_similarity.hash ^ @prefix_length.hash
    end
  end
end
|
@@ -1,247 +0,0 @@
|
|
1
|
-
require 'monitor'
|
2
|
-
|
3
|
-
module Ferret::Search
  # FilteredTermEnum that enumerates every term which is similar enough to a
  # given pattern term, where similarity is derived from the Levenshtein
  # distance.
  #
  # Term enumerations are always ordered by Term.compareTo(). Each term in
  # the enumeration is greater than all that precede it.
  class FuzzyTermEnum < FilteredTermEnum
    include MonitorMixin

    include Ferret::Index

    # True once a term outside the pattern's field or prefix has been seen.
    attr_reader :end_enum

    # This should be somewhere around the average long word.
    # Too large wastes time and space; too small costs a little time growing
    # the distance array whenever a longer word turns up.
    TYPICAL_LONGEST_WORD_IN_INDEX = 19

    # Builds an enumeration over the terms of +reader+ that share a prefix of
    # +prefix_length+ characters with +term+ and whose fuzzy similarity is
    # greater than +minimum_similarity+.
    #
    # Once the constructor returns, the enumeration is already positioned on
    # the first valid term, provided one exists.
    #
    # reader::             Delivers terms.
    # term::               Pattern term.
    # minimum_similarity:: Minimum required similarity for terms from the
    #                      reader. Defaults to
    #                      FuzzyQuery.default_min_similarity.
    # prefix_length::      Length of required common prefix. Defaults to
    #                      FuzzyQuery.default_prefix_length.
    # raises::             ArgumentError if minimum_similarity is >= 1 or < 0
    #                      or if prefix_length < 0
    def initialize(reader, term,
                   minimum_similarity = FuzzyQuery.default_min_similarity,
                   prefix_length = FuzzyQuery.default_prefix_length)
      super()

      raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1" if minimum_similarity >= 1.0
      raise ArgumentError, "minimum_similarity cannot be less than 0" if minimum_similarity < 0.0
      raise ArgumentError, "prefix_length cannot be less than 0" if prefix_length < 0

      @reader = reader
      @end_enum = false

      @minimum_similarity = minimum_similarity
      @scale_factor = 1.0 / (1.0 - @minimum_similarity)
      @search_term = term
      @field = @search_term.field

      # A prefix longer than the word is capped at the word's length, which
      # amounts to requiring an exact match of the entire word.
      term_length = @search_term.text.length
      @prefix_length = prefix_length > term_length ? term_length : prefix_length

      @text = @search_term.text[@prefix_length..-1]
      @prefix = @search_term.text[0, @prefix_length]

      @max_distances = Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)
      initialize_max_distances()

      # Allocated once so that #similarity does not have to build a new
      # matrix on every call.
      @d = init_distance_array()

      self.enum = reader.terms_from(Term.new(@search_term.field, @prefix))
    end

    # Accepts +term+ when it lies in the pattern's field, starts with the
    # required prefix and its remainder is more similar to the pattern's
    # remainder than the minimum similarity. A term outside the field or
    # prefix marks the end of the enumeration.
    def term_compare(term)
      in_range = (@field == term.field) &&
                 (term.text[0, @prefix_length] == @prefix)
      unless in_range
        @end_enum = true
        return false
      end

      remainder = term.text[@prefix_length..-1]
      @similarity = similarity(remainder)
      @similarity > @minimum_similarity
    end

    # Distance of the last compared term's similarity above the minimum
    # similarity, multiplied by 1 / (1 - minimum_similarity).
    def difference()
      @scale_factor * (@similarity - @minimum_similarity)
    end

    # ****************************
    # Compute Levenshtein distance
    # ****************************

    # Returns the smallest of the three arguments.
    def min(a, b, c)
      smallest = b
      smallest = a if a < b
      smallest = c unless smallest < c
      smallest
    end

    # Builds the distance matrix: one row per character of the pattern text
    # plus one, each row TYPICAL_LONGEST_WORD_IN_INDEX entries wide.
    def init_distance_array()
      (0..@text.length).map { Array.new(TYPICAL_LONGEST_WORD_IN_INDEX) }
    end

    # Returns a number that is 1.0 or less (possibly negative) expressing how
    # similar the pattern text is to +target+. Unless it bails out with 0.0
    # (see below), the value is
    #
    #   1 - (edit_distance / length)
    #
    # where length is the length of the shorter of the two strings plus the
    # length of the shared prefix, and edit_distance is the Levenshtein
    # distance between the two strings.
    #
    # The Levenshtein computation is fail-fast: it is abandoned, and exactly
    # 0.0 is returned, as soon as the distance is known to exceed the maximum
    # distance worth considering. That maximum is
    #
    #   (1 - minimum_similarity) * length
    #
    # with length as above. It is the largest distance for which
    #
    #   similarity = 1 - (distance / (prefix_length + [textlen, targetlen].min))
    #   return (similarity > minimum_similarity)
    #
    # can still hold.
    #
    # Levenshtein distance (also known as edit distance) measures the
    # similarity of two strings as the number of character deletions,
    # insertions or substitutions needed to turn one into the other.
    #
    # target::  the target word or phrase
    # returns:: the similarity; 0.0 or less means the match is below the
    #           required threshold and 1.0 means that the text and target are
    #           identical
    def similarity(target)
      synchronize do
        target_len = target.length
        text_len = @text.length

        # With one side empty the edit distance is just the length of the
        # other side, so no matrix is needed.
        if text_len == 0 || target_len == 0
          return 0.0 if @prefix_length == 0
          other_len = text_len == 0 ? target_len : text_len
          return 1.0 - (other_len.to_f / @prefix_length)
        end

        limit = max_distance(target_len)

        # The difference in length is a lower bound for the edit distance.
        # E.g. "pre" (3) against "prefixes" (8) needs at least 5 edits, so
        # with a limit of 4 the word can be discarded unseen.
        return 0.0 if limit < (target_len - text_len).abs

        # Make sure the rows are wide enough for this target.
        grow_distance_array(target_len) if @d[0].length <= target_len

        # First column and first row of the matrix.
        (text_len + 1).times { |row| @d[row][0] = row }
        (target_len + 1).times { |col| @d[0][col] = col }

        1.upto(text_len) do |i|
          row_best = target_len
          text_char = @text[i - 1]
          current = @d[i]
          previous = @d[i - 1]

          1.upto(target_len) do |j|
            if text_char != target[j - 1]
              current[j] = min(previous[j], current[j - 1], previous[j - 1]) + 1
            else
              current[j] = min(previous[j] + 1, current[j - 1] + 1, previous[j - 1])
            end
            row_best = current[j] if current[j] < row_best
          end

          # Once row i is complete its smallest entry bounds the final
          # distance from below. Equal to the limit is still fine; anything
          # above it means this target cannot qualify any more.
          return 0.0 if i > limit && row_best > limit
        end

        # This drops below 0.0 when the edit distance exceeds the length of
        # the shorter word. The formula is kept as it was previously used in
        # FuzzyTermEnum.
        shorter = text_len < target_len ? text_len : target_len
        return 1.0 - (@d[text_len][target_len].to_f / (@prefix_length + shorter))
      end
    end

    # Replaces every row of the distance matrix with a fresh row of m + 1
    # entries so that a target of length +m+ fits. Old contents are dropped.
    def grow_distance_array(m)
      @d = Array.new(@d.length) { Array.new(m + 1) }
    end

    # The maximum Levenshtein distance between the pattern text and a value
    # of length +m+ that can still score better than the minimum similarity.
    # Results are cached per length.
    #
    # m::       the length of the "other value"
    # returns:: the maximum levenshtein distance that we care about
    def max_distance(m)
      cached = @max_distances[m]
      return cached if cached
      @max_distances[m] = calculate_max_distance(m)
    end

    # Precomputes the maximum distance for every length the cache was
    # allocated for.
    def initialize_max_distances()
      slots = @max_distances.length
      slots.times { |len| @max_distances[len] = calculate_max_distance(len) }
    end

    # (1 - minimum_similarity) times the comparable length, i.e. the shorter
    # of the pattern text and +m+ plus the prefix length.
    def calculate_max_distance(m)
      comparable_length = [@text.length, m].min + @prefix_length
      (1 - @minimum_similarity) * comparable_length
    end
  end
end
|