ferret 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
@@ -0,0 +1,11 @@
|
|
1
|
+
module Ferret
  module Search
    # Abstract base class providing a mechanism to restrict searches to a
    # subset of an index.
    #
    # Concrete subclasses must override #bits.
    class Filter
      # Returns a bit set (any object responding to #get) with true for
      # documents which should be permitted in search results, and false
      # for those that should not.
      #
      # reader:: the index reader used to build the bit set
      # raises:: NotImplementedError unless overridden by a subclass
      def bits(reader)
        raise NotImplementedError
      end
    end
  end
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
module Ferret::Search
  # A query that applies a filter to the results of another query.
  #
  # Note: the bits are retrieved from the filter each time this query is
  # used in a search - use a CachingWrapperFilter to avoid regenerating
  # the bits every time.
  class FilteredQuery < Query
    attr_accessor :sub_query
    attr_reader :filter

    # Constructs a new query which applies a filter to the results of the
    # original query.
    #
    # Filter#bits will be called every time this query is used in a search.
    #
    # query::  Query to be filtered, cannot be +nil+.
    # filter:: Filter to apply to query results, cannot be +nil+.
    def initialize(query, filter)
      super()
      @sub_query = query
      @filter = filter
    end

    # Returns a Weight that applies the filter to the enclosed query's
    # Weight. This is accomplished by overriding the Scorer returned by
    # the Weight.
    def create_weight(searcher)
      sub_weight = @sub_query.create_weight(searcher)
      similarity = @sub_query.similarity(searcher)
      return FilteredWeight.new(self, sub_weight, similarity)
    end

    # Scorer that delegates to the wrapped query's scorer but zeroes the
    # score of any document the filter's bit set excludes.
    class FilteredScorer < Scorer
      def initialize(sub_scorer, bits, similarity)
        super(similarity)
        @sub_scorer = sub_scorer
        @bits = bits
      end

      # pass these methods through to the enclosed scorer
      def next?() return @sub_scorer.next?; end
      def doc() return @sub_scorer.doc; end
      def skip_to(i) return @sub_scorer.skip_to(i); end

      # if the document has been filtered out, set score to 0.0
      def score()
        return (@bits.get(@sub_scorer.doc) ? @sub_scorer.score() : 0.0)
      end

      # add an explanation about whether the document was filtered
      def explain(i)
        exp = @sub_scorer.explain(i)
        if (@bits.get(i))
          exp.description = "allowed by filter: #{exp.description}"
        else
          exp.description = "removed by filter: #{exp.description}"
        end
        return exp
      end
    end

    # Weight that wraps the sub-query's weight, delegating value and
    # normalization logic and producing FilteredScorer instances.
    class FilteredWeight < Weight
      attr_reader :query

      def initialize(query, sub_weight, similarity)
        @query = query
        @sub_weight = sub_weight
        @similarity = similarity
      end

      # pass these methods through to enclosed query's weight
      def value()
        return @sub_weight.value
      end

      def sum_of_squared_weights()
        return @sub_weight.sum_of_squared_weights
      end

      def normalize(v)
        return @sub_weight.normalize(v)
      end

      def explain(ir, i)
        return @sub_weight.explain(ir, i)
      end

      # return a scorer that overrides the enclosed query's score if
      # the given hit has been filtered out.
      def scorer(reader)
        scorer = @sub_weight.scorer(reader)
        bits = @query.filter.bits(reader)
        return FilteredScorer.new(scorer, bits, @similarity)
      end
    end

    # Rewrites the wrapped query, returning a clone of self holding the
    # rewritten sub-query when the rewrite changed anything.
    def rewrite(reader)
      rewritten = @sub_query.rewrite(reader)
      if (rewritten != @sub_query)
        clone = self.clone()
        # FIX: was +clone.query = rewritten+, but this class defines only
        # a +sub_query+ accessor, so every rewrite raised NoMethodError.
        clone.sub_query = rewritten
        return clone
      else
        return self
      end
    end

    # inherit javadoc
    def extract_terms(terms)
      @sub_query.extract_terms(terms)
    end

    # Prints a user-readable version of this query.
    def to_s(f = nil)
      return "filtered(#{@sub_query.to_s(f)})->#{@filter}"
    end

    # Returns true iff +o+ is equal to this.
    def eql?(o)
      return (o.instance_of?(FilteredQuery) and
              (@sub_query == o.sub_query) and (@filter == o.filter))
    end
    alias :== :eql?

    # Returns a hash code value for this object.
    def hash()
      return @sub_query.hash ^ @filter.hash
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module Ferret::Search
  # Abstract class for enumerating a subset of all terms.
  #
  # Term enumerations are always ordered by Term.<=>(). Each term in
  # the enumeration is greater than all that precede it.
  class FilteredTermEnum < Ferret::Index::TermEnum
    # Returns the current Term in the enumeration.
    # Returns nil if no Term matches or all terms have been enumerated.
    attr_reader :term

    def initialize()
      @term = nil
      @enum = nil
      @reader = nil
    end

    # Equality compare on the term. Must be overridden by subclasses.
    def term_compare(term)
      raise NotImplementedError
    end

    # Equality measure on the term. Must be overridden by subclasses.
    def difference()
      raise NotImplementedError
    end

    # Indicates the end of the enumeration has been reached.
    # Must be overridden by subclasses.
    def end_enum()
      raise NotImplementedError
    end

    # Sets the source enumeration and positions this enumeration on the
    # first term that satisfies term_compare, if any.
    def enum=(enum)
      @enum = enum
      # Find the first term that matches
      term = @enum.term()
      if (term != nil and term_compare(term))
        @term = term
      else
        next?
      end
    end

    # Returns the doc_freq of the current Term in the enumeration.
    # Returns -1 if no Term matches or all terms have been enumerated.
    def doc_freq()
      if (@enum == nil)
        return -1
      end
      return @enum.doc_freq()
    end

    # Increments the enumeration to the next element. True if one exists.
    def next?()
      return false if (@enum == nil) # enum not initialized
      @term = nil
      while @term.nil?
        if end_enum() or ! @enum.next?
          return false
        end
        term = @enum.term()
        if (term_compare(term))
          @term = term
          return true
        end
      end
      # NOTE(review): unreachable - the loop above always exits via one of
      # the returns. Kept only to preserve the original structure.
      @term = nil
      return false
    end

    # Closes the enumeration to further activity, freeing resources.
    def close()
      @enum.close()
      @term = nil
      @enum = nil
    end
  end
end
|
@@ -0,0 +1,153 @@
|
|
1
|
+
module Ferret::Search
  # Implements the fuzzy search query. The similarity measurement
  # is based on the Levenshtein (edit distance) algorithm.
  class FuzzyQuery < MultiTermQuery
    @@default_min_similarity = 0.5
    @@default_prefix_length = 0

    def FuzzyQuery.default_min_similarity()
      return @@default_min_similarity
    end

    def FuzzyQuery.default_min_similarity=(minimum_similarity)
      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity cannot be less than 0"
      end
      @@default_min_similarity = minimum_similarity
    end

    def FuzzyQuery.default_prefix_length()
      return @@default_prefix_length
    end

    def FuzzyQuery.default_prefix_length=(prefix_length)
      if (prefix_length < 0)
        raise ArgumentError, "prefix_length cannot be less than 0"
      end
      @@default_prefix_length = prefix_length
    end

    attr_reader :prefix_length, :minimum_similarity

    # Create a new FuzzyQuery that will match terms with a similarity of
    # at least +minimum_similarity+ to +term+. If a +prefix_length+ > 0 is
    # specified, a common prefix of that length is also required.
    #
    # term::               the term to search for
    # minimum_similarity:: a value between 0 and 1 setting the required
    #                      similarity between the query term and matching
    #                      terms. E.g. for 0.5, a term of the same length
    #                      as the query term matches if the edit distance
    #                      is less than <tt>length(term)*0.5</tt>
    # prefix_length::      number of characters at the start of a term
    #                      that must be identical to the query term
    # raises::             ArgumentError if minimum_similarity is >= 1 or
    #                      < 0, or if prefix_length < 0
    def initialize(term,
                   minimum_similarity = @@default_min_similarity,
                   prefix_length = @@default_prefix_length)
      super(term)

      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity >= 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity < 0"
      end

      if (prefix_length < 0)
        raise ArgumentError, "prefix_length < 0"
      end

      @minimum_similarity = minimum_similarity
      @prefix_length = prefix_length
    end

    def get_term_enum(reader)
      return FuzzyTermEnum.new(reader, @term, @minimum_similarity, @prefix_length)
    end

    # Expands this fuzzy query into a BooleanQuery of the best-matching
    # terms (at most BooleanQuery.max_clause_count of them).
    def rewrite(reader)
      fuzzy_enum = get_term_enum(reader)
      max_clause_count = BooleanQuery.max_clause_count
      st_queue = ScoreTermQueue.new(max_clause_count)

      # FIX: min_score was reset to 0.0 inside the loop, which defeated
      # the "skip when queue is full and score too low" pruning below.
      min_score = 0.0
      begin
        begin
          t = fuzzy_enum.term()
          if t
            score = fuzzy_enum.difference()
            # terms come in alphabetical order, therefore if queue is full
            # and score not bigger than min_score, we can skip
            if (st_queue.size < max_clause_count or score > min_score)
              st_queue.insert(ScoreTerm.new(t, score))
              min_score = st_queue.top.score # maintain min_score
            end
          end
        end while fuzzy_enum.next?
      ensure
        fuzzy_enum.close()
      end

      bq = BooleanQuery.new(true)
      st_queue.size.times do |i|
        st = st_queue.pop()
        tq = TermQuery.new(st.term)                    # found a match
        tq.boost = boost() * st.score                  # set the boost
        bq.add_query(tq, BooleanClause::Occur::SHOULD) # add to query
      end

      return bq
    end

    def to_s(field = nil)
      buffer = ""
      buffer << "#{@term.field}:" if @term.field != field
      buffer << "#{@term.text}~#{minimum_similarity}"
      buffer << "^#{boost()}" if (boost() != 1.0)
      return buffer
    end

    # Simple (term, score) pair stored in the ScoreTermQueue.
    class ScoreTerm
      attr_accessor :term, :score

      def initialize(term, score)
        @term = term
        @score = score
      end
    end

    # Priority queue keeping the worst candidate on top so it can be
    # displaced when a better-scoring term arrives.
    class ScoreTermQueue < Ferret::Utils::PriorityQueue
      # See PriorityQueue#less_than(o1, o2)
      def less_than(st1, st2)
        # FIX: was +st1.score == st1.score+ (always true), which ordered
        # the queue purely by term and ignored scores entirely
        if (st1.score == st2.score)
          return st1.term > st2.term
        else
          return st1.score < st2.score
        end
      end
    end

    def eql?(o)
      # FIX: was +fuzzyQuery.prefix_length+ - an undefined local that
      # raised NameError whenever the similarity values matched
      return (o.instance_of?(FuzzyQuery) and super(o) and
              (@minimum_similarity == o.minimum_similarity) and
              (@prefix_length == o.prefix_length))
    end
    alias :== :eql?

    def hash()
      return super ^ @minimum_similarity.hash ^ @prefix_length.hash
    end
  end
end
|
@@ -0,0 +1,244 @@
|
|
1
|
+
module Ferret::Search
  # Subclass of FilteredTermEnum for enumerating all terms that are
  # similar to the specified filter term.
  #
  # Term enumerations are always ordered by Term.<=>(). Each term in
  # the enumeration is greater than all that precede it.
  class FuzzyTermEnum < FilteredTermEnum
    include Ferret::Index
    attr_reader :end_enum

    # This should be somewhere around the average long word.
    # If it is longer, we waste time and space. If it is shorter, we waste
    # a little bit of time growing the array as we encounter longer words.
    TYPICAL_LONGEST_WORD_IN_INDEX = 19

    # Constructor for enumeration of all terms from specified +reader+
    # which share a prefix of length +prefix_length+ with +term+ and which
    # have a fuzzy similarity > +min_similarity+.
    #
    # After calling the constructor the enumeration is already pointing to
    # the first valid term if such a term exists.
    #
    # reader::         Delivers terms.
    # term::           Pattern term.
    # min_similarity:: Minimum required similarity for terms from the
    #                  reader. Default value is 0.5.
    # prefix_length::  Length of required common prefix. Default value
    #                  is 0.
    # raises::         ArgumentError if minimum_similarity is >= 1 or < 0,
    #                  or if prefix_length < 0
    def initialize(reader, term,
                   minimum_similarity = FuzzyQuery.default_min_similarity,
                   prefix_length = FuzzyQuery.default_prefix_length)
      super()

      @reader = reader
      @end_enum = false
      @max_distances = Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)

      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity cannot be less than 0"
      end
      if (prefix_length < 0)
        raise ArgumentError, "prefix_length cannot be less than 0"
      end

      @minimum_similarity = minimum_similarity
      @scale_factor = 1.0 / (1.0 - @minimum_similarity)
      @search_term = term
      @field = @search_term.field

      # The prefix could be longer than the word.
      # It's kind of silly though. It means we must match the entire word.
      term_length = @search_term.text.length
      if prefix_length > term_length
        @prefix_length = term_length
      else
        @prefix_length = prefix_length
      end

      @text = @search_term.text[@prefix_length..-1]
      @prefix = @search_term.text[0, @prefix_length]

      initialize_max_distances()

      # Allows us to save the time required to create a new array every
      # time similarity is called.
      @d = init_distance_array()

      self.enum = reader.terms_from(Term.new(@search_term.field, @prefix))
    end

    # The term_compare method in FuzzyTermEnum uses Levenshtein distance
    # to calculate the distance between the given term and the comparing
    # term. Sets @end_enum once a term outside the prefix/field range is
    # seen (terms arrive in sorted order, so no later term can match).
    def term_compare(term)
      if (@field == term.field and term.text[0, @prefix_length] == @prefix)
        target = term.text[@prefix_length..-1]
        @similarity = similarity(target)
        return (@similarity > @minimum_similarity)
      end
      @end_enum = true
      return false
    end

    # Score of the current term, scaled into (0, 1] relative to the
    # minimum similarity threshold.
    def difference()
      return (@scale_factor * (@similarity - @minimum_similarity))
    end

    # ****************************
    # Compute Levenshtein distance
    # ****************************

    # Finds and returns the smallest of three integers
    def min(a, b, c)
      t = (a < b) ? a : b
      return (t < c) ? t : c
    end

    def init_distance_array()
      return Array.new(@text.length() + 1) {Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)}
    end

    # Similarity returns a number that is 1.0 or less (including negative
    # numbers) based on how similar the Term is compared to a target term.
    # Returns:
    #
    #   1 - (edit_distance / length)
    #
    # where length is the length of the shortest term (text or target)
    # including a prefix that are identical, and edit_distance is the
    # Levenshtein distance for the two words.
    #
    # Embedded within this algorithm is a fail-fast Levenshtein distance
    # algorithm: the computation is aborted (returning 0.0) as soon as it
    # is known the minimum distance between the words exceeds the
    # threshold derived from +minimum_similarity+.
    #
    # target:: the target word or phrase (with the common prefix removed)
    # returns:: the similarity; 0.0 or less indicates a match below the
    #           required threshold, 1.0 indicates text and target are
    #           identical
    def similarity(target)
      m = target.length
      n = @text.length

      if (n == 0)
        # we don't have anything to compare. That means if we just add
        # the letters for m we get the new word
        return (@prefix_length == 0) ? 0.0 : 1.0 - (m.to_f / @prefix_length)
      end
      if (m == 0)
        return (@prefix_length == 0) ? 0.0 : 1.0 - (n.to_f / @prefix_length)
      end

      max_distance = max_distance(m)

      if (max_distance < (m - n).abs)
        # just adding the characters of m to n (or vice-versa) already
        # requires more edits than we allow - e.g. "pre" (3) vs
        # "prefixes" (8) needs at least 8-3 = 5 edits - so we can discard
        # this word without computing anything.
        return 0.0
      end

      # make sure we have enough room in our array for the calculation
      if (@d[0].length <= m)
        grow_distance_array(m)
      end

      # init matrix d
      (n + 1).times {|i| @d[i][0] = i}
      (m + 1).times {|j| @d[0][j] = j}

      # start computing edit distance
      1.upto(n) do |i|
        best_possible_edit_distance = m
        s_i = @text[i - 1]
        1.upto(m) do |j|
          if (s_i != target[j - 1])
            @d[i][j] = min(@d[i-1][j], @d[i][j-1], @d[i-1][j-1]) + 1
          else
            @d[i][j] = min(@d[i-1][j] + 1, @d[i][j-1] + 1, @d[i-1][j-1])
          end
          if @d[i][j] < best_possible_edit_distance
            best_possible_edit_distance = @d[i][j]
          end
        end

        # After calculating row i, the best possible edit distance is the
        # smallest value seen in that row. If it is already greater than
        # the max distance, no remaining row can recover, so abort.
        if (i > max_distance and best_possible_edit_distance > max_distance)
          # equal is okay, but not greater: the closest the target can be
          # to the text is just too far away, so it leaves the party early.
          return 0.0
        end
      end

      # this will return less than 0.0 when the edit distance is greater
      # than the number of characters in the shorter word, but this was
      # the formula previously used in FuzzyTermEnum, so it has not been
      # changed (even though minimum_similarity must be greater than 0.0)
      return 1.0 - (@d[n][m].to_f / (@prefix_length + (n < m ? n : m)))
    end

    # Grow the second dimension of the array, so that we can calculate
    # the Levenshtein difference for a longer target word.
    def grow_distance_array(m)
      @d = @d.map {Array.new(m + 1)}
    end

    # The max distance is the maximum Levenshtein distance for the text
    # compared to some other value that results in a score better than
    # the minimum similarity.
    # m:: the length of the "other value"
    # returns:: the maximum levenshtein distance that we care about
    #           (NOTE(review): a Float here, not floored to an Integer -
    #           only ever used in comparisons, so this is safe)
    def max_distance(m)
      if (m >= @max_distances.length)
        # cache misses beyond the precomputed range auto-grow the array
        @max_distances[m] = calculate_max_distance(m)
      end
      return @max_distances[m]
    end

    def initialize_max_distances()
      @max_distances.length.times do |i|
        @max_distances[i] = calculate_max_distance(i)
      end
    end

    def calculate_max_distance(m)
      return ((1 - @minimum_similarity) * ([@text.length, m].min + @prefix_length))
    end
  end
end
|