ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,11 @@
1
+ module Ferret::Search
2
+ # Abstract base class providing a mechanism to restrict searches to a subset
3
+ # of an index.
4
+ class Filter
5
+ # Returns a BitSet with true for documents which should be permitted in
6
+ # search results, and false for those that should not.
7
+ def bits(reader)
8
+ raise NotImplementedError
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,130 @@
1
module Ferret::Search
  # A query that applies a filter to the results of another query.
  #
  # Note: the bits are retrieved from the filter each time this
  # query is used in a search - use a CachingWrapperFilter to avoid
  # regenerating the bits every time.
  class FilteredQuery < Query
    attr_accessor :sub_query
    attr_reader :filter

    # Constructs a new query which applies a filter to the results of the
    # original query.
    #
    # Filter.bits() will be called every time this query is used in a search.
    #
    # query::  Query to be filtered, cannot be +nil+.
    # filter:: Filter to apply to query results, cannot be +nil+.
    def initialize(query, filter)
      super()
      @sub_query = query
      @filter = filter
    end

    # Returns a Weight that applies the filter to the enclosed query's Weight.
    # This is accomplished by overriding the Scorer returned by the Weight.
    def create_weight(searcher)
      sub_weight = @sub_query.create_weight(searcher)
      similarity = @sub_query.similarity(searcher)
      return FilteredWeight.new(self, sub_weight, similarity)
    end

    # Scorer that delegates to the wrapped query's scorer but forces the
    # score of any document rejected by the filter to 0.0.
    class FilteredScorer < Scorer
      # sub_scorer:: scorer of the wrapped (unfiltered) query.
      # bits::       bit set from Filter#bits - true means "keep document".
      def initialize(sub_scorer, bits, similarity)
        super(similarity)
        @sub_scorer = sub_scorer
        @bits = bits
      end

      # pass these methods through to the enclosed scorer
      def next?() return @sub_scorer.next?; end
      def doc() return @sub_scorer.doc; end
      def skip_to(i) return @sub_scorer.skip_to(i); end

      # if the document has been filtered out, set score to 0.0
      def score()
        return (@bits.get(@sub_scorer.doc) ? @sub_scorer.score() : 0.0)
      end

      # add an explanation about whether the document was filtered
      def explain(i)
        exp = @sub_scorer.explain(i)
        if (@bits.get(i))
          exp.description = "allowed by filter: #{exp.description}"
        else
          exp.description = "removed by filter: #{exp.description}"
        end
        return exp
      end
    end

    # Weight wrapping the sub-query's weight; produces FilteredScorer
    # instances so filtered-out hits score 0.0.
    class FilteredWeight < Weight
      attr_reader :query

      def initialize(query, sub_weight, similarity)
        @query = query
        @sub_weight = sub_weight
        @similarity = similarity
      end

      # pass these methods through to enclosed query's weight
      def value()
        return @sub_weight.value
      end

      def sum_of_squared_weights()
        return @sub_weight.sum_of_squared_weights
      end

      def normalize(v)
        return @sub_weight.normalize(v)
      end

      def explain(ir, i)
        return @sub_weight.explain(ir, i)
      end

      # return a scorer that overrides the enclosed query's score if
      # the given hit has been filtered out.
      def scorer(reader)
        scorer = @sub_weight.scorer(reader)
        bits = @query.filter.bits(reader)
        return FilteredScorer.new(scorer, bits, @similarity)
      end
    end

    # Rewrites the wrapped query. Returns a clone of self wrapping the
    # rewritten sub-query when rewriting changed it, otherwise returns self.
    def rewrite(reader)
      rewritten = @sub_query.rewrite(reader)
      if (rewritten != @sub_query)
        clone = self.clone()
        # FIX: the writer generated by +attr_accessor :sub_query+ is
        # +sub_query=+; the original called the non-existent +query=+,
        # raising NoMethodError whenever the sub-query was rewritten.
        clone.sub_query = rewritten
        return clone
      else
        return self
      end
    end

    # inherit javadoc
    def extract_terms(terms)
      @sub_query.extract_terms(terms)
    end

    # Prints a user-readable version of this query.
    def to_s(f = nil)
      return "filtered(#{@sub_query.to_s(f)})->#{@filter}"
    end

    # Returns true iff +o+ is equal to this.
    def eql?(o)
      return (o.instance_of?(FilteredQuery) and
        (@sub_query == o.sub_query) and (@filter == o.filter))
    end
    alias :== :eql?

    # Returns a hash code value for this object.
    def hash()
      return @sub_query.hash ^ @filter.hash
    end
  end
end
@@ -0,0 +1,79 @@
1
module Ferret::Search

  # Abstract class for enumerating a subset of all terms.
  #
  # Term enumerations are always ordered by Term.<=>(). Each term in
  # the enumeration is greater than all that precede it.
  class FilteredTermEnum < Ferret::Index::TermEnum

    # Returns the current Term in the enumeration.
    # Returns nil if no Term matches or all terms have been enumerated.
    attr_reader :term

    def initialize()
      @term = nil
      @enum = nil
      @reader = nil
    end

    # Equality compare on the term. Subclasses return true if +term+ belongs
    # to the enumerated subset.
    def term_compare(term)
      raise NotImplementedError
    end

    # Equality measure on the term (how closely the current term matches).
    def difference()
      raise NotImplementedError
    end

    # Indicates the end of the enumeration has been reached.
    def end_enum()
      raise NotImplementedError
    end

    # Sets the underlying TermEnum and positions this enumeration on the
    # first matching term, if any.
    def enum=(enum)
      @enum = enum
      # Find the first term that matches
      term = @enum.term()
      if (term != nil and term_compare(term))
        @term = term
      else
        next?
      end
    end

    # Returns the doc_freq of the current Term in the enumeration.
    # Returns -1 if no Term matches or all terms have been enumerated.
    def doc_freq()
      if (@enum == nil)
        return -1
      end
      return @enum.doc_freq()
    end

    # Increments the enumeration to the next element. True if one exists.
    def next?()
      return false if (@enum == nil) # enum not initialized
      @term = nil
      while @term.nil?
        if end_enum() or ! @enum.next?
          return false
        end
        term = @enum.term()
        if (term_compare(term))
          @term = term
          return true
        end
      end
      # FIX: the original had "@term = nil; return false" here, but every
      # path through the loop body returns, so that code was unreachable.
    end

    # Closes the enumeration to further activity, freeing resources.
    def close()
      @enum.close()
      @term = nil
      @enum = nil
    end
  end
end
@@ -0,0 +1,153 @@
1
module Ferret::Search
  # Implements the fuzzy search query. The similiarity measurement
  # is based on the Levenshtein (distance) algorithm.
  class FuzzyQuery < MultiTermQuery
    @@default_min_similarity = 0.5
    @@default_prefix_length = 0

    # Returns the global default minimum similarity (0.5 unless changed).
    def FuzzyQuery.default_min_similarity()
      return @@default_min_similarity
    end

    # Sets the global default minimum similarity.
    # Raises ArgumentError unless 0.0 <= minimum_similarity < 1.0.
    def FuzzyQuery.default_min_similarity=(minimum_similarity)
      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity cannot be less than 0"
      end
      @@default_min_similarity = minimum_similarity
    end

    # Returns the global default non-fuzzy prefix length (0 unless changed).
    def FuzzyQuery.default_prefix_length()
      return @@default_prefix_length
    end

    # Sets the global default non-fuzzy prefix length.
    # Raises ArgumentError if prefix_length is negative.
    def FuzzyQuery.default_prefix_length=(prefix_length)
      if (prefix_length < 0)
        raise ArgumentError, "prefix_length cannot be less than 0"
      end
      @@default_prefix_length = prefix_length
    end


    attr_reader :prefix_length, :minimum_similarity
    # Create a new FuzzyQuery that will match terms with a similarity
    # of at least +minimum_similarity+ to +term+.
    # If a +prefix_length+ > 0 is specified, a common prefix
    # of that length is also required.
    #
    # term:: the term to search for
    # minimum_similarity:: a value between 0 and 1 to set the required
    #                      similarity between the query term and the matching
    #                      terms. For example, for a +minimum_similarity+ of
    #                      <tt>0.5</tt> a term of the same length as the query
    #                      term is considered similar to the query term if the
    #                      edit distance between both terms is less than
    #                      <tt>length(term)*0.5</tt>
    # prefix_length:: length of common (non-fuzzy) prefix. This is the
    #                 number of characters at the start of a term that
    #                 must be identical (fuzzy) to the query term if the
    #                 query is to match that term.
    # raises:: ArgumentError if minimum_similarity is >= 1 or < 0
    #          or if prefix_length < 0
    def initialize(term,
                   minimum_similarity = @@default_min_similarity,
                   prefix_length = @@default_prefix_length)
      super(term)

      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity >= 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity < 0"
      end

      if (prefix_length < 0)
        raise ArgumentError, "prefix_length < 0"
      end

      @minimum_similarity = minimum_similarity
      @prefix_length = prefix_length
    end

    # Returns the FuzzyTermEnum used to enumerate candidate terms.
    def get_term_enum(reader)
      return FuzzyTermEnum.new(reader, @term, @minimum_similarity, @prefix_length)
    end

    # Rewrites this query to a BooleanQuery of TermQuerys, one for each of
    # the best-matching (up to BooleanQuery.max_clause_count) similar terms.
    def rewrite(reader)
      fuzzy_enum = get_term_enum(reader)
      max_clause_count = BooleanQuery.max_clause_count
      st_queue = ScoreTermQueue.new(max_clause_count)

      begin
        # FIX: min_score was reset to 0.0 inside the loop in the original,
        # which defeated the "skip when queue is full and score too low"
        # optimization described below. It must persist across iterations.
        min_score = 0.0
        begin
          score = 0.0
          t = fuzzy_enum.term()
          if t
            score = fuzzy_enum.difference()

            # terms come in alphabetical order, therefore if queue is full and score
            # not bigger than min_score, we can skip
            if(st_queue.size < max_clause_count or score > min_score)
              st_queue.insert(ScoreTerm.new(t, score))
              min_score = st_queue.top.score # maintain min_score
            end
          end
        end while fuzzy_enum.next?
      ensure
        fuzzy_enum.close()
      end

      bq = BooleanQuery.new(true)
      st_queue.size.times do |i|
        st = st_queue.pop()
        tq = TermQuery.new(st.term)                    # found a match
        tq.boost = boost() * st.score                  # set the boost
        bq.add_query(tq, BooleanClause::Occur::SHOULD) # add to query
      end

      return bq
    end

    def to_s(field = nil)
      buffer = ""
      buffer << "#{@term.field}:" if @term.field != field
      buffer << "#{@term.text}~#{minimum_similarity}"
      buffer << "^#{boost()}" if (boost() != 1.0)
      return buffer
    end

    # Simple value pair of a matched term and its similarity score.
    class ScoreTerm
      attr_accessor :term, :score

      def initialize(term, score)
        @term = term
        @score = score
      end
    end

    # Priority queue ordering ScoreTerms worst-first so the queue top is the
    # current minimum score.
    class ScoreTermQueue < Ferret::Utils::PriorityQueue

      # See PriorityQueue#less_than(o1, o2)
      def less_than(st1, st2)
        # FIX: the original compared st1.score to itself (always equal),
        # so the score comparison branch was unreachable.
        if (st1.score == st2.score)
          return st1.term > st2.term
        else
          return st1.score < st2.score
        end
      end
    end

    def eql?(o)
      # FIX: the original referenced the undefined name +fuzzyQuery+ here,
      # raising NoMethodError; the comparison target is +o+.
      return (o.instance_of?(FuzzyQuery) and super(o) and
        (@minimum_similarity == o.minimum_similarity) and
        (@prefix_length == o.prefix_length))
    end
    alias :== :eql?

    def hash()
      return super ^ @minimum_similarity.hash ^ @prefix_length.hash
    end
  end
end
@@ -0,0 +1,244 @@
1
module Ferret::Search
  # Subclass of FilteredTermEnum for enumerating all terms that are similar
  # to the specified filter term.
  #
  # Term enumerations are always ordered by Term.compareTo().  Each term in
  # the enumeration is greater than all that precede it.
  class FuzzyTermEnum < FilteredTermEnum
    include Ferret::Index
    # true once the enumeration has moved past the last candidate term
    attr_reader :end_enum

    # This should be somewhere around the average long word.
    # If it is longer, we waste time and space. If it is shorter, we waste a
    # little bit of time growing the array as we encounter longer words.
    TYPICAL_LONGEST_WORD_IN_INDEX = 19

    # Constructor for enumeration of all terms from specified +reader+ which
    # share a prefix of length +prefix_length+ with +term+ and which have a
    # fuzzy similarity > +min_similarity+.
    #
    # After calling the constructor the enumeration is already pointing to the
    # first valid term if such a term exists.
    #
    # reader:: Delivers terms.
    # term:: Pattern term.
    # min_similarity:: Minimum required similarity for terms from the reader.
    #                  Default value is 0.5.
    # prefix_length:: Length of required common prefix. Default value is 0.
    #
    # raises:: ArgumentError unless 0.0 <= minimum_similarity < 1.0 and
    #          prefix_length >= 0
    def initialize(reader, term,
                   minimum_similarity = FuzzyQuery.default_min_similarity,
                   prefix_length = FuzzyQuery.default_prefix_length)
      super()

      @reader = reader
      @end_enum = false
      # memo table: @max_distances[m] caches calculate_max_distance(m)
      @max_distances = Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)


      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity cannot be less than 0"
      end
      if(prefix_length < 0)
        raise ArgumentError, "prefix_length cannot be less than 0"
      end

      @minimum_similarity = minimum_similarity
      # maps similarity in (minimum_similarity, 1.0] onto (0.0, 1.0] for
      # difference()
      @scale_factor = 1.0 / (1.0 - @minimum_similarity)
      @search_term = term
      @field = @search_term.field

      # The prefix could be longer than the word.
      # It's kind of silly though. It means we must match the entire word.
      term_length = @search_term.text.length
      if prefix_length > term_length
        @prefix_length = term_length
      else
        @prefix_length = prefix_length
      end

      # @text is the fuzzy-matched suffix; @prefix must match exactly
      @text = @search_term.text[@prefix_length..-1]
      @prefix = @search_term.text[0, @prefix_length]

      initialize_max_distances()

      # Allows us save time required to create a new array
      # everytime similarity is called.
      @d = init_distance_array()

      # position the enumeration on the first term >= the required prefix
      self.enum = reader.terms_from(Term.new(@search_term.field, @prefix))
    end

    # The term_compare method in FuzzyTermEnum uses Levenshtein distance to
    # calculate the distance between the given term and the comparing term.
    # Once a term with the wrong field or prefix is seen, the enumeration is
    # over (terms arrive in sorted order), so @end_enum is set.
    def term_compare(term)
      if (@field == term.field and term.text[0, @prefix_length] == @prefix)
        target = term.text[@prefix_length..-1]
        @similarity = similarity(target)
        return (@similarity > @minimum_similarity)
      end
      @end_enum = true
      return false
    end

    # Score of the current term, scaled into (0.0, 1.0] from the range
    # (minimum_similarity, 1.0]. Valid only after term_compare returned true.
    def difference()
      return (@scale_factor * (@similarity - @minimum_similarity))
    end

    # ****************************
    # Compute Levenshtein distance
    # ****************************

    # Finds and returns the smallest of three integers
    def min(a, b, c)
      t = (a < b) ? a : b
      return (t < c) ? t : c
    end

    # (text.length+1) x TYPICAL_LONGEST_WORD_IN_INDEX matrix, grown on
    # demand by grow_distance_array when a longer target is encountered
    def init_distance_array()
      return Array.new(@text.length() + 1) {Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)}
    end

    # Similarity returns a number that is 1.0 or less (including negative
    # numbers) based on how similar the Term is compared to a target term.  It
    # returns exactly 0.0 when
    #
    #    edit_distance < maximum_edit_distance
    #
    # Otherwise it returns:
    #
    #    1 - (edit_distance / length)
    #
    # where length is the length of the shortest term (text or target)
    # including a prefix that are identical and edit_distance is the
    # Levenshtein distance for the two words.
    #
    # Embedded within this algorithm is a fail-fast Levenshtein distance
    # algorithm.  The fail-fast algorithm differs from the standard
    # Levenshtein distance algorithm in that it is aborted if it is discovered
    # that the mimimum distance between the words is greater than some
    # threshold.
    #
    # To calculate the maximum distance threshold we use the following formula:
    #
    #    (1 - minimum_similarity) * length
    #
    # where length is the shortest term including any prefix that is not part
    # of the similarity comparision.  This formula was derived by solving for
    # what maximum value of distance returns false for the following
    # statements:
    #
    #    similarity = 1 - (distance / (prefix_length + [textlen, targetlen].min))
    #    return (similarity > minimum_similarity)
    #
    # where distance is the Levenshtein distance for the two words.
    #
    # Levenshtein distance (also known as edit distance) is a measure of
    # similiarity between two strings where the distance is measured as the
    # number of character deletions, insertions or substitutions required to
    # transform one string to the other string.
    #
    # target:: the target word or phrase
    # returns:: the similarity,  0.0 or less indicates that it matches less
    #           than the required threshold and 1.0 indicates that the text and
    #           target are identical
    def similarity(target)
      m = target.length
      n = @text.length

      if (n == 0)
        # we don't have anything to compare.  That means if we just add the
        # letters for m we get the new word
        return (@prefix_length == 0) ? 0.0 : 1.0 - (m.to_f / @prefix_length)
      end
      if (m == 0)
        return (@prefix_length == 0) ? 0.0 : 1.0 - (n.to_f / @prefix_length)
      end

      max_distance = max_distance(m)

      if (max_distance < (m-n).abs)
        #just adding the characters of m to n or vice-versa results in
        #too many edits
        #for example "pre" length is 3 and "prefixes" length is 8.  We can see that
        #given this optimal circumstance, the edit distance cannot be less than 5.
        #which is 8-3 or more precisesly Math.abs(3-8).
        #if our maximum edit distance is 4, then we can discard this word
        #without looking at it.
        return 0.0
      end

      #let's make sure we have enough room in our array to do the distance calculations.
      if (@d[0].length <= m)
        grow_distance_array(m)
      end

      # init matrix d
      (n+1).times {|i| @d[i][0] = i}
      (m+1).times {|j| @d[0][j] = j}

      # start computing edit distance
      1.upto(n) do |i|
        best_possible_edit_distance = m
        s_i = @text[i-1]
        1.upto(m) do |j|
          if (s_i != target[j-1])
            @d[i][j] = min(@d[i-1][j], @d[i][j-1], @d[i-1][j-1])+1
          else
            @d[i][j] = min(@d[i-1][j]+1, @d[i][j-1]+1, @d[i-1][j-1])
          end
          if @d[i][j] < best_possible_edit_distance
            best_possible_edit_distance = @d[i][j]
          end
        end

        # After calculating row i, the best possible edit distance can be
        # found by found by finding the smallest value in a given column.
        # If the best_possible_edit_distance is greater than the max distance,
        # abort.
        if (i > max_distance and best_possible_edit_distance > max_distance)
          # equal is okay, but not greater
          # the closest the target can be to the text is just too far away.
          # this target is leaving the party early.
          return 0.0
        end
      end

      # this will return less than 0.0 when the edit distance is
      # greater than the number of characters in the shorter word.
      # but this was the formula that was previously used in FuzzyTermEnum,
      # so it has not been changed (even though minimum_similarity must be
      # greater than 0.0)
      return 1.0 - (@d[n][m].to_f / (@prefix_length + (n < m ? n : m)))
    end

    # Grow the second dimension of the array, so that we can calculate the
    # Levenshtein difference.
    # NOTE(review): this discards previous row contents, which is fine
    # because similarity() re-initializes the matrix before each use.
    def grow_distance_array(m)
      @d = @d.map {Array.new(m+1)}
    end

    # The max Distance is the maximum Levenshtein distance for the text
    # compared to some other value that results in score that is
    # better than the minimum similarity.
    # m:: the length of the "other value"
    # returns:: the maximum levenshtein distance that we care about
    def max_distance(m)
      if (m >= @max_distances.length)
        # lazily extend the memo table for words longer than
        # TYPICAL_LONGEST_WORD_IN_INDEX
        @max_distances[m] = calculate_max_distance(m)
      end
      return @max_distances[m]
    end

    # Pre-computes max distances for all lengths up to the typical longest
    # word, so the common case never pays for calculate_max_distance.
    def initialize_max_distances()
      @max_distances.length.times do |i|
        @max_distances[i] = calculate_max_distance(i)
      end
    end

    # NOTE(review): returns a Float (not truncated to an integer as in the
    # Java original); callers only use it in numeric comparisons, so the
    # cutoff is slightly stricter but consistent - confirm intended.
    def calculate_max_distance(m)
      return ((1-@minimum_similarity) * ([@text.length, m].min + @prefix_length))
    end
  end
end