ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,11 @@
1
module Ferret::Search
  # Abstract base class providing a mechanism to restrict searches to a
  # subset of an index. Concrete filters override #bits to mark which
  # documents are allowed through.
  class Filter
    # Returns a BitSet with true for documents which should be permitted
    # in search results, and false for those that should not.
    #
    # reader:: the IndexReader the bit set is computed against.
    #
    # This base implementation always raises NotImplementedError; it
    # exists only to document the contract for subclasses.
    def bits(reader)
      raise NotImplementedError
    end
  end
end
@@ -0,0 +1,130 @@
1
module Ferret::Search
  # A query that applies a filter to the results of another query.
  #
  # NOTE: the bits are retrieved from the filter each time this query is
  # used in a search - use a CachingWrapperFilter to avoid regenerating
  # the bits every time.
  class FilteredQuery < Query
    attr_accessor :sub_query
    attr_reader :filter

    # Constructs a new query which applies a filter to the results of the
    # original query.
    #
    # Filter#bits() will be called every time this query is used in a search.
    #
    # query::  Query to be filtered, cannot be +nil+.
    # filter:: Filter to apply to query results, cannot be +nil+.
    def initialize(query, filter)
      super()
      @sub_query = query
      @filter = filter
    end

    # Returns a Weight that applies the filter to the enclosed query's Weight.
    # This is accomplished by overriding the Scorer returned by the Weight.
    def create_weight(searcher)
      sub_weight = @sub_query.create_weight(searcher)
      similarity = @sub_query.similarity(searcher)
      return FilteredWeight.new(self, sub_weight, similarity)
    end

    # Scorer that delegates to the wrapped scorer, but forces the score of
    # any document rejected by the filter's bit set to 0.0.
    class FilteredScorer < Scorer
      def initialize(sub_scorer, bits, similarity)
        super(similarity)
        @sub_scorer = sub_scorer
        @bits = bits
      end

      # pass these methods through to the enclosed scorer
      def next?() return @sub_scorer.next?; end
      def doc() return @sub_scorer.doc; end
      def skip_to(i) return @sub_scorer.skip_to(i); end

      # if the document has been filtered out, set score to 0.0
      def score()
        return (@bits.get(@sub_scorer.doc) ? @sub_scorer.score() : 0.0)
      end

      # add an explanation about whether the document was filtered
      def explain(i)
        exp = @sub_scorer.explain(i)
        if @bits.get(i)
          exp.description = "allowed by filter: #{exp.description}"
        else
          exp.description = "removed by filter: #{exp.description}"
        end
        return exp
      end
    end

    # Weight wrapper that combines the sub-query's weight with the
    # enclosing FilteredQuery's filter.
    class FilteredWeight < Weight
      attr_reader :query

      def initialize(query, sub_weight, similarity)
        @query = query
        @sub_weight = sub_weight
        @similarity = similarity
      end

      # pass these methods through to enclosed query's weight
      def value()
        return @sub_weight.value
      end

      def sum_of_squared_weights()
        return @sub_weight.sum_of_squared_weights
      end

      def normalize(v)
        return @sub_weight.normalize(v)
      end

      def explain(ir, i)
        return @sub_weight.explain(ir, i)
      end

      # return a scorer that overrides the enclosed query's score if
      # the given hit has been filtered out.
      def scorer(reader)
        scorer = @sub_weight.scorer(reader)
        bits = @query.filter.bits(reader)
        return FilteredScorer.new(scorer, bits, @similarity)
      end
    end

    # Rewrites the wrapped query. Returns a clone wrapping the rewritten
    # sub-query if rewriting changed it, otherwise returns self.
    def rewrite(reader)
      rewritten = @sub_query.rewrite(reader)
      if rewritten != @sub_query
        clone = self.clone()
        # BUGFIX: the writer generated by +attr_accessor :sub_query+ is
        # +sub_query=+; the original +clone.query = rewritten+ raised
        # NoMethodError whenever the sub-query was actually rewritten.
        clone.sub_query = rewritten
        return clone
      else
        return self
      end
    end

    # Delegates term extraction to the wrapped sub-query.
    def extract_terms(terms)
      @sub_query.extract_terms(terms)
    end

    # Prints a user-readable version of this query.
    def to_s(f = nil)
      return "filtered(#{@sub_query.to_s(f)})->#{@filter}"
    end

    # Returns true iff +o+ is equal to this.
    def eql?(o)
      return (o.instance_of?(FilteredQuery) and
              (@sub_query == o.sub_query) and (@filter == o.filter))
    end
    alias :== :eql?

    # Returns a hash code value for this object.
    def hash()
      return @sub_query.hash ^ @filter.hash
    end
  end
end
@@ -0,0 +1,79 @@
1
module Ferret::Search

  # Abstract class for enumerating a subset of all terms.
  #
  # Term enumerations are always ordered by Term.<=>(). Each term in
  # the enumeration is greater than all that precede it.
  class FilteredTermEnum < Ferret::Index::TermEnum

    # Returns the current Term in the enumeration.
    # Returns nil if no Term matches or all terms have been enumerated.
    attr_reader :term

    def initialize()
      @term = nil
      @enum = nil
      @reader = nil
    end

    # Equality compare on the term. Subclasses must implement this to
    # decide whether +term+ belongs in the enumeration.
    def term_compare(term)
      raise NotImplementedError
    end

    # Equality measure on the term. Subclasses must implement this.
    def difference()
      raise NotImplementedError
    end

    # Indicates the end of the enumeration has been reached.
    # Subclasses must implement this.
    def end_enum()
      raise NotImplementedError
    end

    # Installs the wrapped term enum and positions this enumeration on
    # the first term that satisfies term_compare (if any).
    def enum=(enum)
      @enum = enum
      # Find the first term that matches
      term = @enum.term()
      if (term != nil and term_compare(term))
        @term = term
      else
        next?
      end
    end

    # Returns the doc_freq of the current Term in the enumeration.
    # Returns -1 if no Term matches or all terms have been enumerated.
    def doc_freq()
      return -1 if @enum.nil?
      return @enum.doc_freq()
    end

    # Increments the enumeration to the next element. True if one exists.
    def next?()
      return false if @enum.nil? # enum not initialized
      @term = nil
      while @term.nil?
        # stop when the subclass signals the end, or the underlying enum
        # is exhausted
        return false if end_enum() or !@enum.next?
        term = @enum.term()
        if term_compare(term)
          @term = term
          return true
        end
      end
      # not reached: the loop above always exits via one of the returns
      # (the original trailing "@term = nil; return false" was dead code)
    end

    # Closes the enumeration to further activity, freeing resources.
    def close()
      @enum.close()
      @term = nil
      @enum = nil
    end
  end
end
@@ -0,0 +1,153 @@
1
module Ferret::Search
  # Implements the fuzzy search query. The similarity measurement
  # is based on the Levenshtein (edit distance) algorithm.
  class FuzzyQuery < MultiTermQuery
    @@default_min_similarity = 0.5
    @@default_prefix_length = 0

    # Returns the default minimum similarity used when none is given to new().
    def FuzzyQuery.default_min_similarity()
      return @@default_min_similarity
    end

    # Sets the default minimum similarity.
    # raises:: ArgumentError unless 0.0 <= minimum_similarity < 1.0
    def FuzzyQuery.default_min_similarity=(minimum_similarity)
      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity cannot be less than 0"
      end
      @@default_min_similarity = minimum_similarity
    end

    # Returns the default non-fuzzy prefix length.
    def FuzzyQuery.default_prefix_length()
      return @@default_prefix_length
    end

    # Sets the default non-fuzzy prefix length.
    # raises:: ArgumentError if prefix_length < 0
    def FuzzyQuery.default_prefix_length=(prefix_length)
      if (prefix_length < 0)
        raise ArgumentError, "prefix_length cannot be less than 0"
      end
      @@default_prefix_length = prefix_length
    end

    attr_reader :prefix_length, :minimum_similarity

    # Create a new FuzzyQuery that will match terms with a similarity
    # of at least +minimum_similarity+ to +term+.
    # If a +prefix_length+ > 0 is specified, a common prefix
    # of that length is also required.
    #
    # term:: the term to search for
    # minimum_similarity:: a value between 0 and 1 to set the required
    #                      similarity between the query term and the matching
    #                      terms. For example, for a +minimum_similarity+ of
    #                      <tt>0.5</tt> a term of the same length as the query
    #                      term is considered similar to the query term if the
    #                      edit distance between both terms is less than
    #                      <tt>length(term)*0.5</tt>
    # prefix_length:: length of common (non-fuzzy) prefix. This is the
    #                 number of characters at the start of a term that
    #                 must be identical (not fuzzy) to the query term if the
    #                 query is to match that term.
    # raises:: ArgumentError if minimum_similarity is >= 1 or < 0
    #          or if prefix_length < 0
    def initialize(term,
                   minimum_similarity = @@default_min_similarity,
                   prefix_length = @@default_prefix_length)
      super(term)

      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity >= 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity < 0"
      end

      if (prefix_length < 0)
        raise ArgumentError, "prefix_length < 0"
      end

      @minimum_similarity = minimum_similarity
      @prefix_length = prefix_length
    end

    # Returns the FuzzyTermEnum used to enumerate terms similar to @term.
    def get_term_enum(reader)
      return FuzzyTermEnum.new(reader, @term, @minimum_similarity, @prefix_length)
    end

    # Rewrites this query into a BooleanQuery of TermQueries over the
    # best-matching terms, each boosted by its similarity score.
    def rewrite(reader)
      fuzzy_enum = get_term_enum(reader)
      max_clause_count = BooleanQuery.max_clause_count
      st_queue = ScoreTermQueue.new(max_clause_count)

      begin
        begin
          min_score = 0.0
          score = 0.0
          t = fuzzy_enum.term()
          if t
            score = fuzzy_enum.difference()

            # terms come in alphabetical order, therefore if queue is full
            # and score not bigger than min_score, we can skip
            if (st_queue.size < max_clause_count or score > min_score)
              st_queue.insert(ScoreTerm.new(t, score))
              min_score = st_queue.top.score # maintain min_score
            end
          end
        end while fuzzy_enum.next?
      ensure
        fuzzy_enum.close()
      end

      bq = BooleanQuery.new(true)
      st_queue.size.times do |i|
        st = st_queue.pop()
        tq = TermQuery.new(st.term)                    # found a match
        tq.boost = boost() * st.score                  # set the boost
        bq.add_query(tq, BooleanClause::Occur::SHOULD) # add to query
      end

      return bq
    end

    # Prints a user-readable version of this query.
    def to_s(field = nil)
      buffer = ""
      buffer << "#{@term.field}:" if @term.field != field
      buffer << "#{@term.text}~#{minimum_similarity}"
      buffer << "^#{boost()}" if (boost() != 1.0)
      return buffer
    end

    # Simple (term, score) pair stored in the ScoreTermQueue.
    class ScoreTerm
      attr_accessor :term, :score

      def initialize(term, score)
        @term = term
        @score = score
      end
    end

    # Priority queue keeping the lowest-scoring term on top so that it can
    # be displaced when a better match arrives.
    class ScoreTermQueue < Ferret::Utils::PriorityQueue
      # See PriorityQueue#less_than(o1, o2)
      def less_than(st1, st2)
        # BUGFIX: the original compared st1.score with itself, which is
        # always true, so ordering by score never happened at all.
        if (st1.score == st2.score)
          return st1.term > st2.term
        else
          return st1.score < st2.score
        end
      end
    end

    # Returns true iff +o+ is equal to this.
    def eql?(o)
      # BUGFIX: the original referenced an undefined local +fuzzyQuery+
      # (raising NameError); the comparison must be against +o+.
      return (o.instance_of?(FuzzyQuery) and super(o) and
              (@minimum_similarity == o.minimum_similarity) and
              (@prefix_length == o.prefix_length))
    end
    alias :== :eql?

    # Returns a hash code value for this object.
    def hash()
      return super ^ @minimum_similarity.hash ^ @prefix_length.hash
    end
  end
end
@@ -0,0 +1,244 @@
1
module Ferret::Search
  # Subclass of FilteredTermEnum for enumerating all terms that are similar
  # to the specified filter term.
  #
  # Term enumerations are always ordered by Term.<=>(). Each term in
  # the enumeration is greater than all that precede it.
  class FuzzyTermEnum < FilteredTermEnum
    include Ferret::Index
    attr_reader :end_enum

    # This should be somewhere around the average long word.
    # If it is longer, we waste time and space. If it is shorter, we waste a
    # little bit of time growing the array as we encounter longer words.
    TYPICAL_LONGEST_WORD_IN_INDEX = 19

    # Constructor for enumeration of all terms from specified +reader+ which
    # share a prefix of length +prefix_length+ with +term+ and which have a
    # fuzzy similarity > +min_similarity+.
    #
    # After calling the constructor the enumeration is already pointing to the
    # first valid term if such a term exists.
    #
    # reader:: Delivers terms.
    # term:: Pattern term.
    # min_similarity:: Minimum required similarity for terms from the reader.
    #                  Default value is 0.5.
    # prefix_length:: Length of required common prefix. Default value is 0.
    # raises:: ArgumentError if minimum_similarity is >= 1 or < 0
    #          or if prefix_length < 0
    def initialize(reader, term,
                   minimum_similarity = FuzzyQuery.default_min_similarity,
                   prefix_length = FuzzyQuery.default_prefix_length)
      super()

      @reader = reader
      @end_enum = false
      @max_distances = Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)

      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity cannot be less than 0"
      end
      if (prefix_length < 0)
        raise ArgumentError, "prefix_length cannot be less than 0"
      end

      @minimum_similarity = minimum_similarity
      @scale_factor = 1.0 / (1.0 - @minimum_similarity)
      @search_term = term
      @field = @search_term.field

      # The prefix could be longer than the word.
      # It's kind of silly though. It means we must match the entire word.
      term_length = @search_term.text.length
      if prefix_length > term_length
        @prefix_length = term_length
      else
        @prefix_length = prefix_length
      end

      @text = @search_term.text[@prefix_length..-1]
      @prefix = @search_term.text[0, @prefix_length]

      initialize_max_distances()

      # Allows us to save the time required to create a new array
      # every time similarity is called.
      @d = init_distance_array()

      self.enum = reader.terms_from(Term.new(@search_term.field, @prefix))
    end

    # The term_compare method in FuzzyTermEnum uses Levenshtein distance to
    # calculate the distance between the given term and the comparing term.
    def term_compare(term)
      if (@field == term.field and term.text[0, @prefix_length] == @prefix)
        target = term.text[@prefix_length..-1]
        @similarity = similarity(target)
        return (@similarity > @minimum_similarity)
      end
      # terms are sorted, so once the field or prefix no longer matches we
      # are past all possible matches
      @end_enum = true
      return false
    end

    # Scaled difference between the last compared term's similarity and the
    # minimum similarity (used as the term's score).
    def difference()
      return (@scale_factor * (@similarity - @minimum_similarity))
    end

    # ****************************
    # Compute Levenshtein distance
    # ****************************

    # Finds and returns the smallest of three integers
    def min(a, b, c)
      t = (a < b) ? a : b
      return (t < c) ? t : c
    end

    # Builds the (text.length + 1) x TYPICAL_LONGEST_WORD_IN_INDEX distance
    # matrix reused by every similarity() call.
    def init_distance_array()
      return Array.new(@text.length() + 1) {Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)}
    end

    # Similarity returns a number that is 1.0 or less (including negative
    # numbers) based on how similar the Term is compared to a target term. It
    # returns exactly 0.0 when
    #
    #   edit_distance < maximum_edit_distance
    #
    # Otherwise it returns:
    #
    #   1 - (edit_distance / length)
    #
    # where length is the length of the shortest term (text or target)
    # including a prefix that are identical and edit_distance is the
    # Levenshtein distance for the two words.
    #
    # Embedded within this algorithm is a fail-fast Levenshtein distance
    # algorithm. The fail-fast algorithm differs from the standard
    # Levenshtein distance algorithm in that it is aborted if it is discovered
    # that the minimum distance between the words is greater than some
    # threshold.
    #
    # To calculate the maximum distance threshold we use the following formula:
    #
    #   (1 - minimum_similarity) * length
    #
    # where length is the shortest term including any prefix that is not part
    # of the similarity comparison. This formula was derived by solving for
    # what maximum value of distance returns false for the following
    # statements:
    #
    #   similarity = 1 - (distance / (prefix_length + [textlen, targetlen].min))
    #   return (similarity > minimum_similarity)
    #
    # where distance is the Levenshtein distance for the two words.
    #
    # Levenshtein distance (also known as edit distance) is a measure of
    # similarity between two strings where the distance is measured as the
    # number of character deletions, insertions or substitutions required to
    # transform one string to the other string.
    #
    # target:: the target word or phrase
    # returns:: the similarity, 0.0 or less indicates that it matches less
    #           than the required threshold and 1.0 indicates that the text and
    #           target are identical
    def similarity(target)
      m = target.length
      n = @text.length

      if (n == 0)
        # we don't have anything to compare. That means if we just add the
        # letters for m we get the new word
        return (@prefix_length == 0) ? 0.0 : 1.0 - (m.to_f / @prefix_length)
      end
      if (m == 0)
        return (@prefix_length == 0) ? 0.0 : 1.0 - (n.to_f / @prefix_length)
      end

      max_distance = max_distance(m)

      if (max_distance < (m-n).abs)
        # just adding the characters of m to n or vice-versa results in
        # too many edits.
        # For example "pre" length is 3 and "prefixes" length is 8. We can
        # see that, given this optimal circumstance, the edit distance cannot
        # be less than 5, which is 8-3, or more precisely (3-8).abs.
        # If our maximum edit distance is 4, then we can discard this word
        # without looking at it.
        return 0.0
      end

      # let's make sure we have enough room in our array to do the
      # distance calculations.
      if (@d[0].length <= m)
        grow_distance_array(m)
      end

      # init matrix d
      (n+1).times {|i| @d[i][0] = i}
      (m+1).times {|j| @d[0][j] = j}

      # start computing edit distance
      1.upto(n) do |i|
        best_possible_edit_distance = m
        s_i = @text[i-1]
        1.upto(m) do |j|
          if (s_i != target[j-1])
            @d[i][j] = min(@d[i-1][j], @d[i][j-1], @d[i-1][j-1])+1
          else
            @d[i][j] = min(@d[i-1][j]+1, @d[i][j-1]+1, @d[i-1][j-1])
          end
          if @d[i][j] < best_possible_edit_distance
            best_possible_edit_distance = @d[i][j]
          end
        end

        # After calculating row i, the best possible edit distance can be
        # found by finding the smallest value in a given column.
        # If the best_possible_edit_distance is greater than the max distance,
        # abort.
        if (i > max_distance and best_possible_edit_distance > max_distance)
          # equal is okay, but not greater
          # the closest the target can be to the text is just too far away.
          # this target is leaving the party early.
          return 0.0
        end
      end

      # this will return less than 0.0 when the edit distance is
      # greater than the number of characters in the shorter word.
      # but this was the formula that was previously used in FuzzyTermEnum,
      # so it has not been changed (even though minimum_similarity must be
      # greater than 0.0)
      return 1.0 - (@d[n][m].to_f / (@prefix_length + (n < m ? n : m)))
    end

    # Grow the second dimension of the array, so that we can calculate the
    # Levenshtein difference.
    def grow_distance_array(m)
      @d = @d.map {Array.new(m+1)}
    end

    # The max distance is the maximum Levenshtein distance for the text
    # compared to some other value that results in a score that is
    # better than the minimum similarity.
    # m:: the length of the "other value"
    # returns:: the maximum Levenshtein distance that we care about
    def max_distance(m)
      # BUGFIX: the original only computed the value when the array was too
      # short (m >= @max_distances.length). Growing the array to index m
      # leaves nil holes at the intermediate indices, so a later call with
      # one of those lengths returned nil and crashed the comparison in
      # similarity(). Memoizing per-slot with ||= fills holes on demand.
      @max_distances[m] ||= calculate_max_distance(m)
    end

    # Pre-computes the distance thresholds for all typical word lengths.
    def initialize_max_distances()
      @max_distances.length.times do |i|
        @max_distances[i] = calculate_max_distance(i)
      end
    end

    # Threshold formula: (1 - minimum_similarity) * (shortest length
    # including the shared prefix).
    def calculate_max_distance(m)
      return ((1-@minimum_similarity) * ([@text.length, m].min + @prefix_length))
    end
  end
end