ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,11 +0,0 @@
1
module Ferret::Search
  # Base class for objects that limit a search to a subset of the documents
  # in an index. It defines the interface only; concrete filters must
  # override #bits.
  class Filter
    # Produces the set of permitted documents for +reader+.
    #
    # reader::  the index reader the bits are generated for
    # returns:: in subclasses, a bit set that is true for every document
    #           allowed in the search results and false for the rest
    # raises::  NotImplementedError always, in this base class
    def bits(reader)
      raise(NotImplementedError)
    end
  end
end
@@ -1,130 +0,0 @@
1
module Ferret::Search
  # A query that applies a filter to the results of another query.
  #
  # Note: the bits are retrieved from the filter each time a scorer is built
  # for this query (see FilteredWeight#scorer) - use a CachingWrapperFilter to
  # avoid regenerating the bits every time.
  class FilteredQuery < Query
    # The wrapped query whose hits are filtered.
    attr_accessor :sub_query
    # The filter applied to the hits of +sub_query+.
    attr_reader :filter

    # Constructs a new query which applies a filter to the results of the
    # original query.
    #
    # Filter.bits() will be called every time this query is used in a search.
    #
    # query::  Query to be filtered, cannot be +nil+.
    # filter:: Filter to apply to query results, cannot be +nil+.
    def initialize(query, filter)
      super()
      @sub_query = query
      @filter = filter
    end

    # Returns a Weight that applies the filter to the enclosed query's Weight.
    # This is accomplished by overriding the Scorer returned by the Weight.
    #
    # searcher:: passed through to the sub-query's create_weight and
    #            similarity
    def create_weight(searcher)
      sub_weight = @sub_query.create_weight(searcher)
      similarity = @sub_query.similarity(searcher)
      return FilteredWeight.new(self, sub_weight, similarity)
    end

    # Scorer that delegates iteration to the sub-query's scorer and zeroes the
    # score of every document the filter bits do not allow.
    class FilteredScorer < Scorer
      # sub_scorer:: scorer of the wrapped query
      # bits::       result of Filter#bits; must respond to get(doc)
      # similarity:: handed to the Scorer superclass
      def initialize(sub_scorer, bits, similarity)
        super(similarity)
        @sub_scorer = sub_scorer
        @bits = bits
      end

      # pass these methods through to the enclosed scorer
      def next?() return @sub_scorer.next?; end
      def doc() return @sub_scorer.doc; end
      def skip_to(i) return @sub_scorer.skip_to(i); end

      # if the document has been filtered out, set score to 0.0
      def score()
        return (@bits.get(@sub_scorer.doc) ? @sub_scorer.score() : 0.0)
      end

      # Returns the sub-scorer's explanation for document +i+ with its
      # description prefixed by whether the filter allowed or removed it.
      def explain(i)
        exp = @sub_scorer.explain(i)
        if (@bits.get(i))
          exp.description = "allowed by filter: #{exp.description}"
        else
          exp.description = "removed by filter: #{exp.description}"
        end
        return exp
      end
    end

    # Weight that forwards everything to the sub-query's weight except
    # #scorer, which wraps the sub-scorer in a FilteredScorer.
    class FilteredWeight < Weight
      attr_reader :query

      # query::      the owning FilteredQuery (its filter is read in #scorer)
      # sub_weight:: weight created by the wrapped query
      # similarity:: similarity handed to every FilteredScorer created
      def initialize(query, sub_weight, similarity)
        @query = query
        @sub_weight = sub_weight
        @similarity = similarity
      end

      # pass these methods through to enclosed query's weight
      def value()
        return @sub_weight.value
      end

      def sum_of_squared_weights()
        return @sub_weight.sum_of_squared_weights
      end

      def normalize(v)
        return @sub_weight.normalize(v)
      end

      def explain(ir, i)
        return @sub_weight.explain(ir, i)
      end

      # return a scorer that overrides the enclosed query's score if
      # the given hit has been filtered out.
      def scorer(reader)
        scorer = @sub_weight.scorer(reader)
        bits = @query.filter.bits(reader)
        return FilteredScorer.new(scorer, bits, @similarity)
      end
    end

    # Rewrites the wrapped query.
    #
    # Returns +self+ when the sub-query rewrites to an equal query, otherwise
    # a clone of this query wrapping the rewritten sub-query.
    def rewrite(reader)
      rewritten = @sub_query.rewrite(reader)
      if (rewritten != @sub_query)
        # The wrapped query is exposed through the sub_query accessor; this
        # class declares no +query=+ writer, so assigning through that name
        # would raise NoMethodError.
        copy = self.clone()
        copy.sub_query = rewritten
        return copy
      else
        return self
      end
    end

    # Delegates term extraction to the wrapped query.
    def extract_terms(terms)
      @sub_query.extract_terms(terms)
    end

    # Prints a user-readable version of this query.
    def to_s(f = nil)
      return "filtered(#{@sub_query.to_s(f)})->#{@filter}"
    end

    # Returns true iff +o+ is equal to this, i.e. +o+ is exactly a
    # FilteredQuery with an equal sub-query and an equal filter.
    def eql?(o)
      return (o.instance_of?(FilteredQuery) &&
              (@sub_query == o.sub_query) && (@filter == o.filter))
    end
    alias :== :eql?

    # Returns a hash code value for this object, consistent with #eql?.
    def hash()
      return @sub_query.hash ^ @filter.hash
    end
  end
end
@@ -1,79 +0,0 @@
1
module Ferret::Search

  # Abstract class for enumerating a subset of all terms.
  #
  # Term enumerations are always ordered by Term.<=>(). Each term in
  # the enumeration is greater than all that precede it.
  #
  # Subclasses supply #term_compare, #difference and #end_enum and then
  # assign the underlying enumeration through #enum=.
  class FilteredTermEnum < Ferret::Index::TermEnum

    # Returns the current Term in the enumeration.
    # Returns nil if no Term matches or all terms have been enumerated.
    attr_reader :term

    def initialize()
      @term = nil
      @enum = nil
      @reader = nil
    end

    # Subclass hook: decides whether +term+ belongs to the enumeration.
    # A truthy result makes #enum= and #next? accept the term.
    def term_compare(term)
      raise NotImplementedError
    end

    # Subclass hook: equality measure on the current term.
    def difference()
      raise NotImplementedError
    end

    # Subclass hook: indicates the end of the enumeration has been reached.
    # A truthy result makes #next? stop and return false.
    def end_enum()
      raise NotImplementedError
    end

    # Sets the underlying term enumeration and positions this enumeration on
    # the first matching term, if there is one.
    def enum=(enum)
      @enum = enum
      # Find the first term that matches
      first = @enum.term()
      if (!first.nil? and term_compare(first))
        @term = first
      else
        next?
      end
    end

    # Returns the doc_freq of the current Term in the enumeration.
    # Returns -1 if the underlying enumeration has not been set (or has been
    # closed).
    def doc_freq()
      return -1 if @enum.nil?
      return @enum.doc_freq()
    end

    # Increments the enumeration to the next element. True if one exists.
    # Leaves #term as nil when false is returned.
    def next?()
      return false if @enum.nil? # enum not initialized
      @term = nil
      # Walk the underlying enumeration until a term is accepted, the
      # subclass signals the end, or the underlying enumeration runs out.
      until end_enum() || !@enum.next?
        candidate = @enum.term()
        if (term_compare(candidate))
          @term = candidate
          return true
        end
      end
      return false
    end

    # Closes the enumeration to further activity, freeing resources.
    # Does nothing when there is no underlying enumeration, so that - like
    # #doc_freq and #next? - it is safe to call before #enum= or after an
    # earlier #close.
    def close()
      return if @enum.nil?
      @enum.close()
      @term = nil
      @enum = nil
    end
  end
end
@@ -1,154 +0,0 @@
1
module Ferret::Search
  # Implements the fuzzy search query. The similarity measurement
  # is based on the Levenshtein (distance) algorithm.
  class FuzzyQuery < MultiTermQuery
    @@default_min_similarity = 0.5
    @@default_prefix_length = 0

    # Returns the minimum similarity used when none is passed to #initialize.
    def FuzzyQuery.default_min_similarity()
      return @@default_min_similarity
    end

    # Sets the default minimum similarity.
    #
    # raises:: ArgumentError unless 0.0 <= minimum_similarity < 1.0
    def FuzzyQuery.default_min_similarity=(minimum_similarity)
      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity cannot be less than 0"
      end
      @@default_min_similarity = minimum_similarity
    end

    # Returns the prefix length used when none is passed to #initialize.
    def FuzzyQuery.default_prefix_length()
      return @@default_prefix_length
    end

    # Sets the default prefix length.
    #
    # raises:: ArgumentError if prefix_length < 0
    def FuzzyQuery.default_prefix_length=(prefix_length)
      if (prefix_length < 0)
        raise ArgumentError, "prefix_length cannot be less than 0"
      end
      @@default_prefix_length = prefix_length
    end


    attr_reader :prefix_length, :minimum_similarity
    # Create a new FuzzyQuery that will match terms with a similarity
    # of at least +minimum_similarity+ to +term+.
    # If a +prefix_length+ > 0 is specified, a common prefix
    # of that length is also required.
    #
    # term::               the term to search for
    # minimum_similarity:: a value between 0 and 1 to set the required
    #                      similarity between the query term and the matching
    #                      terms. For example, for a +minimum_similarity+ of
    #                      <tt>0.5</tt> a term of the same length as the query
    #                      term is considered similar to the query term if the
    #                      edit distance between both terms is less than
    #                      <tt>length(term)*0.5</tt>
    # prefix_length::      length of common (non-fuzzy) prefix. This is the
    #                      number of characters at the start of a term that
    #                      must be identical (not fuzzy) to the query term if
    #                      the query is to match that term.
    # raises::             ArgumentError if minimum_similarity is >= 1 or < 0
    #                      or if prefix_length < 0
    def initialize(term,
                   minimum_similarity = @@default_min_similarity,
                   prefix_length = @@default_prefix_length)
      super(term)

      if (minimum_similarity >= 1.0)
        raise ArgumentError, "minimum_similarity >= 1"
      elsif (minimum_similarity < 0.0)
        raise ArgumentError, "minimum_similarity < 0"
      end

      if (prefix_length < 0)
        raise ArgumentError, "prefix_length < 0"
      end

      @minimum_similarity = minimum_similarity
      @prefix_length = prefix_length
    end

    # Returns a FuzzyTermEnum over +reader+ for this query's term, minimum
    # similarity and prefix length.
    def get_term_enum(reader)
      return FuzzyTermEnum.new(reader, @term, @minimum_similarity, @prefix_length)
    end

    # Expands this query into a BooleanQuery of TermQuery clauses, one per
    # term kept from the fuzzy enumeration. Candidates are collected in a
    # ScoreTermQueue created with BooleanQuery.max_clause_count as its size.
    # Each clause is boosted by this query's boost times the term's score.
    def rewrite(reader)
      fuzzy_enum = get_term_enum(reader)
      max_clause_count = BooleanQuery.max_clause_count
      st_queue = ScoreTermQueue.new(max_clause_count)
      # Kept across iterations: once the queue is full this is the score a
      # candidate has to beat. Resetting it per term would make the skip test
      # below pass for every term with a positive score.
      min_score = 0.0

      begin
        # The enumeration is already positioned on its first term (if any),
        # so examine the current term before advancing.
        loop do
          t = fuzzy_enum.term()
          if t
            score = fuzzy_enum.difference()

            # terms come in alphabetical order, therefore if queue is full and score
            # not bigger than min_score, we can skip
            if (st_queue.size < max_clause_count or score > min_score)
              st_queue.insert(ScoreTerm.new(t, score))
              min_score = st_queue.top.score # maintain min_score
            end
          end
          break unless fuzzy_enum.next?
        end
      ensure
        fuzzy_enum.close()
      end

      bq = BooleanQuery.new(true)
      st_queue.size.times do
        st = st_queue.pop()
        tq = TermQuery.new(st.term)                    # found a match
        tq.boost = boost() * st.score                  # set the boost
        bq.add_query(tq, BooleanClause::Occur::SHOULD) # add to query
      end

      return bq
    end

    # Returns the query as "field:text~", followed by the minimum similarity
    # when it is not 0.5 and by "^boost" when the boost is not 1.0. The field
    # name is omitted when it equals +field+.
    def to_s(field = nil)
      buffer = ""
      buffer << "#{@term.field}:" if @term.field != field
      buffer << "#{@term.text}~"
      buffer << minimum_similarity.to_s if minimum_similarity != 0.5
      buffer << "^#{boost()}" if (boost() != 1.0)
      return buffer
    end

    # A term together with the score it got from the fuzzy enumeration.
    class ScoreTerm
      attr_accessor :term, :score

      def initialize(term, score)
        @term = term
        @score = score
      end
    end

    # Priority queue of ScoreTerm objects used by FuzzyQuery#rewrite.
    class ScoreTermQueue < Ferret::Utils::PriorityQueue

      # See PriorityQueue#less_than(o1, o2)
      #
      # Orders by score; when both scores are equal the greater term counts
      # as the lesser element.
      def less_than(st1, st2)
        if (st1.score == st2.score)
          return st1.term > st2.term
        else
          return st1.score < st2.score
        end
      end
    end

    # Returns true iff +o+ is exactly a FuzzyQuery that the superclass
    # considers equal and that has the same minimum similarity and prefix
    # length.
    def eql?(o)
      return (o.instance_of?(FuzzyQuery) && super(o) &&
              (@minimum_similarity == o.minimum_similarity) &&
              (@prefix_length == o.prefix_length))
    end
    alias :== :eql?

    # Returns a hash code consistent with #eql?.
    def hash()
      return super ^ @minimum_similarity.hash ^ @prefix_length.hash
    end
  end
end
@@ -1,247 +0,0 @@
1
- require 'monitor'
2
-
3
module Ferret::Search
  # FilteredTermEnum subclass that enumerates the terms which are similar to
  # a given search term, using Levenshtein (edit) distance as the measure.
  #
  # Terms are delivered in the order of the underlying term enumeration, so
  # each term is greater than all that precede it.
  class FuzzyTermEnum < FilteredTermEnum
    include MonitorMixin

    include Ferret::Index

    # True once a term with a different field or prefix has been seen (set by
    # #term_compare). FilteredTermEnum#next? stops when this is true.
    attr_reader :end_enum

    # Initial width of the cached tables; meant to be roughly the length of
    # an average long word. Larger wastes time and space, smaller costs a
    # little time growing the tables when longer words show up.
    TYPICAL_LONGEST_WORD_IN_INDEX = 19

    # Builds an enumeration over the terms of +reader+ that share a prefix of
    # length +prefix_length+ with +term+ and whose fuzzy similarity is
    # greater than +minimum_similarity+.
    #
    # When the constructor returns, the enumeration already points at the
    # first matching term if there is one.
    #
    # reader::             Delivers terms (via terms_from).
    # term::               Pattern term.
    # minimum_similarity:: Minimum required similarity for terms from the
    #                      reader. Defaults to
    #                      FuzzyQuery.default_min_similarity.
    # prefix_length::      Length of required common prefix. Defaults to
    #                      FuzzyQuery.default_prefix_length.
    # raises::             ArgumentError if minimum_similarity is >= 1 or < 0
    #                      or if prefix_length < 0
    def initialize(reader, term,
                   minimum_similarity = FuzzyQuery.default_min_similarity,
                   prefix_length = FuzzyQuery.default_prefix_length)
      super()

      @reader = reader
      @end_enum = false
      @max_distances = Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)

      raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1" if minimum_similarity >= 1.0
      raise ArgumentError, "minimum_similarity cannot be less than 0" if minimum_similarity < 0.0
      raise ArgumentError, "prefix_length cannot be less than 0" if prefix_length < 0

      @minimum_similarity = minimum_similarity
      @scale_factor = 1.0 / (1.0 - @minimum_similarity)
      @search_term = term
      @field = @search_term.field

      # A prefix longer than the word itself is clamped to the word length;
      # in that case the whole word has to match exactly.
      whole_text = @search_term.text
      @prefix_length = (prefix_length > whole_text.length) ? whole_text.length : prefix_length

      @text = whole_text[@prefix_length..-1]
      @prefix = whole_text[0, @prefix_length]

      initialize_max_distances()

      # Reused by every #similarity call so the matrix is not reallocated
      # each time.
      @d = init_distance_array()

      self.enum = reader.terms_from(Term.new(@search_term.field, @prefix))
    end

    # Accepts +term+ when it is in the searched field, starts with the
    # required prefix and the similarity of its remaining text is greater
    # than the minimum similarity. A term in another field or with another
    # prefix additionally marks the enumeration as finished.
    def term_compare(term)
      if @field != term.field or term.text[0, @prefix_length] != @prefix
        @end_enum = true
        return false
      end

      @similarity = similarity(term.text[@prefix_length..-1])
      return (@similarity > @minimum_similarity)
    end

    # Returns the last computed similarity rescaled so that the minimum
    # similarity maps to 0.0 and a similarity of 1.0 maps to 1.0.
    def difference()
      return (@scale_factor * (@similarity - @minimum_similarity))
    end

    # ****************************
    # Compute Levenshtein distance
    # ****************************

    # Returns the smallest of the three given values.
    def min(a, b, c)
      smallest = a
      smallest = b unless a < b
      smallest = c unless smallest < c
      return smallest
    end

    # Allocates the distance matrix: one row per character of the query text
    # plus one, each row TYPICAL_LONGEST_WORD_IN_INDEX wide.
    def init_distance_array()
      return (0..@text.length).map { Array.new(TYPICAL_LONGEST_WORD_IN_INDEX) }
    end

    # Returns a Float of at most 1.0 (possibly negative) describing how
    # similar +target+ is to the query text; both are compared without the
    # common prefix.
    #
    # The normal result is
    #
    #   1 - (edit_distance / length)
    #
    # where edit_distance is the Levenshtein distance between the two
    # strings and length is the prefix length plus the length of the shorter
    # of the two strings. Exactly 0.0 is returned instead as soon as it is
    # certain that
    #
    #   edit_distance > maximum_edit_distance
    #
    # This is a fail-fast variant of the Levenshtein algorithm: the
    # computation is abandoned when the length difference alone exceeds the
    # threshold, or when every entry of a finished matrix row exceeds it.
    #
    # The threshold is
    #
    #   (1 - minimum_similarity) * length
    #
    # which is the largest distance for which the following still holds:
    #
    #   similarity = 1 - (distance / (prefix_length + [textlen, targetlen].min))
    #   return (similarity > minimum_similarity)
    #
    # Levenshtein distance (also known as edit distance) is the number of
    # character deletions, insertions or substitutions needed to turn one
    # string into the other.
    #
    # When either string is empty the result is 0.0 for a zero prefix length,
    # otherwise 1 - (length of the other string / prefix_length).
    #
    # target::  the target word or phrase
    # returns:: the similarity; 0.0 or less means the target does not reach
    #           the required threshold, 1.0 means text and target are
    #           identical
    def similarity(target)
      synchronize do
        m = target.length
        n = @text.length

        # Nothing to compare on one side: turning one string into the other
        # just means adding all characters of the non-empty one.
        if n == 0
          return (@prefix_length == 0) ? 0.0 : 1.0 - (m.to_f / @prefix_length)
        end
        if m == 0
          return (@prefix_length == 0) ? 0.0 : 1.0 - (n.to_f / @prefix_length)
        end

        threshold = max_distance(m)

        # The edit distance can never be smaller than the difference in
        # length. E.g. "pre" (3) versus "prefixes" (8) needs at least 5
        # edits, so with a threshold of 4 the word is rejected unseen.
        return 0.0 if threshold < (m - n).abs

        # Make sure the matrix rows are wide enough for this target.
        grow_distance_array(m) if @d[0].length <= m

        # First column and first row: distance from/to the empty string.
        0.upto(n) { |row| @d[row][0] = row }
        0.upto(m) { |col| @d[0][col] = col }

        (1..n).each do |row|
          row_best = m
          text_char = @text[row - 1]

          (1..m).each do |col|
            # Deletion and insertion always cost one edit; the diagonal step
            # is free when the two characters are equal.
            substitution = (text_char == target[col - 1]) ? 0 : 1
            cell = min(@d[row - 1][col] + 1,
                       @d[row][col - 1] + 1,
                       @d[row - 1][col - 1] + substitution)
            @d[row][col] = cell
            row_best = cell if cell < row_best
          end

          # row_best is the smallest value in the finished row. Once even
          # that is above the threshold (equal is still fine) the target
          # cannot get close enough any more, so give up early.
          return 0.0 if row > threshold and row_best > threshold
        end

        # Negative when the edit distance exceeds the length used as the
        # divisor.
        return 1.0 - (@d[n][m].to_f / (@prefix_length + (n < m ? n : m)))
      end
    end

    # Replaces every row of the distance matrix with a fresh row of m + 1
    # entries so that a target of length +m+ fits. Old contents are dropped;
    # #similarity re-initialises the matrix before using it.
    def grow_distance_array(m)
      @d = Array.new(@d.length) { Array.new(m + 1) }
    end

    # Returns the largest Levenshtein distance between the query text and a
    # string of length +m+ that still scores better than the minimum
    # similarity. Results are cached per length.
    #
    # m::       the length of the "other value"
    # returns:: the maximum Levenshtein distance that we care about
    def max_distance(m)
      cached = @max_distances[m]
      return cached if cached
      return @max_distances[m] = calculate_max_distance(m)
    end

    # Fills the cache used by #max_distance for every length it currently
    # has a slot for.
    def initialize_max_distances()
      @max_distances.each_index do |len|
        @max_distances[len] = calculate_max_distance(len)
      end
    end

    # Computes (1 - minimum_similarity) * length, where length is the prefix
    # length plus the shorter of the query text length and +m+.
    def calculate_max_distance(m)
      shorter = (m < @text.length) ? m : @text.length
      return ((1-@minimum_similarity) * (shorter + @prefix_length))
    end
  end
end