ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,34 @@
1
+ module Ferret::Search
2
+ # Lower-level search API.
3
+ #
4
+ # HitCollectors are primarily meant to be used to implement queries, sorting
5
+ # and filtering.
6
+ #
7
+ # See Searcher#search(Query, HitCollector)
8
+ class HitCollector
9
+ # Called once for every non-zero scoring document, with the document number
10
+ # and its score.
11
+ #
12
+ # If, for example, an application wished to collect all of the hits for a
13
+ # query in a BitSet, then it might:
14
+ #
15
+ # searcher = IndexSearcher.new(index_reader)
16
+ # bits = BitSet.new(index_reader.max_doc())
17
+ # searcher.search(query, HitCollector.new()
18
+ # def collect(doc, score)
19
+ # bits.set(doc)
20
+ # end
21
+ # end
22
+ #
23
+ # NOTE: This is called in an inner search loop. For good search
24
+ # performance, implementations of this method should not call
25
+ # Searcher#doc(int) or IndexReader#document(int) on every document number
26
+ # encountered. Doing so can slow searches by an order of magnitude or more.
27
+ #
28
+ # NOTE: The +score+ passed to this method is a raw score. In other words,
29
+ # the score will not necessarily be a float whose value is between 0 and 1.
30
+ def collect(doc, score)
31
+ raise NotImplementedError
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,11 @@
1
+ module Ferret::Search
2
+ class HitQueue < Ferret::Utils::PriorityQueue
3
+ def less_than(hit1, hit2)
4
+ if (hit1.score == hit2.score)
5
+ return hit1.doc > hit2.doc
6
+ else
7
+ return hit1.score < hit2.score
8
+ end
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,173 @@
1
+ module Ferret::Search
2
+
3
+ # Implements search over a single IndexReader.
4
+ #
5
+ # Applications usually need only call the inherited @link #search(Query)end
6
+ # or @link #search(Query,Filter)endmethods. For performance reasons it is
7
+ # recommended to open only one IndexSearcher and use it for all of your searches.
8
+ class IndexSearcher
9
+ include Ferret::Index
10
+
11
+ attr_accessor :similarity, :reader
12
+
13
+ # Creates a searcher searching the index in the provided directory.
14
+ def initialize(arg)
15
+ if arg.is_a?(IndexReader)
16
+ @reader = arg
17
+ elsif arg.is_a?(Ferret::Store::Directory)
18
+ @reader = IndexReader.open(arg)
19
+ elsif arg.is_a?(String)
20
+ @dir = Ferret::Store::FSDirectory.open(arg)
21
+ @reader = IndexReader.open(@dir, true)
22
+ else
23
+ raise ArgumentError, "Unknown argument passed to initialize IndexReader"
24
+ end
25
+
26
+ @similarity = Similarity.default
27
+ end
28
+
29
+ # IndexSearcher was constructed with IndexSearcher(r).
30
+ # If the IndexReader was supplied implicitly by specifying a directory, then
31
+ # the IndexReader gets closed.
32
+ def close()
33
+ @reader.close()
34
+ end
35
+
36
+ # Expert: Returns the number of documents containing +term+.
37
+ # Called by search code to compute term weights.
38
+ # See IndexReader#doc_freq
39
+ def doc_freq(term)
40
+ return @reader.doc_freq(term)
41
+ end
42
+
43
+ # Expert: For each term in the terms array, calculates the number of
44
+ # documents containing +term+. Returns an array with these
45
+ # document frequencies. Used to minimize number of remote calls.
46
+ def doc_freqs(terms)
47
+ result = Array.new(terms.length)
48
+ terms.each_with_index {|term, i| result[i] = doc_freq(term)}
49
+ return result
50
+ end
51
+
52
+ # Expert: Returns the stored fields of document +i+.
53
+ # Called by HitCollector implementations.
54
+ # See IndexReader#get_document
55
+ def doc(i)
56
+ return @reader.document(i)
57
+ end
58
+
59
+ # Expert: Returns one greater than the largest possible document number.
60
+ # Called by search code to compute term weights.
61
+ # See IndexReader#max_doc
62
+ def max_doc()
63
+ return @reader.max_doc()
64
+ end
65
+
66
+ # Creates a weight for +query+
67
+ # returns:: new weight
68
+ def create_weight(query)
69
+ return query.weight(self)
70
+ end
71
+
72
+ # The main search method for the index. You need to create a query to
73
+ # pass to this method. You can also pass a hash with one or more of the
74
+ # following; {filter, num_docs, first_doc, sort}
75
+ #
76
+ # query:: the query to run on the index
77
+ # filter:: filters docs from the search result
78
+ # first_doc:: The index in the results of the first doc retrieved.
79
+ # Default is 0
80
+ # num_docs:: The number of results returned. Default is 10
81
+ # sort:: an array of SortFields describing how to sort the results.
82
+ def search(query, options = {})
83
+ filter = options[:filter]
84
+ first_doc = options[:first_doc]||0
85
+ num_docs = options[:num_docs]||10
86
+ sort = options[:sort]
87
+
88
+ if (num_docs <= 0) # nil might be returned from hq.top() below.
89
+ raise ArgumentError, "num_docs must be > 0 to run a search"
90
+ end
91
+
92
+ scorer = query.weight(self).scorer(@reader)
93
+ if (scorer == nil)
94
+ return TopDocs.new(0, [])
95
+ end
96
+
97
+ bits = (filter.nil? ? nil : filter.bits(@reader))
98
+ if (sort)
99
+ fields = sort.is_a?(Array) ? sort : sort.fields
100
+ hq = FieldSortedHitQueue.new(@reader, fields, num_docs + first_doc)
101
+ else
102
+ hq = HitQueue.new(num_docs + first_doc)
103
+ end
104
+ total_hits = 0
105
+ min_score = 0.0
106
+ scorer.each_hit() do |doc, score|
107
+ if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
108
+ total_hits += 1
109
+ if hq.size < num_docs or score >= min_score
110
+ hq.insert(ScoreDoc.new(doc, score))
111
+ min_score = hq.top.score # maintain min_score
112
+ end
113
+ end
114
+ end
115
+
116
+ score_docs = Array.new(hq.size)
117
+ if (hq.size > first_doc)
118
+ score_docs = Array.new(hq.size - first_doc)
119
+ first_doc.times { hq.pop }
120
+ (hq.size - 1).downto(0) do |i|
121
+ score_docs[i] = hq.pop
122
+ end
123
+ else
124
+ score_docs = []
125
+ hq.clear
126
+ end
127
+
128
+ return TopDocs.new(total_hits, score_docs)
129
+ end
130
+
131
+ # Accepts a block and iterates through all of results yielding the doc
132
+ # number and the score for that hit. The hits are unsorted. This is the
133
+ # fastest way to get all of the hits from a search. However, you will
134
+ # usually want your hits sorted at least by score so you should use the
135
+ # #search method.
136
+ def search_each(query, filter = nil)
137
+ scorer = query.weight(self).scorer(@reader)
138
+ return if scorer == nil
139
+ bits = (filter.nil? ? nil : filter.bits(@reader))
140
+ scorer.each_hit() do |doc, score|
141
+ if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
142
+ yield(doc, score)
143
+ end
144
+ end
145
+ end
146
+
147
+ # rewrites the query into a query that can be processed by the search
148
+ # methods. For example, a Fuzzy query is turned into a massive boolean
149
+ # query.
150
+ #
151
+ # original:: The original query to be rewritten.
152
+ def rewrite(original)
153
+ query = original
154
+ rewritten_query = query.rewrite(@reader)
155
+ while query != rewritten_query
156
+ query = rewritten_query
157
+ rewritten_query = query.rewrite(@reader)
158
+ end
159
+ return query
160
+ end
161
+
162
+ # Returns an Explanation that describes how +doc+ scored against
163
+ # +query+.
164
+ #
165
+ # This is intended to be used in developing Similarity implementations,
166
+ # and, for good performance, should not be displayed with every hit.
167
+ # Computing an explanation is as expensive as executing the query over the
168
+ # entire index.
169
+ def explain(query, doc)
170
+ return query.weight(self).explain(@reader, doc)
171
+ end
172
+ end
173
+ end
@@ -0,0 +1,104 @@
1
+ module Ferret::Search
2
+ # A query that matches all documents.
3
+ class MatchAllDocsQuery < Query
4
+
5
+ def initialize()
6
+ super
7
+ end
8
+
9
+ class MatchAllScorer < Scorer
10
+
11
+ def initialize(reader, similarity)
12
+ super(similarity)
13
+ @reader = reader
14
+ @count = -1
15
+ @max_doc = reader.max_doc
16
+ end
17
+
18
+ def doc()
19
+ return @count
20
+ end
21
+
22
+ def explain(doc)
23
+ return Explanation.new(1.0, "MatchAllDocsQuery")
24
+ end
25
+
26
+ def next?
27
+ while (@count < (@max_doc - 1))
28
+ @count += 1
29
+ if (!@reader.deleted?(@count))
30
+ return true
31
+ end
32
+ end
33
+ return false
34
+ end
35
+
36
+ def score()
37
+ return 1.0
38
+ end
39
+
40
+ def skip_to(target)
41
+ @count = target - 1
42
+ return next?
43
+ end
44
+ end
45
+
46
+ class MatchAllDocsWeight < Weight
47
+ attr_reader :query
48
+ def initialize(query, searcher)
49
+ @query = query
50
+ @searcher = searcher
51
+ end
52
+
53
+ def to_s()
54
+ return "weight(#{@query})"
55
+ end
56
+
57
+ def value()
58
+ return 1.0
59
+ end
60
+
61
+ def sum_of_squared_weights()
62
+ return 1.0
63
+ end
64
+
65
+ def normalize(query_norm)
66
+ end
67
+
68
+ def scorer(reader)
69
+ return MatchAllScorer.new(reader, @query.similarity(@searcher))
70
+ end
71
+
72
+ def explain(reader, doc)
73
+ # explain query weight
74
+ query_expl = Explanation.new(1.0, "MatchAllDocsQuery")
75
+ boost_expl = Explanation.new(@query.boost, "boost")
76
+ if (boost_expl.value != 1.0)
77
+ query_expl << boost_expl
78
+ query_expl.value = boost_expl.value
79
+ end
80
+
81
+ return query_expl
82
+ end
83
+ end
84
+
85
+ def create_weight(searcher)
86
+ return MatchAllDocsWeight.new(self, searcher)
87
+ end
88
+
89
+ def to_s(field)
90
+ buffer = "MatchAllDocsQuery"
91
+ buffer << "^#{boost}" if (boost() != 1.0)
92
+ return buffer
93
+ end
94
+
95
+ def eql?(o)
96
+ return (o.instance_of?(MatchAllDocsQuery) and boost == o.boost)
97
+ end
98
+ alias :== :eql?
99
+
100
+ def hash
101
+ return boost.hash
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,204 @@
1
+ module Ferret::Search
2
+ # MultiPhraseQuery is a generalized version of PhraseQuery, with an added
3
+ # method #add(Term[]).
4
+ #
5
+ # To use this class, to search for the phrase "Microsoft app*" first use
6
+ # add(Term) on the term "Microsoft", then find all terms that have "app" as
7
+ # prefix using IndexReader.terms(Term), and use MultiPhraseQuery.add(Term[]
8
+ # terms) to add them to the query.
9
+ #
10
+ # Author Anders Nielsen
11
+ class MultiPhraseQuery < Query
12
+ include Ferret::Index
13
+
14
+ attr_accessor :slop
15
+ attr_reader :positions, :term_arrays, :field
16
+
17
+ def initialize()
18
+ super()
19
+ @slop = 0
20
+ @term_arrays = []
21
+ @positions = []
22
+ @field = nil
23
+ end
24
+
25
+ # Allows to specify the relative position of terms within the phrase.
26
+ #
27
+ # See PhraseQuery#add(Term, int)
28
+ # terms:: the array of terms to search for or a single term
29
+ # position:: the position to search for these terms
30
+ def add(terms, position = nil, pos_inc = 1)
31
+ if position.nil?
32
+ position = (@positions.size > 0) ? (@positions[-1] + pos_inc) : 0
33
+ end
34
+
35
+ if terms.instance_of?(Term)
36
+ terms = [terms]
37
+ end
38
+
39
+ if (@term_arrays.size == 0)
40
+ @field = terms[0].field
41
+ end
42
+
43
+ terms.each do |term|
44
+ if (term.field != @field)
45
+ raise ArgumentError,
46
+ "All phrase terms must be in the same field (#{@field}): #{term}"
47
+ end
48
+ end
49
+
50
+ @term_arrays << terms
51
+ @positions << position
52
+ end
53
+ alias :<< :add
54
+
55
+ class MultiPhraseWeight < Weight
56
+ include Ferret::Index
57
+
58
+ attr_reader :query, :value
59
+
60
+ def initialize(query, searcher)
61
+ @query = query
62
+ @term_arrays = query.term_arrays
63
+ @positions = query.positions
64
+ @similarity = query.similarity(searcher)
65
+ @idf = 0.0
66
+
67
+ # compute idf
68
+ query.term_arrays.each do |terms|
69
+ terms.each do |term|
70
+ @idf += @similarity.idf_term(term, searcher)
71
+ end
72
+ end
73
+ end
74
+
75
+ def sum_of_squared_weights()
76
+ @query_weight = @idf * @query.boost() # compute query weight
77
+ return @query_weight * @query_weight # square it
78
+ end
79
+
80
+ def normalize(query_norm)
81
+ @query_norm = query_norm
82
+ @query_weight *= query_norm # normalize query weight
83
+ @value = @query_weight * @idf # idf for document
84
+ end
85
+
86
+ def scorer(reader)
87
+ return nil if (@term_arrays.size == 0) # optimize zero-term case
88
+ tps = []
89
+ @term_arrays.each do |terms|
90
+ p = []
91
+ if (terms.length > 1)
92
+ p = MultipleTermDocPosEnum.new(reader, terms)
93
+ else
94
+ p = reader.term_positions_for(terms[0])
95
+ end
96
+
97
+ return nil if (p == nil)
98
+
99
+ tps << p
100
+ end
101
+
102
+ if (@query.slop == 0)
103
+ return ExactPhraseScorer.new(self, tps, @positions, @similarity,
104
+ reader.get_norms(@query.field))
105
+ else
106
+ return SloppyPhraseScorer.new(self, tps, @positions, @similarity,
107
+ @query.slop, reader.get_norms(@query.field))
108
+ end
109
+ end
110
+
111
+ def explain(reader, doc)
112
+
113
+ result = Explanation.new()
114
+ result.description = "weight(#{@query} in #{doc}), product of:"
115
+
116
+ idf_expl = Explanation.new(@idf, "idf(#{@query})")
117
+
118
+ # explain query weight
119
+ query_expl = Explanation.new()
120
+ query_expl.description = "query_weight(#{@query}), product of:"
121
+
122
+ boost_expl = Explanation.new(@query.boost(), "boost")
123
+ (query_expl << boost_expl) if (@query.boost() != 1.0)
124
+
125
+ query_expl << idf_expl
126
+
127
+ query_norm_expl = Explanation.new(@query_norm,"query_norm")
128
+ query_expl << query_norm_expl
129
+
130
+ query_expl.value =
131
+ boost_expl.value * idf_expl.value * query_norm_expl.value
132
+
133
+ result << query_expl
134
+
135
+ # explain field weight
136
+ field_expl = Explanation.new()
137
+ field_expl.description =
138
+ "field_weight(#{@query} in #{doc}), product of:"
139
+
140
+ tf_expl = scorer(reader).explain(doc)
141
+ field_expl << tf_expl
142
+ field_expl << idf_expl
143
+
144
+ field_norm_expl = Explanation.new()
145
+ field_norms = reader.get_norms(@query.field)
146
+ field_norm =
147
+ field_norms ? Similarity.decode_norm(field_norms[doc]) : 0.0
148
+ field_norm_expl.value = field_norm
149
+ field_norm_expl.description =
150
+ "field_norm(field=#{@query.field}, doc=#{doc})"
151
+ field_expl << field_norm_expl
152
+
153
+ field_expl.value = tf_expl.value * idf_expl.value * field_norm_expl.value
154
+ result << field_expl
155
+
156
+ if (query_expl.value == 1.0)
157
+ return field_expl
158
+ else
159
+ result.value = query_expl.value * field_expl.value
160
+ return result
161
+ end
162
+ end
163
+ end
164
+
165
+ def rewrite(reader)
166
+ if (@term_arrays.size() == 1) # optimize one-term case
167
+ terms = @term_arrays[0]
168
+ bq = BooleanQuery.new(true)
169
+ terms.each do |term|
170
+ bq.add(TermQuery.new(term), BooleanClause::Occur::SHOULD)
171
+ end
172
+ bq.boost = boost()
173
+ return boq
174
+ else
175
+ return self
176
+ end
177
+ end
178
+
179
+ def create_weight(searcher)
180
+ return MultiPhraseWeight.new(self, searcher)
181
+ end
182
+
183
+ # Prints a user-readable version of this query.
184
+ def to_s(f = nil)
185
+ buffer = ""
186
+ buffer << "#{@field}:" if @field != f
187
+ buffer << '"'
188
+ last_pos = -1
189
+ @term_arrays.each_index do |i|
190
+ terms = @term_arrays[i]
191
+ pos = @positions[i]
192
+ last_pos.upto(pos-2) {buffer << "<> "}
193
+ last_pos = pos
194
+ buffer << "#{terms.map {|term| term.text}.join("|")} "
195
+ end
196
+ buffer.rstrip!
197
+ buffer << '"'
198
+
199
+ buffer << "~#{@slop}" if (@slop != 0)
200
+ buffer << "^#{boost()}" if boost() != 1.0
201
+ return buffer
202
+ end
203
+ end
204
+ end