ferret 0.1.0

Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
data/lib/ferret/search/hit_collector.rb
@@ -0,0 +1,34 @@
+ module Ferret::Search
+   # Lower-level search API.
+   #
+   # HitCollectors are primarily meant to be used to implement queries,
+   # sorting and filtering.
+   #
+   # See Searcher#search(Query, HitCollector)
+   class HitCollector
+     # Called once for every non-zero scoring document, with the document
+     # number and its score.
+     #
+     # If, for example, an application wished to collect all of the hits for
+     # a query in a BitSet, it might define a collector like this:
+     #
+     #   class BitSetCollector < HitCollector
+     #     def initialize(bits)
+     #       @bits = bits
+     #     end
+     #
+     #     def collect(doc, score)
+     #       @bits.set(doc)
+     #     end
+     #   end
+     #
+     #   searcher = IndexSearcher.new(index_reader)
+     #   bits = BitSet.new(index_reader.max_doc())
+     #   searcher.search(query, BitSetCollector.new(bits))
+     #
+     # NOTE: This is called in an inner search loop. For good search
+     # performance, implementations of this method should not call
+     # Searcher#doc(int) or IndexReader#document(int) on every document
+     # number encountered. Doing so can slow searches by an order of
+     # magnitude or more.
+     #
+     # NOTE: The +score+ passed to this method is a raw score. In other
+     # words, the score will not necessarily be a float whose value is
+     # between 0 and 1.
+     def collect(doc, score)
+       raise NotImplementedError
+     end
+   end
+ end
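The IndexSearcher shipped in this gem (shown further down in this changeset) does not take a HitCollector argument to #search; it exposes #search(query, options) and #search_each(query, filter) { |doc, score| ... } instead, so a collector subclass is most easily driven through #search_each. The following is a minimal sketch, not part of the gem: the index path and the "contents" field are placeholders, and Term.new(field, text) is an assumed constructor signature.

  require 'ferret'
  include Ferret::Search
  include Ferret::Index

  # Hypothetical collector that only records matching document numbers.
  # It deliberately avoids Searcher#doc inside the loop, as advised above.
  class DocIdCollector < HitCollector
    attr_reader :docs

    def initialize
      @docs = []
    end

    def collect(doc, score)
      @docs << doc
    end
  end

  searcher = IndexSearcher.new("/path/to/index")          # placeholder path
  query = TermQuery.new(Term.new("contents", "ferret"))   # assumed Term signature
  collector = DocIdCollector.new
  searcher.search_each(query) {|doc, score| collector.collect(doc, score)}
  puts "collected #{collector.docs.size} docs"
  searcher.close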
data/lib/ferret/search/hit_queue.rb
@@ -0,0 +1,11 @@
+ module Ferret::Search
+   class HitQueue < Ferret::Utils::PriorityQueue
+     def less_than(hit1, hit2)
+       if (hit1.score == hit2.score)
+         return hit1.doc > hit2.doc
+       else
+         return hit1.score < hit2.score
+       end
+     end
+   end
+ end
data/lib/ferret/search/index_searcher.rb
@@ -0,0 +1,173 @@
+ module Ferret::Search
+
+   # Implements search over a single IndexReader.
+   #
+   # Applications usually need only call the #search method. For performance
+   # reasons it is recommended to open only one IndexSearcher and use it for
+   # all of your searches.
+   class IndexSearcher
+     include Ferret::Index
+
+     attr_accessor :similarity, :reader
+
+     # Creates a searcher for an index. Accepts an IndexReader, a
+     # Ferret::Store::Directory, or a String path to the index directory.
+     def initialize(arg)
+       if arg.is_a?(IndexReader)
+         @reader = arg
+       elsif arg.is_a?(Ferret::Store::Directory)
+         @reader = IndexReader.open(arg)
+       elsif arg.is_a?(String)
+         @dir = Ferret::Store::FSDirectory.open(arg)
+         @reader = IndexReader.open(@dir, true)
+       else
+         raise ArgumentError, "Unknown argument passed to initialize IndexSearcher"
+       end
+
+       @similarity = Similarity.default
+     end
+
+     # Frees the resources associated with this searcher by closing the
+     # underlying IndexReader.
+     def close()
+       @reader.close()
+     end
+
+     # Expert: Returns the number of documents containing +term+.
+     # Called by search code to compute term weights.
+     # See IndexReader#doc_freq
+     def doc_freq(term)
+       return @reader.doc_freq(term)
+     end
+
+     # Expert: For each term in the terms array, calculates the number of
+     # documents containing +term+. Returns an array with these document
+     # frequencies. Used to minimize the number of remote calls.
+     def doc_freqs(terms)
+       result = Array.new(terms.length)
+       terms.each_with_index {|term, i| result[i] = doc_freq(term)}
+       return result
+     end
+
+     # Expert: Returns the stored fields of document +i+.
+     # Called by HitCollector implementations.
+     # See IndexReader#document
+     def doc(i)
+       return @reader.document(i)
+     end
+
+     # Expert: Returns one greater than the largest possible document number.
+     # Called by search code to compute term weights.
+     # See IndexReader#max_doc
+     def max_doc()
+       return @reader.max_doc()
+     end
+
+     # Creates a weight for +query+
+     # returns:: new weight
+     def create_weight(query)
+       return query.weight(self)
+     end
+
+     # The main search method for the index. You need to create a query to
+     # pass to this method. You can also pass a hash with one or more of the
+     # following options: :filter, :num_docs, :first_doc and :sort.
+     #
+     # query::     the query to run on the index
+     # filter::    filters docs from the search result
+     # first_doc:: the index in the results of the first doc retrieved.
+     #             Default is 0
+     # num_docs::  the number of results returned. Default is 10
+     # sort::      an array of SortFields describing how to sort the results.
+     def search(query, options = {})
+       filter = options[:filter]
+       first_doc = options[:first_doc]||0
+       num_docs = options[:num_docs]||10
+       sort = options[:sort]
+
+       if (num_docs <= 0) # nil might be returned from hq.top() below.
+         raise ArgumentError, "num_docs must be > 0 to run a search"
+       end
+
+       scorer = query.weight(self).scorer(@reader)
+       if (scorer == nil)
+         return TopDocs.new(0, [])
+       end
+
+       bits = (filter.nil? ? nil : filter.bits(@reader))
+       if (sort)
+         fields = sort.is_a?(Array) ? sort : sort.fields
+         hq = FieldSortedHitQueue.new(@reader, fields, num_docs + first_doc)
+       else
+         hq = HitQueue.new(num_docs + first_doc)
+       end
+       total_hits = 0
+       min_score = 0.0
+       scorer.each_hit() do |doc, score|
+         if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
+           total_hits += 1
+           if hq.size < num_docs or score >= min_score
+             hq.insert(ScoreDoc.new(doc, score))
+             min_score = hq.top.score # maintain min_score
+           end
+         end
+       end
+
+       if (hq.size > first_doc)
+         score_docs = Array.new(hq.size - first_doc)
+         first_doc.times { hq.pop }
+         (hq.size - 1).downto(0) do |i|
+           score_docs[i] = hq.pop
+         end
+       else
+         score_docs = []
+         hq.clear
+       end
+
+       return TopDocs.new(total_hits, score_docs)
+     end
+
+     # Accepts a block and iterates through all of the results yielding the
+     # doc number and the score for that hit. The hits are unsorted. This is
+     # the fastest way to get all of the hits from a search. However, you
+     # will usually want your hits sorted at least by score so you should
+     # use the #search method.
+     def search_each(query, filter = nil)
+       scorer = query.weight(self).scorer(@reader)
+       return if scorer == nil
+       bits = (filter.nil? ? nil : filter.bits(@reader))
+       scorer.each_hit() do |doc, score|
+         if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
+           yield(doc, score)
+         end
+       end
+     end
+
+     # Rewrites the query into a query that can be processed by the search
+     # methods. For example, a FuzzyQuery is turned into a massive boolean
+     # query.
+     #
+     # original:: the original query to be rewritten.
+     def rewrite(original)
+       query = original
+       rewritten_query = query.rewrite(@reader)
+       while query != rewritten_query
+         query = rewritten_query
+         rewritten_query = query.rewrite(@reader)
+       end
+       return query
+     end
+
+     # Returns an Explanation that describes how +doc+ scored against
+     # +query+.
+     #
+     # This is intended to be used in developing Similarity implementations,
+     # and, for good performance, should not be displayed with every hit.
+     # Computing an explanation is as expensive as executing the query over
+     # the entire index.
+     def explain(query, doc)
+       return query.weight(self).explain(@reader, doc)
+     end
+   end
+ end
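As a quick illustration of the options hash accepted by #search, the sketch below fetches the second page of ten results. It is not taken from the gem: the index path and the "title" field are placeholders, Term.new(field, text) is an assumed constructor, and the +total_hits+/+score_docs+ readers on TopDocs and +doc+/+score+ readers on ScoreDoc are assumed from the way those objects are constructed above.

  require 'ferret'
  include Ferret::Search
  include Ferret::Index

  searcher = IndexSearcher.new("/path/to/index")       # placeholder path
  query = TermQuery.new(Term.new("title", "ferret"))   # assumed Term signature

  # Second "page" of hits: skip the first 10, return the next 10.
  top_docs = searcher.search(query, :first_doc => 10, :num_docs => 10)
  puts "#{top_docs.total_hits} total hits"
  top_docs.score_docs.each do |hit|
    puts "doc #{hit.doc} scored #{hit.score}"
  end
  searcher.close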
data/lib/ferret/search/match_all_docs_query.rb
@@ -0,0 +1,104 @@
+ module Ferret::Search
+   # A query that matches all documents.
+   class MatchAllDocsQuery < Query
+
+     def initialize()
+       super
+     end
+
+     class MatchAllScorer < Scorer
+
+       def initialize(reader, similarity)
+         super(similarity)
+         @reader = reader
+         @count = -1
+         @max_doc = reader.max_doc
+       end
+
+       def doc()
+         return @count
+       end
+
+       def explain(doc)
+         return Explanation.new(1.0, "MatchAllDocsQuery")
+       end
+
+       def next?
+         while (@count < (@max_doc - 1))
+           @count += 1
+           if (!@reader.deleted?(@count))
+             return true
+           end
+         end
+         return false
+       end
+
+       def score()
+         return 1.0
+       end
+
+       def skip_to(target)
+         @count = target - 1
+         return next?
+       end
+     end
+
+     class MatchAllDocsWeight < Weight
+       attr_reader :query
+
+       def initialize(query, searcher)
+         @query = query
+         @searcher = searcher
+       end
+
+       def to_s()
+         return "weight(#{@query})"
+       end
+
+       def value()
+         return 1.0
+       end
+
+       def sum_of_squared_weights()
+         return 1.0
+       end
+
+       def normalize(query_norm)
+       end
+
+       def scorer(reader)
+         return MatchAllScorer.new(reader, @query.similarity(@searcher))
+       end
+
+       def explain(reader, doc)
+         # explain query weight
+         query_expl = Explanation.new(1.0, "MatchAllDocsQuery")
+         boost_expl = Explanation.new(@query.boost, "boost")
+         if (boost_expl.value != 1.0)
+           query_expl << boost_expl
+           query_expl.value = boost_expl.value
+         end
+
+         return query_expl
+       end
+     end
+
+     def create_weight(searcher)
+       return MatchAllDocsWeight.new(self, searcher)
+     end
+
+     def to_s(field)
+       buffer = "MatchAllDocsQuery"
+       buffer << "^#{boost}" if (boost() != 1.0)
+       return buffer
+     end
+
+     def eql?(o)
+       return (o.instance_of?(MatchAllDocsQuery) and boost == o.boost)
+     end
+     alias :== :eql?
+
+     def hash
+       return boost.hash
+     end
+   end
+ end
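MatchAllDocsQuery combined with IndexSearcher#search_each gives a cheap way to walk every undeleted document in an index; the sketch below simply counts them. This is illustrative only and the index path is a placeholder.

  require 'ferret'
  include Ferret::Search

  searcher = IndexSearcher.new("/path/to/index")  # placeholder path
  count = 0
  searcher.search_each(MatchAllDocsQuery.new) {|doc, score| count += 1}
  puts "#{count} live documents in the index"
  searcher.close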
data/lib/ferret/search/multi_phrase_query.rb
@@ -0,0 +1,204 @@
+ module Ferret::Search
+   # MultiPhraseQuery is a generalized version of PhraseQuery which allows an
+   # array of terms to be added at a single phrase position via #add.
+   #
+   # To use this class to search for the phrase "Microsoft app*", first call
+   # add() with the term "Microsoft", then find all terms that have "app" as
+   # a prefix using IndexReader#terms(Term), and pass that array of terms to
+   # add() to add them to the query.
+   #
+   # Author:: Anders Nielsen
+   class MultiPhraseQuery < Query
+     include Ferret::Index
+
+     attr_accessor :slop
+     attr_reader :positions, :term_arrays, :field
+
+     def initialize()
+       super()
+       @slop = 0
+       @term_arrays = []
+       @positions = []
+       @field = nil
+     end
+
+     # Allows you to specify the relative position of terms within the
+     # phrase.
+     #
+     # See PhraseQuery#add(Term, int)
+     #
+     # terms::    the array of terms to search for or a single term
+     # position:: the position to search for these terms
+     def add(terms, position = nil, pos_inc = 1)
+       if position.nil?
+         position = (@positions.size > 0) ? (@positions[-1] + pos_inc) : 0
+       end
+
+       if terms.instance_of?(Term)
+         terms = [terms]
+       end
+
+       if (@term_arrays.size == 0)
+         @field = terms[0].field
+       end
+
+       terms.each do |term|
+         if (term.field != @field)
+           raise ArgumentError,
+             "All phrase terms must be in the same field (#{@field}): #{term}"
+         end
+       end
+
+       @term_arrays << terms
+       @positions << position
+     end
+     alias :<< :add
+
+     class MultiPhraseWeight < Weight
+       include Ferret::Index
+
+       attr_reader :query, :value
+
+       def initialize(query, searcher)
+         @query = query
+         @term_arrays = query.term_arrays
+         @positions = query.positions
+         @similarity = query.similarity(searcher)
+         @idf = 0.0
+
+         # compute idf
+         query.term_arrays.each do |terms|
+           terms.each do |term|
+             @idf += @similarity.idf_term(term, searcher)
+           end
+         end
+       end
+
+       def sum_of_squared_weights()
+         @query_weight = @idf * @query.boost() # compute query weight
+         return @query_weight * @query_weight  # square it
+       end
+
+       def normalize(query_norm)
+         @query_norm = query_norm
+         @query_weight *= query_norm  # normalize query weight
+         @value = @query_weight * @idf # idf for document
+       end
+
+       def scorer(reader)
+         return nil if (@term_arrays.size == 0) # optimize zero-term case
+         tps = []
+         @term_arrays.each do |terms|
+           if (terms.length > 1)
+             p = MultipleTermDocPosEnum.new(reader, terms)
+           else
+             p = reader.term_positions_for(terms[0])
+           end
+
+           return nil if (p == nil)
+
+           tps << p
+         end
+
+         if (@query.slop == 0)
+           return ExactPhraseScorer.new(self, tps, @positions, @similarity,
+                                        reader.get_norms(@query.field))
+         else
+           return SloppyPhraseScorer.new(self, tps, @positions, @similarity,
+                                         @query.slop,
+                                         reader.get_norms(@query.field))
+         end
+       end
+
+       def explain(reader, doc)
+         result = Explanation.new()
+         result.description = "weight(#{@query} in #{doc}), product of:"
+
+         idf_expl = Explanation.new(@idf, "idf(#{@query})")
+
+         # explain query weight
+         query_expl = Explanation.new()
+         query_expl.description = "query_weight(#{@query}), product of:"
+
+         boost_expl = Explanation.new(@query.boost(), "boost")
+         (query_expl << boost_expl) if (@query.boost() != 1.0)
+
+         query_expl << idf_expl
+
+         query_norm_expl = Explanation.new(@query_norm, "query_norm")
+         query_expl << query_norm_expl
+
+         query_expl.value =
+           boost_expl.value * idf_expl.value * query_norm_expl.value
+
+         result << query_expl
+
+         # explain field weight
+         field_expl = Explanation.new()
+         field_expl.description =
+           "field_weight(#{@query} in #{doc}), product of:"
+
+         tf_expl = scorer(reader).explain(doc)
+         field_expl << tf_expl
+         field_expl << idf_expl
+
+         field_norm_expl = Explanation.new()
+         field_norms = reader.get_norms(@query.field)
+         field_norm =
+           field_norms ? Similarity.decode_norm(field_norms[doc]) : 0.0
+         field_norm_expl.value = field_norm
+         field_norm_expl.description =
+           "field_norm(field=#{@query.field}, doc=#{doc})"
+         field_expl << field_norm_expl
+
+         field_expl.value =
+           tf_expl.value * idf_expl.value * field_norm_expl.value
+         result << field_expl
+
+         if (query_expl.value == 1.0)
+           return field_expl
+         else
+           result.value = query_expl.value * field_expl.value
+           return result
+         end
+       end
+     end
+
+     def rewrite(reader)
+       if (@term_arrays.size() == 1) # optimize one-term case
+         terms = @term_arrays[0]
+         bq = BooleanQuery.new(true)
+         terms.each do |term|
+           bq.add(TermQuery.new(term), BooleanClause::Occur::SHOULD)
+         end
+         bq.boost = boost()
+         return bq
+       else
+         return self
+       end
+     end
+
+     def create_weight(searcher)
+       return MultiPhraseWeight.new(self, searcher)
+     end
+
+     # Prints a user-readable version of this query.
+     def to_s(f = nil)
+       buffer = ""
+       buffer << "#{@field}:" if @field != f
+       buffer << '"'
+       last_pos = -1
+       @term_arrays.each_index do |i|
+         terms = @term_arrays[i]
+         pos = @positions[i]
+         last_pos.upto(pos - 2) { buffer << "<> " }
+         last_pos = pos
+         buffer << "#{terms.map {|term| term.text}.join("|")} "
+       end
+       buffer.rstrip!
+       buffer << '"'
+
+       buffer << "~#{@slop}" if (@slop != 0)
+       buffer << "^#{boost()}" if boost() != 1.0
+       return buffer
+     end
+   end
+ end
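To make the "Microsoft app*" recipe from the class comment concrete, the sketch below expands the prefix by hand and feeds the result into the query. It is not part of the gem: the expand_prefix helper is hypothetical, the exact enumerator interface returned by IndexReader#terms (assumed here to respond to +term+, +next?+ and +close+) should be checked against term_enum.rb, and the index path and "contents" field are placeholders.

  require 'ferret'
  include Ferret::Search
  include Ferret::Index

  # Hypothetical helper: collect all terms in +field+ whose text starts with
  # +prefix+, scanning the term dictionary from the first term >= the prefix
  # until the prefix no longer matches.
  def expand_prefix(reader, field, prefix)
    terms = []
    term_enum = reader.terms(Term.new(field, prefix))
    begin
      term = term_enum.term
      break if term.nil? or term.field != field or
               term.text[0, prefix.length] != prefix
      terms << term
    end while term_enum.next?
    term_enum.close
    terms
  end

  searcher = IndexSearcher.new("/path/to/index")    # placeholder path
  reader = searcher.reader

  query = MultiPhraseQuery.new
  query << Term.new("contents", "microsoft")         # single term at position 0
  query << expand_prefix(reader, "contents", "app")  # any "app*" term at position 1

  searcher.search_each(query) {|doc, score| puts "doc #{doc}: #{score}"}
  searcher.close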