ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,32 @@
1
+ module Ferret::Search
2
# A PhraseScorer whose phrase frequency counts only the positions at which
# all of its phrase-position entries line up exactly.
class ExactPhraseScorer < PhraseScorer

  # All arguments are forwarded unchanged to PhraseScorer.
  def initialize(weight, tps, positions, similarity, norms)
    super
  end

  # Counts the phrase matches.
  #
  # returns:: the number of times the first and last entries of the list
  #           ended up on the same position
  def phrase_freq
    # Sort the list with the priority queue: reset every entry to its first
    # position, push it onto the queue, then rebuild the list from the queue.
    each do |phrase_pos|
      phrase_pos.first_position
      @pq.push(phrase_pos)
    end
    pq_to_list

    matches = 0
    loop do
      # Scan forward in the first entry until it is no longer behind the last.
      while @first.position < @last.position
        loop do
          # Running out of positions ends the count immediately.
          return matches unless @first.next_position
          break unless @first.position < @last.position
        end
        first_to_last
      end
      matches += 1 # all equal: a match
      break unless @last.next_position
    end

    matches
  end
end
32
+ end
@@ -0,0 +1,41 @@
1
+ module Ferret::Search
2
# Expert: Describes the score computation for document and query.
#
# An Explanation holds a +value+, a +description+ and a list of nested
# explanations in +details+.
class Explanation
  attr_accessor :value, :description, :details

  # value::       the value being explained (defaults to +nil+)
  # description:: text describing the value (defaults to +nil+)
  def initialize(value = nil, description = nil)
    @value, @description, @details = value, description, []
  end

  # Appends +detail+ to the nested explanations.
  #
  # returns:: the +details+ array
  def <<(detail)
    @details << detail
  end

  # Render an explanation as text. Each explanation takes one line of the
  # form "value = description", prefixed with +depth+ spaces; nested
  # explanations are rendered one level deeper.
  def to_s(depth = 0)
    text = ' ' * depth
    text << "#{@value} = #{@description}\n"
    @details.inject(text) { |out, detail| out << detail.to_s(depth + 1) }
  end

  # Render an explanation as HTML. Each explanation becomes a <ul> holding
  # one <li> for itself, followed by the lists of its nested explanations.
  def to_html
    html = "<ul>\n".dup
    html << "<li>#{@value} = #{@description}</li>\n"
    @details.each { |detail| html << detail.to_html }
    html << "</ul>\n"
    html
  end
end
41
+ end
@@ -0,0 +1,216 @@
1
+ module Ferret::Search
2
+ require 'monitor'
3
+
4
+ # Expert: The default cache implementation, storing all values in memory.
5
+ # A WeakKeyHash is used for storage.
6
+ class FieldCache
7
+ include Ferret::Index
8
+
9
+ StringIndex = Struct.new(:str_index, :str_map)
10
+
11
# Expert: Every key in the internal cache is of this type.
#
# An Entry combines a field, a sort type and an optional comparator and
# defines +eql?+, +==+ and +hash+ so that it can serve as a Hash key.
class Entry
  attr_reader :field, :sort_type, :comparator

  # Creates one of these objects.
  #
  # field::      the field the entry refers to
  # sort_type::  the sort type the entry refers to
  # comparator:: optional comparator (defaults to +nil+)
  def initialize(field, sort_type, comparator = nil)
    @field, @sort_type, @comparator = field, sort_type, comparator
  end

  # Two of these are equal iff +o+ is an Entry (and not a subclass) whose
  # field, sort_type and comparator are all equal to this entry's.
  def eql?(o)
    o.instance_of?(Entry) &&
      o.field == @field &&
      o.sort_type == @sort_type &&
      o.comparator == comparator
  end
  alias_method :==, :eql?

  # Composes a hashcode by XOR-ing the hashes of the field, the sort_type
  # and the comparator.
  def hash
    [@field, @sort_type, @comparator].inject(0) { |code, part| code ^ part.hash }
  end
end
33
+
34
+ INT_PARSER = lambda {|i| i.to_i}
35
+
36
+ FLOAT_PARSER = lambda {|i| i.to_f}
37
+
38
+ # The internal cache. Maps Entry to array of interpreted term values.
39
+ @@cache = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)
40
+
41
# See if an object is in the cache.
#
# reader::    key of the per-reader cache to search
# field::     field part of the cache key
# sort_type:: sort type part of the cache key
# returns::   the cached value, or +nil+ if nothing is cached for +reader+
#             or for this field/sort_type combination
def FieldCache.lookup(reader, field, sort_type)
  key = Entry.new(field, sort_type)
  @@cache.synchronize do
    per_reader = @@cache[reader]
    return per_reader.nil? ? nil : per_reader[key]
  end
end
50
+
51
# Put an object into the cache.
#
# reader::    key of the per-reader cache; the per-reader Hash is created
#             on first use
# field::     field part of the cache key
# sort_type:: sort type part of the cache key
# value::     the object to cache
# returns::   +value+
def FieldCache.store(reader, field, sort_type, value)
  key = Entry.new(field, sort_type)
  @@cache.synchronize do
    per_reader = @@cache[reader]
    @@cache[reader] = per_reader = {} if per_reader.nil?
    return per_reader[key] = value
  end
end
63
+
64
# Checks the internal cache for an appropriate entry, and if none is found,
# reads the terms in +field+, parses them with the parser held by
# +sort_type+ and returns an array of size +reader.max_doc+ holding the
# parsed value for each document that has a term in the field. The array is
# stored in the cache before it is returned.
#
# reader::    Used to get field values.
# field::     Which field contains the values.
# sort_type:: The type of sort to run on the field. Holds the parser.
# return::    The values in the given field for each document.
# raises::    RuntimeError if the term enumeration has no current term.
def FieldCache.get_index(reader, field, sort_type)
  cached = lookup(reader, field, sort_type)
  return cached unless cached.nil?

  parser = sort_type.parser
  values = Array.new(reader.max_doc)
  unless values.empty?
    term_docs = reader.term_docs
    term_enum = reader.terms_from(Term.new(field, ""))
    begin
      raise "no terms in field '#{field}' to sort by" if term_enum.term.nil?

      loop do
        term = term_enum.term
        # The enumeration has moved on to another field: we are done.
        break if term.field != field

        parsed = parser.call(term.text)
        term_docs.seek(term_enum)
        values[term_docs.doc] = parsed while term_docs.next?

        break unless term_enum.next?
      end
    ensure
      term_docs.close
      term_enum.close
    end
  end

  store(reader, field, sort_type, values)
  values
end
103
+
104
# Checks the internal cache for an appropriate entry, and if none is found
# reads the term values in +field+ and returns an array of them in natural
# order, along with an array telling which element in the term array each
# document uses.
#
# The returned StringIndex holds:
# str_map::   slot 0 is always +nil+ (reserved for documents without a term
#             in the field); slots 1..n hold the term texts in enumeration
#             order.
# str_index:: for each document number, the slot in +str_map+ of its term.
#
# reader::  Used to get field values.
# field::   Which field contains the strings.
# returns:: Array of terms and index into the array for each document.
# raises::  RuntimeError if the term enumeration has no current term, or if
#           the field has more terms than there are documents.
def FieldCache.get_string_index(reader, field)
  index = lookup(reader, field, SortField::SortType::STRING)
  if index.nil?
    # NOTE(review): documents without a term keep +nil+ here rather than the
    # reserved slot 0 -- confirm that the comparators expect that.
    str_index = Array.new(reader.max_doc)
    str_map = Array.new(reader.max_doc + 1)
    if str_index.length > 0
      term_docs = reader.term_docs
      term_enum = reader.terms_from(Term.new(field, ""))

      # an entry for documents that have no terms in this field. Should a
      # document with no terms be at top or bottom?
      #
      # this puts them at the top - if it is changed, FieldDocSortedHitQueue
      # needs to change as well.
      str_map[0] = nil
      t = 1 # current term number

      begin
        if term_enum.term.nil?
          raise "no terms in field #{field} to sort by"
        end
        begin
          term = term_enum.term
          break if term.field != field

          # store term text
          # we expect that there is at most one term per document
          if t >= str_map.length
            raise "there are more terms than documents in field \"#{field}\", but it's impossible to sort on tokenized fields"
          end
          str_map[t] = term.text

          term_docs.seek(term_enum)
          while term_docs.next?
            str_index[term_docs.doc] = t
          end

          t += 1
        end while term_enum.next?
      ensure
        term_docs.close
        term_enum.close
      end

      # If there are fewer terms than documents, trim off the dead array
      # space. Only the unused tail may go: removing every nil (compact!)
      # would also drop the reserved slot 0 and shift each term one slot
      # down, so the ordinals stored in str_index would no longer match.
      str_map.slice!(t..-1) if t < str_map.length
    end
    index = StringIndex.new(str_index, str_map)
    store(reader, field, SortField::SortType::STRING, index)
  end
  return index
end
172
+
173
# Checks the internal cache for an appropriate entry, and if none is found
# looks at the first term of +field+ to see whether it reads as an integer,
# a float or a string, and then calls one of the other methods in this class
# to get the values. For string values, a StringIndex is returned. After
# calling this method, there is an entry in the cache for both type +AUTO+
# and the actual found type.
#
# reader::  Used to get field values.
# field::   Which field contains the values.
# return::  Integer Array, Float Array or StringIndex.
# raises::  RuntimeError if the term enumeration has no current term, or if
#           its first term belongs to a different field.
def FieldCache.get_auto_index(reader, field)
  cached = lookup(reader, field, SortField::SortType::AUTO)
  return cached unless cached.nil?

  term_enum = reader.terms_from(Term.new(field, ""))
  begin
    term = term_enum.term
    raise "no terms in field #{field} to sort by" if term.nil?
    raise "field \"#{field}\" does not appear to be indexed" unless term.field == field

    text = term.text.strip
    detected =
      if text == text.to_i.to_s
        get_index(reader, field, SortField::SortType::INT)
      elsif text == text.to_f.to_s || text == "%f" % text.to_f
        get_index(reader, field, SortField::SortType::FLOAT)
      else
        get_string_index(reader, field)
      end

    store(reader, field, SortField::SortType::AUTO, detected) unless detected.nil?
    detected
  ensure
    term_enum.close
  end
end
215
+ end
216
+ end
@@ -0,0 +1,31 @@
1
+ module Ferret::Search
2
# Expert: A ScoreDoc which also contains information about how to sort the
# referenced document. In addition to the document number and score, this
# object contains an array of values for the document from the field(s)
# used to sort. For example, if the sort criteria was to sort by fields
# "a", "b" then "c", the +fields+ array will have three elements,
# corresponding respectively to the term values for the document in fields
# "a", "b" and "c". The class of each element in the array will be either
# Integer, Float or String depending on the type of values in the terms of
# each field.
class FieldDoc < ScoreDoc

  # Expert: The values which are used to sort the referenced document. The
  # order of these will match the original sort criteria given by a Sort
  # object. Each Object will be either an Integer, Float or String,
  # depending on the type of values in the terms of the original field.
  #
  # See Sort
  # See Searcher#search(Query,Filter,int,Sort)
  attr_accessor :fields

  # Expert: Creates one of these objects with the given sort information.
  #
  # doc::    passed on to ScoreDoc
  # score::  passed on to ScoreDoc
  # fields:: the sort values for the document (defaults to +nil+)
  def initialize(doc, score, fields = nil)
    super(doc, score)
    @fields = fields
  end
end
31
+ end
@@ -0,0 +1,184 @@
1
+ require 'monitor'
2
+
3
+ module Ferret::Search
4
+ # Expert: A hit queue for sorting hits by terms in more than one field.
5
+ # Uses +FieldCache+ for maintaining internal term lookup tables.
6
+ class FieldSortedHitQueue < Ferret::Utils::PriorityQueue
7
+ # Stores a comparator corresponding to each field being sorted by
8
+ attr_accessor :comparators
9
+
10
+ # Stores the sort criteria being used.
11
+ attr_accessor :fields
12
+
13
# Creates a hit queue sorted by the given list of fields.
#
# reader:: Index to use.
# fields:: Sort fields, in priority order (highest priority first).
#          Cannot be +nil+ or empty.
# size::   The number of hits to retain. Must be greater than zero.
# raises:: IOError
def initialize(reader, fields, size)
  super(size)

  count = fields.length
  @comparators = Array.new(count)
  @fields = Array.new(count)

  fields.each_with_index do |field, slot|
    @comparators[slot] = get_cached_comparator(reader, field)
    # Rebuild the sort field with the sort type the comparator reports.
    @fields[slot] = SortField.new(field.name,
                                  {:sort_type => comparators[slot].sort_type,
                                   :reverse => field.reverse?})
  end

  # Stores the maximum score value encountered, for normalizing.
  # We only care about scores greater than 1.0 - if all the scores
  # are less than 1.0, we don't have to normalize.
  @max_score = 1.0
end
37
+
38
+
39
# Returns whether +sd1+ is less relevant than +sd2+. As a side effect the
# highest score seen so far is recorded in @max_score.
#
# sd1::     ScoreDoc
# sd2::     ScoreDoc
# returns:: +true+ if document +sd1+ should be sorted after document +sd2+.
def less_than(sd1, sd2)
  # keep track of maximum score
  [sd1, sd2].each do |score_doc|
    @max_score = score_doc.score if score_doc.score > @max_score
  end

  # run the comparators in priority order until one of them decides
  order = 0
  @comparators.each_with_index do |comparator, slot|
    order = if @fields[slot].reverse?
              comparator.compare(sd2, sd1)
            else
              comparator.compare(sd1, sd2)
            end
    break unless order == 0
  end

  # avoid random sort order that could lead to duplicates
  return sd1.doc > sd2.doc if order == 0

  order > 0
end
66
+
67
+
68
# Given a FieldDoc object, stores the values used to sort the given
# document. These values are not the raw values out of the index, but the
# internal representation of them. This is so the given search hit can be
# collated by a MultiSearcher with other search hits.
#
# One value is stored per comparator, in comparator order, as returned by
# the comparator's +sort_value+.
#
# doc::     The FieldDoc to store sort values into.
# returns:: The same FieldDoc passed in.
#
# See Searchable#search(Weight,Filter,int,Sort)
def fill_fields(doc)
  doc.fields = @comparators.map { |comparator| comparator.sort_value(doc) }
  doc
end
83
+
84
+ # Internal cache of comparators. Similar to FieldCache, only
85
+ # caches comparators instead of term values.
86
+ @@comparators = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)
87
+
88
# Returns a comparator if it is in the cache.
#
# reader::    key of the per-reader comparator cache to search
# field::     field part of the cache key
# sort_type:: sort type part of the cache key
# comproc::   comparator part of the cache key
# returns::   the cached comparator, or +nil+ if there is none
def lookup(reader, field, sort_type, comproc)
  key = FieldCache::Entry.new(field, sort_type, comproc)
  @@comparators.synchronize do
    per_reader = @@comparators[reader]
    return per_reader.nil? ? nil : per_reader[key]
  end
end
97
+
98
# Stores a comparator into the cache.
#
# reader::    key of the per-reader comparator cache; the per-reader Hash
#             is created on first use
# field::     field part of the cache key
# sort_type:: sort type part of the cache key
# comproc::   comparator part of the cache key
# value::     the comparator to cache
# returns::   +value+
def store(reader, field, sort_type, comproc, value)
  key = FieldCache::Entry.new(field, sort_type, comproc)
  @@comparators.synchronize do
    per_reader = @@comparators[reader]
    @@comparators[reader] = per_reader = {} if per_reader.nil?
    return per_reader[key] = value
  end
end
110
+
111
# Returns the comparator to use for +field+, building and caching it when
# it is not in the comparator cache yet.
#
# The DOC and SCORE sort types map straight to the shared
# ScoreDocComparator::INDEX_ORDER and ScoreDocComparator::RELEVANCE
# comparators and never touch the cache.
#
# reader::  Index to use.
# field::   The sort field; supplies the name, sort type and comparator.
# returns:: Comparator for sorting hits.
def get_cached_comparator(reader, field)
  return ScoreDocComparator::INDEX_ORDER if field.sort_type == SortField::SortType::DOC
  return ScoreDocComparator::RELEVANCE if field.sort_type == SortField::SortType::SCORE

  comparator = lookup(reader, field.name, field.sort_type, field.comparator)
  if comparator.nil?
    # No colon after the +when+ values: that form is only accepted by
    # Ruby 1.8, the newline form is accepted by every version.
    comparator =
      case field.sort_type
      when SortField::SortType::AUTO
        comparator_auto(reader, field.name)
      when SortField::SortType::STRING
        comparator_string(reader, field.name)
      else
        comparator_simple(reader, field)
      end

    store(reader, field.name, field.sort_type, field.comparator, comparator)
  end
  comparator
end
134
+
135
# Returns a comparator for sorting hits according to the sort type and the
# comparator function of the given field. A SpecialFieldComparator is
# returned when the field has a comparator function, a
# SimpleFieldComparator otherwise.
#
# reader::  Index to use.
# field::   Lets us know which field to search and how to parse it.
# returns:: Comparator for sorting hits.
def comparator_simple(reader, field)
  index = FieldCache.get_index(reader, field.name, field.sort_type)
  comproc = field.comparator
  return SimpleFieldComparator.new(index, field.sort_type) unless comproc

  SpecialFieldComparator.new(index, field.sort_type, comproc)
end
151
+
152
# Returns a comparator for sorting hits according to a field containing
# strings. The comparator wraps the string index that FieldCache returns
# for the field.
#
# reader::  Index to use.
# field::   Field containing string values.
# returns:: Comparator for sorting hits.
def comparator_string(reader, field)
  StringFieldComparator.new(FieldCache.get_string_index(reader, field))
end
162
+
163
# Returns a comparator for sorting hits according to values in the given
# field. FieldCache.get_auto_index decides whether the terms of the field
# hold integers, floats or strings; the matching comparator is built from
# what it returns.
#
# reader::  Index to use.
# field::   Field containing values.
# returns:: Comparator for sorting hits.
# raises::  RuntimeError if the values are neither Integer nor Float (which
#           includes an index holding no value at all).
def comparator_auto(reader, field)
  index = FieldCache.get_auto_index(reader, field)
  return StringFieldComparator.new(index) if index.is_a?(FieldCache::StringIndex)

  # Probe the first value that is actually present: a slot is nil for any
  # document without a term in the field, so slot 0 alone cannot be trusted
  # to reveal the value type.
  sample = index.find { |value| !value.nil? }
  case sample
  when Integer
    SimpleFieldComparator.new(index, SortField::SortType::INT)
  when Float
    SimpleFieldComparator.new(index, SortField::SortType::FLOAT)
  else
    raise "unknown data type in field '#{field}'. Data = #{sample}"
  end
end
183
+ end
184
+ end