ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202):
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,32 @@
module Ferret::Search
  # A PhraseScorer whose phrase_freq counts the positions at which the first
  # and last phrase positions line up (see the inline comments for the scan).
  class ExactPhraseScorer < PhraseScorer

    # Forwards every argument unchanged to PhraseScorer#initialize.
    def initialize(weight, tps, positions, similarity, norms)
      super
    end

    # Counts the matches found by repeatedly advancing @first until it is no
    # longer behind @last.
    #
    # returns:: the number of matches counted before either @first or @last
    #           ran out of positions.
    def phrase_freq()
      # Sort the list with the priority queue: reset each phrase position,
      # push it onto the queue, then rebuild the list from the queue.
      each do |phrase_pos|
        phrase_pos.first_position()
        @pq.push(phrase_pos)
      end
      pq_to_list()

      matches = 0
      loop do # find a position with all terms
        while @first.position < @last.position
          # Scan forward in @first until it catches up with @last; if it has
          # no more positions there can be no further matches.
          loop do
            return matches unless @first.next_position()
            break unless @first.position < @last.position
          end
          first_to_last()
        end
        matches += 1 # all equal: a match
        break unless @last.next_position()
      end

      matches
    end
  end
end
@@ -0,0 +1,41 @@
module Ferret::Search
  # Expert: Describes the score computation for a document and query as a
  # tree: each Explanation has a value, a description and a list of child
  # details.
  class Explanation
    attr_accessor :value, :description, :details

    # value::       the value being explained (may be nil)
    # description:: text describing how the value was produced (may be nil)
    def initialize(value = nil, description = nil)
      @value, @description = value, description
      @details = []
    end

    # Appends a child detail to this explanation.
    #
    # detail::  object responding to +to_s(depth)+ and +to_html+
    # returns:: the details array, so appends can be chained
    def <<(detail)
      @details << detail
    end

    # Renders this explanation and, recursively, its details as text. Each
    # node is written on its own line, prefixed with one space per level of
    # +depth+.
    def to_s(depth = 0)
      text = " " * depth + "#{@value} = #{@description}\n"
      @details.inject(text) do |out, detail|
        out << detail.to_s(depth + 1)
      end
    end

    # Renders this explanation as an HTML list: one <li> for this node,
    # followed by the HTML of every detail, all wrapped in a single <ul>.
    def to_html()
      html = "<ul>\n<li>#{@value} = #{@description}</li>\n"
      @details.each { |detail| html << detail.to_html }
      html << "</ul>\n"
    end
  end
end
@@ -0,0 +1,216 @@
module Ferret::Search
  require 'monitor'

  # Expert: The default cache implementation, storing all values in memory.
  # A WeakKeyHash keyed by reader is used for storage; each reader maps to a
  # plain Hash keyed by Entry.
  class FieldCache
    include Ferret::Index

    # Result of get_string_index.
    #
    # str_index:: for each document number, the position in +str_map+ of the
    #             term that document has in the field
    # str_map::   the term texts in enumeration order; slot 0 is always nil
    #             and stands for "no term in this field"
    StringIndex = Struct.new(:str_index, :str_map)

    # Expert: Every key in the per-reader cache is of this type.
    class Entry
      attr_reader :field, :sort_type, :comparator

      # field::      name of the field the cached values come from
      # sort_type::  sort type the values were built for
      # comparator:: optional custom comparator (nil when unused)
      def initialize(field, sort_type, comparator = nil)
        @field = field
        @sort_type = sort_type
        @comparator = comparator
      end

      # Two entries are equal iff they reference the same field, sort_type
      # and comparator.
      def eql?(o)
        return (o.instance_of?(Entry) && o.field == @field &&
                o.sort_type == @sort_type && o.comparator == comparator)
      end
      alias :== :eql?

      # Composes a hashcode from the field, sort_type and comparator, the
      # same attributes eql? compares.
      def hash()
        return @field.hash ^ @sort_type.hash ^ @comparator.hash
      end
    end

    # Converts a term text to an Integer.
    INT_PARSER = lambda {|i| i.to_i}

    # Converts a term text to a Float.
    FLOAT_PARSER = lambda {|i| i.to_f}

    # The internal cache. Maps reader => { Entry => cached value }. Access is
    # serialized through the MonitorMixin the hash is extended with.
    @@cache = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)

    # See if an object is in the cache.
    #
    # returns:: the cached value, or nil when nothing is cached for this
    #           reader/field/sort_type combination.
    def self.lookup(reader, field, sort_type)
      entry = Entry.new(field, sort_type)
      @@cache.synchronize() do
        reader_cache = @@cache[reader]
        return nil if reader_cache.nil?
        return reader_cache[entry]
      end
    end

    # Put an object into the cache, creating the per-reader Hash on first use.
    #
    # returns:: +value+
    def self.store(reader, field, sort_type, value)
      entry = Entry.new(field, sort_type)
      @@cache.synchronize() do
        reader_cache = @@cache[reader]
        if reader_cache.nil?
          reader_cache = {}
          @@cache[reader] = reader_cache
        end
        return reader_cache[entry] = value
      end
    end

    # Checks the internal cache for an appropriate entry, and if none is found,
    # reads the terms in +field+, parses them with +sort_type.parser+ and
    # returns an array of size +reader.max_doc+ holding the value each document
    # has in the given field (nil for documents with no term in the field).
    #
    # reader::    Used to get field values.
    # field::     Which field contains the values.
    # sort_type:: The type of sort to run on the field. Holds the parser.
    # return::    The values in the given field for each document.
    # raises::    RuntimeError if the term enumeration has no current term.
    def self.get_index(reader, field, sort_type)
      index = lookup(reader, field, sort_type)
      return index unless index.nil?

      parser = sort_type.parser
      index = Array.new(reader.max_doc)
      if index.length > 0
        # The ensure blocks are nested so that term_docs is still closed when
        # terms_from raises, and so close is never called on an enumeration
        # that was not created.
        term_docs = reader.term_docs
        begin
          term_enum = reader.terms_from(Term.new(field, ""))
          begin
            if term_enum.term.nil?
              raise "no terms in field '#{field}' to sort by"
            end
            loop do
              term = term_enum.term
              # the enumeration has moved past the requested field
              break if term.field != field
              termval = parser.call(term.text)
              term_docs.seek(term_enum)
              while term_docs.next?
                index[term_docs.doc] = termval
              end
              break unless term_enum.next?
            end
          ensure
            term_enum.close()
          end
        ensure
          term_docs.close()
        end
      end
      store(reader, field, sort_type, index)
      return index
    end

    # Checks the internal cache for an appropriate entry, and if none is found
    # reads the term values in +field+ and returns an array of them in
    # enumeration order, along with an array telling which element in the term
    # array each document uses.
    #
    # reader::  Used to get field values.
    # field::   Which field contains the strings.
    # returns:: StringIndex of terms and index into the terms for each document.
    # raises::  RuntimeError if the term enumeration has no current term, or
    #           if the field has more terms than there are documents.
    def self.get_string_index(reader, field)
      index = lookup(reader, field, SortField::SortType::STRING)
      return index unless index.nil?

      str_index = Array.new(reader.max_doc)
      str_map = Array.new(reader.max_doc + 1)
      if str_index.length > 0
        t = 0 # current term number

        # Slot 0 is an entry for documents that have no terms in this field.
        # Should a document with no terms be at top or bottom? This puts them
        # at the top - if it is changed, FieldDocSortedHitQueue needs to
        # change as well.
        str_map[t] = nil
        t += 1

        # Nested ensure blocks: see get_index for the reasoning.
        term_docs = reader.term_docs
        begin
          term_enum = reader.terms_from(Term.new(field, ""))
          begin
            if term_enum.term.nil?
              raise "no terms in field #{field} to sort by"
            end
            loop do
              term = term_enum.term
              break if term.field != field

              # store term text
              # we expect that there is at most one term per document
              if t >= str_map.length
                raise "there are more terms than documents in field \"#{field}\", but it's impossible to sort on tokenized fields"
              end
              str_map[t] = term.text

              term_docs.seek(term_enum)
              while term_docs.next?
                str_index[term_docs.doc] = t
              end

              t += 1
              break unless term_enum.next?
            end
          ensure
            term_enum.close()
          end
        ensure
          term_docs.close()
        end

        # If there are fewer terms than documents, trim off the dead array
        # space. Only the unused tail may be dropped: compact! would also
        # remove the nil placeholder in slot 0 and shift every term one slot
        # down, so the positions stored in str_index would point at the wrong
        # term.
        str_map.slice!(t..-1) if t < str_map.length
      end
      index = StringIndex.new(str_index, str_map)
      store(reader, field, SortField::SortType::STRING, index)
      return index
    end

    # Checks the internal cache for an appropriate entry, and if none is found
    # reads +field+ to see if it contains integers, floats or strings, and then
    # calls one of the other methods in this class to get the values. For
    # string values, a StringIndex is returned. After calling this method,
    # there is an entry in the cache for both type +AUTO+ and the actual found
    # type.
    #
    # The type is decided from the first term of the field only: it is an
    # integer if its stripped text round-trips through +to_i+, a float if it
    # round-trips through +to_f+ (or "%f" formatting), and a string otherwise.
    #
    # reader:: Used to get field values.
    # field::  Which field contains the values.
    # return:: Integer Array, Float Array or StringIndex.
    # raises:: RuntimeError if there is no term at all, or if the first term
    #          found belongs to a different field.
    def self.get_auto_index(reader, field)
      index = lookup(reader, field, SortField::SortType::AUTO)
      return index unless index.nil?

      term_enum = reader.terms_from(Term.new(field, ""))
      begin
        term = term_enum.term
        if term.nil?
          raise "no terms in field #{field} to sort by"
        end
        if term.field != field
          raise "field \"#{field}\" does not appear to be indexed"
        end

        termtext = term.text.strip
        if termtext == termtext.to_i.to_s
          index = get_index(reader, field, SortField::SortType::INT)
        elsif termtext == termtext.to_f.to_s || termtext == "%f" % termtext.to_f
          index = get_index(reader, field, SortField::SortType::FLOAT)
        else
          index = get_string_index(reader, field)
        end

        store(reader, field, SortField::SortType::AUTO, index) unless index.nil?
      ensure
        term_enum.close()
      end
      return index
    end
  end
end
@@ -0,0 +1,31 @@
module Ferret::Search
  # Expert: A ScoreDoc that additionally carries the values the document was
  # sorted by. Besides the document number and score it holds an array with
  # one value per sort field. For example, when sorting by fields "a", "b"
  # then "c", +fields+ has three elements: the document's term values in
  # "a", "b" and "c" respectively. Each element is an Integer, Float or
  # String, depending on the type of values in the terms of that field.
  class FieldDoc < ScoreDoc

    # Expert: The sort values for the referenced document, in the same order
    # as the original sort criteria given by a Sort object. Each value is an
    # Integer, Float or String, depending on the type of values in the terms
    # of the original field.
    # See Sort
    # See Searcher#search(Query,Filter,int,Sort)
    attr_accessor :fields

    # Expert: Creates a FieldDoc with the given sort information.
    #
    # doc::    passed on to ScoreDoc#initialize
    # score::  passed on to ScoreDoc#initialize
    # fields:: the values used to sort the document (nil until filled in)
    def initialize(doc, score, fields = nil)
      super(doc, score)
      @fields = fields
    end
  end
end
@@ -0,0 +1,184 @@
require 'monitor'

module Ferret::Search
  # Expert: A hit queue for sorting hits by terms in more than one field.
  # Uses +FieldCache+ for maintaining internal term lookup tables.
  class FieldSortedHitQueue < Ferret::Utils::PriorityQueue
    # Stores a comparator corresponding to each field being sorted by
    attr_accessor :comparators

    # Stores the sort criteria being used.
    attr_accessor :fields

    # Creates a hit queue sorted by the given list of fields.
    #
    # reader:: Index to use.
    # fields:: Sort fields, in priority order (highest priority first).
    #          Cannot be +nil+ or empty.
    # size::   The number of hits to retain. Must be greater than zero.
    # raises:: IOError
    def initialize(reader, fields, size)
      super(size)
      n = fields.length
      @comparators = Array.new(n)
      @fields = Array.new(n)
      fields.each_with_index do |field, i|
        @comparators[i] = get_cached_comparator(reader, field)
        # Rebuild the sort field with the sort type the comparator resolved
        # to, which can differ from the requested one (e.g. for AUTO).
        @fields[i] = SortField.new(field.name,
                                   {:sort_type => @comparators[i].sort_type,
                                    :reverse => field.reverse?})
      end

      # Stores the maximum score value encountered, for normalizing.
      # we only care about scores greater than 1.0 - if all the scores
      # are less than 1.0, we don't have to normalize.
      # NOTE(review): nothing in this class reads @max_score after it is
      # updated in less_than - confirm whether normalization happens elsewhere.
      @max_score = 1.0
    end


    # Returns whether +sd1+ is less relevant than +sd2+. The comparators are
    # tried in priority order; the first non-zero result decides. Ties are
    # broken by document number.
    #
    # sd1::     ScoreDoc
    # sd2::     ScoreDoc
    # returns:: +true+ if document +sd1+ should be sorted after document +sd2+.
    def less_than(sd1, sd2)
      # keep track of maximum score
      @max_score = sd1.score if (sd1.score > @max_score)
      @max_score = sd2.score if (sd2.score > @max_score)

      # run comparators
      c = 0

      @comparators.each_with_index do |comparator, i|
        if @fields[i].reverse?
          c = comparator.compare(sd2, sd1)
        else
          c = comparator.compare(sd1, sd2)
        end
        break unless c == 0
      end

      # avoid random sort order that could lead to duplicates
      if (c == 0)
        return sd1.doc > sd2.doc
      end
      return c > 0
    end


    # Given a FieldDoc object, stores the values used
    # to sort the given document. These values are not the raw
    # values out of the index, but the internal representation
    # of them. This is so the given search hit can be collated
    # by a MultiSearcher with other search hits.
    #
    # doc::     The FieldDoc to store sort values into.
    # returns:: The same FieldDoc passed in.
    # See Searchable#search(Weight,Filter,int,Sort)
    def fill_fields(doc)
      # One sort value per comparator, in comparator order. (The values used
      # to be written through an undefined index variable, which raised
      # NameError on every call.)
      doc.fields = @comparators.map { |comparator| comparator.sort_value(doc) }
      return doc
    end

    # Internal cache of comparators. Similar to FieldCache, only
    # caches comparators instead of term values.
    @@comparators = Ferret::Utils::WeakKeyHash.new.extend(MonitorMixin)

    # Returns a comparator if it is in the cache, nil otherwise.
    def lookup(reader, field, sort_type, comproc)
      entry = FieldCache::Entry.new(field, sort_type, comproc)
      @@comparators.synchronize() do
        reader_cache = @@comparators[reader]
        return nil if reader_cache.nil?
        return reader_cache[entry]
      end
    end

    # Stores a comparator into the cache, creating the per-reader Hash on
    # first use. Returns +value+.
    def store(reader, field, sort_type, comproc, value)
      entry = FieldCache::Entry.new(field, sort_type, comproc)
      @@comparators.synchronize do
        reader_cache = @@comparators[reader]
        if reader_cache.nil?
          reader_cache = Hash.new()
          @@comparators[reader] = reader_cache
        end
        return reader_cache[entry] = value
      end
    end

    # Returns the comparator to use for +field+. DOC and SCORE sorts use the
    # shared ScoreDocComparator constants; every other sort type is looked up
    # in the comparator cache and built (then cached) on a miss.
    #
    # reader::  Index to use.
    # field::   SortField describing the field name, sort type and comparator.
    # returns:: Comparator for sorting hits.
    def get_cached_comparator(reader, field)
      if field.sort_type == SortField::SortType::DOC
        return ScoreDocComparator::INDEX_ORDER
      end
      if field.sort_type == SortField::SortType::SCORE
        return ScoreDocComparator::RELEVANCE
      end

      comparator = lookup(reader, field.name, field.sort_type, field.comparator)
      if comparator.nil?
        # Newline-form +when+ is used because the trailing-colon form is only
        # accepted by Ruby 1.8.
        case field.sort_type
        when SortField::SortType::AUTO
          comparator = comparator_auto(reader, field.name)
        when SortField::SortType::STRING
          comparator = comparator_string(reader, field.name)
        else
          comparator = comparator_simple(reader, field)
        end

        store(reader, field.name, field.sort_type, field.comparator, comparator)
      end
      return comparator
    end

    # Returns a comparator for sorting hits according to the sort type and the
    # comparator function passed.
    #
    # reader::  Index to use.
    # field::   Lets us know which field to search and how to parse it.
    # returns:: Comparator for sorting hits.
    def comparator_simple(reader, field)
      index = FieldCache.get_index(reader, field.name, field.sort_type)
      comproc = field.comparator
      if (comproc)
        return SpecialFieldComparator.new(index, field.sort_type, comproc)
      else
        return SimpleFieldComparator.new(index, field.sort_type)
      end
    end

    # Returns a comparator for sorting hits according to a field containing
    # strings.
    #
    # reader::  Index to use.
    # field::   Name of the field containing string values.
    # returns:: Comparator for sorting hits.
    def comparator_string(reader, field)
      index = FieldCache.get_string_index(reader, field)
      return StringFieldComparator.new(index)
    end

    # Returns a comparator for sorting hits according to values in the given field.
    # The terms in the field are looked at to determine whether they contain integers,
    # floats or strings. Once the type is determined, the matching comparator
    # is built.
    #
    # reader::  Index to use.
    # field::   Name of the field containing values.
    # returns:: Comparator for sorting hits.
    # raises::  RuntimeError if the cached values are neither a StringIndex
    #           nor an array containing an Integer or Float.
    def comparator_auto(reader, field)
      index = FieldCache.get_auto_index(reader, field)
      if (index.is_a?(FieldCache::StringIndex))
        return StringFieldComparator.new(index)
      end

      # Documents without a term in the field leave nil slots in the array, so
      # judge the type from the first value that is actually present rather
      # than from document 0 alone.
      sample = index.find { |value| !value.nil? }
      if (sample.is_a?(Integer))
        return SimpleFieldComparator.new(index, SortField::SortType::INT)
      elsif (sample.is_a?(Float))
        return SimpleFieldComparator.new(index, SortField::SortType::FLOAT)
      else
        raise "unknown data type in field '#{field}'. Data = #{sample}"
      end
    end
  end
end