ferret 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,125 @@
1
+ module Ferret::Search
2
+ # A Scorer for queries with a required subscorer and an excluding (prohibited)
3
+ # subscorer.
4
+ #
5
+ # This +Scorer+ implements Scorer#skip_to(int), and it uses the skip_to() on
6
+ # the given scorers.
7
class ReqExclScorer < Scorer
  # Builds a scorer that reports the documents of +req_scorer+ that are not
  # also reported by +excl_scorer+.
  #
  # req_scorer::  the scorer whose documents are candidates for matching.
  # excl_scorer:: the scorer whose documents are removed from the result.
  def initialize(req_scorer, excl_scorer)
    super(nil) # No similarity used.
    @req_scorer, @excl_scorer = req_scorer, excl_scorer
    @first_time = true
  end

  # Moves to the next required document that is not excluded.
  #
  # On the very first call the excluding scorer is advanced once; if it has
  # no documents at all it is dropped (set to nil) so later calls skip the
  # exclusion check entirely.
  #
  # returns:: true iff such a document exists. Once the required scorer is
  #           exhausted it is set to nil and every later call returns false.
  def next?
    if @first_time
      @excl_scorer = nil unless @excl_scorer.next? # exhausted at start
      @first_time = false
    end

    return false if @req_scorer.nil?

    unless @req_scorer.next?
      @req_scorer = nil # exhausted, nothing left
      return false
    end

    # With no exclusions left, whatever @req_scorer found is a match.
    return true if @excl_scorer.nil?

    to_non_excluded()
  end

  # Advances to a required document that is not excluded.
  #
  # Preconditions:
  #
  # * @req_scorer is not nil
  # * @excl_scorer is not nil
  # * @req_scorer was advanced once via next? or skip_to() and its current
  #   document may still be excluded.
  #
  # returns:: true iff a non excluded required document was found. When the
  #           required scorer runs out it is set to nil and false is
  #           returned.
  def to_non_excluded()
    excl_doc = @excl_scorer.doc
    while true
      req_doc = @req_scorer.doc # may be excluded

      # The required scorer sits before the excluding one: not excluded.
      return true if req_doc < excl_doc

      if req_doc > excl_doc
        unless @excl_scorer.skip_to(req_doc)
          @excl_scorer = nil # exhausted, no more exclusions
          return true
        end
        excl_doc = @excl_scorer.doc
        return true if excl_doc > req_doc # not excluded
      end

      # Here req_doc == excl_doc, so the document is excluded; try the next.
      break unless @req_scorer.next?
    end

    @req_scorer = nil # exhausted, nothing left
    false
  end

  # Current document number of the required scorer.
  #
  # @req_scorer is nil once next? or skip_to() has returned false, so only
  # call this when a current document is known to exist.
  def doc()
    @req_scorer.doc
  end

  # Score of the current document, taken from the required scorer.
  #
  # Only valid after #next? (or #skip_to) has positioned the scorer.
  def score()
    @req_scorer.score()
  end

  # Skips to the first non excluded match whose document number is greater
  # than or equal to +target+.
  #
  # When this method is used the #explain method should not be used.
  #
  # target::  the target document number.
  # returns:: true iff there is such a match.
  def skip_to(target)
    if @first_time
      @first_time = false
      @excl_scorer = nil unless @excl_scorer.skip_to(target) # exhausted
    end

    return false if @req_scorer.nil?

    # No exclusions left: delegate directly to the required scorer.
    return @req_scorer.skip_to(target) if @excl_scorer.nil?

    unless @req_scorer.skip_to(target)
      @req_scorer = nil
      return false
    end

    to_non_excluded()
  end

  # Builds an Explanation stating whether +doc+ is excluded. When it is not
  # excluded, the required scorer's explanation is attached as a detail.
  #
  # This repositions the excluding scorer via skip_to(), so it should not be
  # mixed with iteration through #next? / #skip_to.
  def explain(doc)
    explanation = Explanation.new()
    excluded = (@excl_scorer.skip_to(doc) and @excl_scorer.doc == doc)
    if excluded
      explanation.description = "excluded"
    else
      explanation.description = "not excluded"
      explanation.details << @req_scorer.explain(doc)
    end
    explanation
  end
end
125
+ end
@@ -0,0 +1,70 @@
1
+ module Ferret::Search
2
+ # A Scorer for queries with a required part and an optional part.
3
+ # Delays skip_to() on the optional part until a score() is needed.
4
+ #
5
+ # This +Scorer+ implements Scorer#skip_to(int).
6
class ReqOptSumScorer < Scorer
  # Builds a +ReqOptSumScorer+.
  #
  # req_scorer:: the required scorer. It alone decides which documents match.
  # opt_scorer:: the optional scorer. It is consulted for scoring only and is
  #              set to nil as soon as its skip_to() returns false.
  def initialize(req_scorer, opt_scorer)
    super(nil) # No similarity used.
    @req_scorer, @opt_scorer = req_scorer, opt_scorer
    @first_time_opt_scorer = true
  end

  # Advances the required scorer; the optional scorer is left untouched.
  def next?
    @req_scorer.next?
  end

  # Skips the required scorer to +target+; the optional scorer is left
  # untouched until #score is called.
  def skip_to(target)
    @req_scorer.skip_to(target)
  end

  # Current document number of the required scorer.
  def doc()
    @req_scorer.doc()
  end

  # Score of the current document.
  #
  # Only valid after #next? (or #skip_to) has positioned the required scorer.
  #
  # returns:: the score of the required scorer, plus the score of the
  #           optional scorer when the latter is positioned on the same
  #           document.
  def score()
    current = @req_scorer.doc
    base = @req_scorer.score

    if @first_time_opt_scorer
      # First use: position the optional scorer lazily.
      @first_time_opt_scorer = false
      positioned = @opt_scorer.skip_to(current)
    elsif @opt_scorer.nil?
      return base
    else
      # Only move the optional scorer when it lags behind the required one.
      behind = @opt_scorer.doc < current
      positioned = (!behind or @opt_scorer.skip_to(current))
    end

    unless positioned
      @opt_scorer = nil # exhausted, score with the required part only
      return base
    end

    # Here @opt_scorer is not nil and sits on or after +current+.
    if @opt_scorer.doc == current
      base + @opt_scorer.score()
    else
      base
    end
  end

  # Explains the score of a document by listing the explanations of the
  # required and the optional scorer as details.
  #
  # @todo Also show the total score.
  # See BooleanScorer.explain() on how to do this.
  def explain(doc)
    explanation = Explanation.new()
    explanation.description = "required, optional"
    explanation.details << @req_scorer.explain(doc)
    explanation.details << @opt_scorer.explain(doc)
    explanation
  end
end
70
+ end
@@ -0,0 +1,38 @@
1
+ module Ferret::Search
2
+ # Expert: Returned by low-level search implementations.
3
+ # See TopDocs
4
class ScoreDoc
  include Comparable

  # Expert: The score of this document for the query.
  attr_accessor :score

  # Expert: A hit document's number.
  attr_accessor :doc

  # Expert: Constructs a ScoreDoc.
  #
  # doc::   the document number of the hit.
  # score:: the score of that document for the query.
  def initialize(doc, score)
    @doc = doc
    @score = score
  end

  # Returns an Integer hash value for storage in a Hash.
  #
  # The value is derived from both the document number and the score, so
  # document 0 no longer hashes to the same value for every score, and the
  # result is always an Integer as Object#hash requires.
  def hash()
    return [@doc, @score].hash
  end

  # score_docA < score_docB if score_docA.score < score_docB.score, or if
  # the scores are equal and score_docA.doc > score_docB.doc.
  #
  # returns:: -1, 0 or 1, or nil when +other+ does not expose +score+ and
  #           +doc+ (so Comparable#== answers false instead of raising).
  def <=>(other)
    return nil unless other.respond_to?(:score) and other.respond_to?(:doc)

    result = @score.<=>(other.score)
    if (result == 0)
      # Equal scores: the lower document number ranks higher.
      return other.doc.<=>(@doc)
    else
      return result
    end
  end

  # Renders the hit as "<doc> -> <score>" with two decimals for the score.
  def to_s
    "#{@doc} -> %0.2f" % @score
  end
end
38
+ end
@@ -0,0 +1,114 @@
1
+ module Ferret::Search
2
+ # Expert: Compares two ScoreDoc objects for sorting.
3
class ScoreDocComparator

  # Special comparator for sorting hits according to computed relevance
  # (score). Higher scores sort first.
  RELEVANCE = ScoreDocComparator.new()
  class <<RELEVANCE
    def compare(i, j)
      return -(i.score <=> j.score)
    end
    def sort_value(i)
      return i.score
    end
    def sort_type()
      return SortField::SortType::SCORE
    end
  end


  # Special comparator for sorting hits according to index order (document
  # number). Lower document numbers sort first.
  INDEX_ORDER = ScoreDocComparator.new()
  class <<INDEX_ORDER
    def compare(i, j)
      return i.doc <=> j.doc
    end
    def sort_value(i)
      return i.doc
    end
    def sort_type()
      return SortField::SortType::DOC
    end
  end


  # Compares two ScoreDoc objects and returns a result indicating their
  # sort order. Abstract: concrete comparators must override this.
  #
  # i::       First ScoreDoc
  # j::       Second ScoreDoc
  # returns:: +-1+ if +i+ should come before +j+
  #           +1+ if +i+ should come after +j+
  #           +0+ if they are equal
  # raises::  NotImplementedError when not overridden.
  def compare(i, j)
    raise NotImplementedError
  end


  # Returns the value used to sort the given document. Abstract: concrete
  # comparators must override this.
  #
  # See FieldDoc
  # i::       Document
  # returns:: the sort value for +i+
  # raises::  NotImplementedError when not overridden.
  def sort_value(i)
    raise NotImplementedError
  end


  # Returns the type of sort, as one of the SortField::SortType constants
  # (the comparators in this file return SCORE, DOC and STRING). Abstract:
  # concrete comparators must override this.
  #
  # See SortField
  # raises:: NotImplementedError when not overridden.
  def sort_type()
    raise NotImplementedError
  end
end
70
+
71
# Comparator that orders hits by the value found for their document number
# in a lookup object indexed with +[]+.
class SimpleFieldComparator < ScoreDocComparator
  # The sort type handed to the constructor.
  attr_reader :sort_type

  # index::     object answering +[doc_number]+ with the sort value.
  # sort_type:: value reported by #sort_type.
  def initialize(index, sort_type)
    @index, @sort_type = index, sort_type
  end

  # Compares the indexed value of the second argument against the indexed
  # value of the first argument.
  #
  # NOTE(review): the operand order is the reverse of
  # StringFieldComparator#compare; presumably intentional, confirm against
  # the caller before changing it.
  def compare(j, i)
    @index[i.doc] <=> @index[j.doc]
  end

  # The indexed value for the document of +i+.
  def sort_value(i)
    @index[i.doc]
  end
end
87
+
88
# A SimpleFieldComparator whose ordering is delegated to a callable.
class SpecialFieldComparator < SimpleFieldComparator
  # index::      object answering +[doc_number]+ with the sort value.
  # sort_type::  value reported by #sort_type.
  # comparator:: object answering +call(a, b)+ with the comparison result.
  def initialize(index, sort_type, comparator)
    super(index, sort_type)
    @comparator = comparator
  end

  # Hands the indexed value of the second argument, then that of the first
  # argument, to the callable and returns its result.
  def compare(j, i)
    @comparator.call(@index[i.doc], @index[j.doc])
  end
end
97
+
98
# Comparator for string fields. Hits are ordered by the entry stored for
# their document in +str_index+; the reported sort value is looked up in
# +str_map+ through that entry.
class StringFieldComparator < ScoreDocComparator
  # index:: object exposing +str_index+ and +str_map+.
  def initialize(index)
    @str_index, @str_map = index.str_index, index.str_map
  end

  # Compares the +str_index+ entries of the two documents.
  def compare(i, j)
    @str_index[i.doc] <=> @str_index[j.doc]
  end

  # The +str_map+ entry addressed by the document's +str_index+ entry.
  def sort_value(i)
    position = @str_index[i.doc]
    @str_map[position]
  end

  # Always SortField::SortType::STRING.
  def sort_type()
    SortField::SortType::STRING
  end
end
114
+ end
@@ -0,0 +1,91 @@
1
+ module Ferret::Search
2
+ # Expert: Common scoring functionality for different types of queries.
3
+ #
4
+ # A +Scorer+ either iterates over documents matching a query, or provides an
5
+ # explanation of the score for a query for a given document.
6
+ #
7
+ # Document scores are computed using a given +Similarity+ implementation.
8
class Scorer
  attr_reader :similarity
  MAX_DOCS = 0x7FFFFFFF

  # Constructs a Scorer.
  # similarity:: The +Similarity+ implementation used by this scorer.
  def initialize(similarity)
    @similarity = similarity
  end

  # Expert: Iterates over all matching documents, yielding the document
  # number and the score of each one.
  #
  # Runs until #next? returns false; the return value carries no meaning.
  def each_hit() # :yields: doc, score
    while next?
      yield(doc(), score())
    end
  end

  # Expert: Iterates over matching documents in a range.
  #
  # max::     Do not score documents at or past this number. Default will
  #           search all documents available.
  # returns:: true if more matching documents may remain (iteration stopped
  #           because a document at or past +max+ was reached), false if
  #           the scorer is exhausted.
  #
  # #doc is never consulted after #next? has returned false, because the
  # current document is not valid on an exhausted scorer.
  def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
    while next?
      # Reached the limit: this document is not scored, more may follow.
      return true if doc() >= max
      yield(doc(), score())
    end
    return false
  end

  # Advances to the next document matching the query.
  # returns:: true iff there is another document matching the query.
  # When this method is used the #explain method should not be used.
  def next?()
    raise NotImplementedError
  end

  # Returns the current document number matching the query.
  # Initially invalid, until #next?() is called the first time.
  def doc()
    raise NotImplementedError
  end

  # Returns the score for the current document matching the query.
  # Initially invalid, until #next?() is called the first time.
  def score()
    raise NotImplementedError
  end

  # Skips to the first match beyond the current whose document number is
  # greater than or equal to a given target.
  #
  # When this method is used the #explain method should not be used.
  #
  # target::  The target document number.
  # returns:: true iff there is such a match.
  #
  # Behaves as if written:
  #
  #   def skip_to(target)
  #     begin
  #       return false if not next?()
  #     end while (target > doc())
  #     return true
  #   end
  #
  # Most implementations are considerably more efficient than that.
  def skip_to(target)
    raise NotImplementedError
  end

  # Returns an explanation of the score for a document.
  #
  # When this method is used, the #next?(), #skip_to() and #each_hit
  # methods should not be used.
  #
  # doc:: The document number for the explanation.
  def explain(doc)
    raise NotImplementedError
  end
end
91
+ end
@@ -0,0 +1,278 @@
1
+ module Ferret::Search
2
+ # Expert: Scoring API.
3
+ # Subclasses implement search scoring.
4
+ #
5
+ # The score of query *q* for document *d* is defined
6
+ # in terms of these methods as follows:
7
+ #
8
+ # <table cellpadding="0" cellspacing="0" border="0">
9
+ # <tr>
10
+ # <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
11
+ # <td valign="middle" align="center">
12
+ # <big><big><big><big><big>&Sigma;</big></big></big></big></big></td>
13
+ # <td valign="middle"><small>
14
+ # #tf(int) tf(t in d)#
15
+ # #idf_term(Term,Searcher) idf(t)#
16
+ # Field#getBoost getBoost(t.field in d)#
17
+ # #length_norm(String,int) length_norm(t.field in d)
18
+ # </small></td>
19
+ # <td valign="middle" rowspan="2">&nbsp;*
20
+ # #coord(int,int) coord(q,d)#
21
+ # #query_norm(float) query_norm(q)
22
+ # </td>
23
+ # </tr>
24
+ # <tr>
25
+ # <td valign="top" align="right">
26
+ # <small>t in q</small>
27
+ # </td>
28
+ # </tr>
29
+ # </table>
30
+ #
31
+ # See #set_default
32
+ # See IndexWriter#set_similarity
33
+ # See Searcher#set_similarity
34
class Similarity

  # Decodes a single-byte norm into a Float.
  #
  # The byte holds a three-bit mantissa in its low bits and a five-bit
  # exponent in its high bits. They are placed into the two most
  # significant bytes of a single-precision float, which is then read back
  # with an explicit little-endian directive so the result does not depend
  # on the byte order of the host.
  #
  # b::       the encoded byte (0..255).
  # returns:: the decoded value; 0 decodes to 0.0.
  def Similarity.byte_to_float(b)
    if (b == 0)
      return 0.0
    end
    mantissa = b & 0x07           # 0x07 =  7 = 0b00000111
    exponent = (b >> 3) & 0x1F    # 0x1f = 31 = 0b00011111
    return [0, 0, (mantissa << 5), (exponent + 48)].pack("C4").unpack("e")[0]
  end

  # Encodes a Float into a single byte (three-bit mantissa, five-bit
  # exponent).
  #
  # The float is serialised as a little-endian single-precision value and
  # the two most significant bytes are inspected, independent of the byte
  # order of the host.
  #
  # f::       the value to encode.
  # returns:: 0 for values <= 0.0, 255 for values too large to represent,
  #           1 for positive values too small to represent, otherwise the
  #           packed exponent/mantissa byte.
  def Similarity.float_to_byte(f)
    if (f <= 0.0) then return 0 end

    bytes = [f].pack("e").unpack("C4")
    mantissa = (bytes[2] & 0xE0) >> 5   # top three bits of the third byte
    exponent = (bytes[3] - 48)

    if (exponent > 0x1f)
      # Overflow: clamp to the largest representable value.
      exponent = 0x1f # 0x1f = 31 = 0b00011111
      mantissa = 0x07 # 0x07 =  7 = 0b00000111
    end

    if (exponent < 0)
      # Underflow: clamp to the smallest positive representable value.
      exponent = 0
      mantissa = 1
    end

    return ((exponent << 3) | mantissa)
  end

  # Cache of decoded bytes
  NORM_TABLE = Array.new(256) { |i| Similarity.byte_to_float(i) }

  # Decodes a normalization factor stored in an index. Only the low eight
  # bits of +b+ are used.
  # See Similarity.encode_norm
  def Similarity.decode_norm(b)
    return NORM_TABLE[b & 0xFF]
  end

  # Decodes a normalization factor stored in an index.
  # See Similarity.encode_norm
  def decode_norm(b)
    return self.class.decode_norm(b)
  end

  # Computes the normalization value for a field given the total number of
  # terms contained in a field. These values, together with field boosts, are
  # stored in an index and multiplied into scores for hits on each field by
  # the search code.
  #
  # Matches in longer fields are less precise, so implementations of this
  # method usually return smaller values when *num_tokens* is large,
  # and larger values when *num_tokens* is small.
  #
  # Note that these values are computed under IndexWriter#add_document and
  # then stored using #encode_norm. Thus they have limited precision, and
  # documents must be re-indexed if this method is altered.
  #
  # field::      the name of the field
  # num_tokens:: the total number of tokens contained in fields named
  #              _field_ of _doc_.
  #
  # Abstract: raises NotImplementedError unless overridden.
  #
  # See Field#set_boost
  def length_norm(field, num_tokens)
    raise NotImplementedError
  end

  # Computes the normalization value for a query given the sum of the squared
  # weights of each of the query terms. This value is then multiplied into
  # the weight of each query term.
  #
  # This does not affect ranking, but rather just attempts to make scores
  # from different queries comparable.
  #
  # sum_of_squared_weights:: the sum of the squares of query term weights
  # Return:: a normalization factor for query weights
  #
  # Abstract: raises NotImplementedError unless overridden.
  def query_norm(sum_of_squared_weights)
    raise NotImplementedError
  end

  # Encodes a normalization factor for storage in an index.
  #
  # The encoding uses a five-bit exponent and three-bit mantissa, thus
  # representing values from around 7x10^9 to 2x10^-9 with about one
  # significant decimal digit of accuracy. Zero is also represented.
  # Negative numbers are rounded up to zero. Values too large to represent
  # are rounded down to the largest representable value. Positive values too
  # small to represent are rounded up to the smallest positive representable
  # value.
  #
  # See Field#boost=
  def Similarity.encode_norm(f)
    return Similarity.float_to_byte(f)
  end

  # Instance-level counterpart of Similarity.encode_norm.
  def encode_norm(f)
    return self.class.float_to_byte(f)
  end

  # Computes a score factor based on a term or phrase's frequency in a
  # document. This value is multiplied by the #idf_term factor for each
  # term in the query and these products are then summed to form the
  # initial score for a document.
  #
  # Terms and phrases repeated in a document indicate the topic of the
  # document, so implementations of this method usually return larger values
  # when _freq_ is large, and smaller values when _freq_ is small.
  #
  # freq:: the frequency of a term within a document
  # Return:: a score factor based on a term's within-document frequency
  #
  # Abstract: raises NotImplementedError unless overridden.
  def tf(freq)
    raise NotImplementedError
  end

  # Computes the amount of a sloppy phrase match, based on an edit distance.
  # This value is summed for each sloppy phrase match in a document to form
  # the frequency that is passed to #tf.
  #
  # A phrase match with a small edit distance to a document passage more
  # closely matches the document, so implementations of this method usually
  # return larger values when the edit distance is small and smaller values
  # when it is large.
  #
  # See PhraseQuery#slop
  # distance:: the edit distance of this sloppy phrase match
  # Return:: the frequency increment for this match
  #
  # Abstract: raises NotImplementedError unless overridden.
  def sloppy_freq(distance)
    raise NotImplementedError
  end

  # Computes a score factor for a simple term.
  #
  # The default implementation is:
  #   return idf(searcher.doc_freq(term), searcher.max_doc())
  #
  # Note that Searcher#max_doc() is used instead of
  # IndexReader#num_docs() because it is proportional to
  # Searcher#doc_freq(Term), i.e., when one is inaccurate,
  # so is the other, and in the same direction.
  #
  # term:: the term in question
  # searcher:: the document collection being searched
  # Return:: a score factor for the term
  def idf_term(term, searcher)
    return idf(searcher.doc_freq(term), searcher.max_doc())
  end

  # Computes a score factor for a phrase.
  #
  # The default implementation sums the #idf_term factor for each term in
  # the phrase, starting from 0.0.
  #
  # terms:: the terms in the phrase
  # searcher:: the document collection being searched
  # Return:: a score factor for the phrase
  def idf_phrase(terms, searcher)
    return terms.inject(0.0) { |sum, term| sum + idf_term(term, searcher) }
  end

  # Computes a score factor based on a term's document frequency (the number
  # of documents which contain the term). This value is multiplied by the
  # #tf factor for each term in the query and these products are then summed
  # to form the initial score for a document.
  #
  # Terms that occur in fewer documents are better indicators of topic, so
  # implementations of this method usually return larger values for rare
  # terms, and smaller values for common terms.
  #
  # doc_freq:: the number of documents which contain the term
  # num_docs:: the total number of documents in the collection
  # Return:: a score factor based on the term's document frequency
  #
  # Abstract: raises NotImplementedError unless overridden.
  def idf(doc_freq, num_docs)
    raise NotImplementedError
  end

  # Computes a score factor based on the fraction of all query terms that a
  # document contains. This value is multiplied into scores.
  #
  # The presence of a large portion of the query terms indicates a better
  # match with the query, so implementations of this method usually return
  # larger values when the ratio between these parameters is large and
  # smaller values when the ratio between them is small.
  #
  # overlap:: the number of query terms matched in the document
  # max_overlap:: the total number of terms in the query
  # Return:: a score factor based on term overlap with the query
  #
  # Abstract: raises NotImplementedError unless overridden.
  def coord(overlap, max_overlap)
    raise NotImplementedError
  end
end
231
+
232
+ # Expert: Default scoring implementation.
233
class DefaultSimilarity < Similarity
  # 1 / sqrt(num_terms). The +field+ argument is not used.
  def length_norm(field, num_terms)
    root = Math.sqrt(num_terms)
    1.0 / root
  end

  # 1 / sqrt(sum_of_squared_weights).
  def query_norm(sum_of_squared_weights)
    root = Math.sqrt(sum_of_squared_weights)
    1.0 / root
  end

  # sqrt(freq).
  def tf(freq)
    Math.sqrt(freq)
  end

  # 1 / (distance + 1).
  def sloppy_freq(distance)
    1.0 / (distance + 1)
  end

  # log(num_docs / (doc_freq + 1)) + 1, or 0.0 when num_docs is zero.
  def idf(doc_freq, num_docs)
    if num_docs == 0
      0.0
    else
      ratio = num_docs.to_f / (doc_freq + 1)
      Math.log(ratio) + 1.0
    end
  end

  # overlap / max_overlap, as a Float.
  def coord(overlap, max_overlap)
    overlap.to_f / max_overlap
  end
end
265
+
266
class Similarity
  # The Similarity implementation used by default; starts out as a
  # DefaultSimilarity instance.
  @@default = DefaultSimilarity.new()

  # Reads the Similarity currently used by default.
  def self.default
    @@default
  end

  # Replaces the Similarity used by default.
  def self.default=(default)
    @@default = default
  end
end
278
+ end