ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
data/lib/ferret/search/req_excl_scorer.rb
@@ -0,0 +1,125 @@
+ module Ferret::Search
+   # A Scorer for queries with a required subscorer and an excluding (prohibited)
+   # subscorer.
+   #
+   # This +Scorer+ implements Scorer#skip_to(int), and it uses the skip_to() on
+   # the given scorers.
+   class ReqExclScorer < Scorer
+     # Construct a +ReqExclScorer+.
+     # req_scorer:: The scorer that must match, except where
+     # excl_scorer:: indicates exclusion.
+     def initialize(req_scorer, excl_scorer)
+       super(nil) # No similarity used.
+       @req_scorer = req_scorer
+       @excl_scorer = excl_scorer
+
+       @first_time = true
+     end
+
+
+     def next?
+       if @first_time
+         if not @excl_scorer.next?
+           @excl_scorer = nil # exhausted at start
+         end
+         @first_time = false
+       end
+       if @req_scorer == nil
+         return false
+       end
+       if not @req_scorer.next?
+         @req_scorer = nil; # exhausted, nothing left
+         return false
+       end
+       if @excl_scorer == nil
+         return true # @req_scorer.next? already returned true
+       end
+       return to_non_excluded()
+     end
+
+     # Advance to non excluded doc.
+     # On entry:
+     #
+     # * @req_scorer != nil
+     # * @excl_scorer != nil
+     # * @req_scorer was advanced once via next? or skip_to() and
+     #   @req_scorer.doc() may still be excluded.
+     #
+     # Advances @req_scorer a non excluded required doc, if any.
+     #
+     # returns:: true iff there is a non excluded required doc.
+     def to_non_excluded()
+       excl_doc = @excl_scorer.doc
+       begin
+         req_doc = @req_scorer.doc # may be excluded
+         if (req_doc < excl_doc)
+           return true # @req_scorer advanced to before @excl_scorer, ie. not excluded
+         elsif (req_doc > excl_doc)
+           unless @excl_scorer.skip_to(req_doc)
+             @excl_scorer = nil # exhausted, no more exclusions
+             return true
+           end
+           excl_doc = @excl_scorer.doc
+           if excl_doc > req_doc
+             return true; # not excluded
+           end
+         end
+       end while @req_scorer.next?
+       @req_scorer = nil; # exhausted, nothing left
+       return false
+     end
+
+     # @req_scorer may be nil when next? or skip_to() already return false so
+     # only call when you know that a doc exists
+     def doc()
+       return @req_scorer.doc
+     end
+
+     # Returns the score of the current document matching the query.
+     #
+     # Initially invalid, until #next? is called the first time.
+     #
+     # returns:: The score of the required scorer.
+     def score()
+       return @req_scorer.score()
+     end
+
+     # Skips to the first match beyond the current whose document number is
+     # greater than or equal to a given target.
+     #
+     # When this method is used the #explain(int) method should not be used.
+     #
+     # target:: The target document number.
+     # returns:: true iff there is such a match.
+     def skip_to(target)
+       if (@first_time)
+         @first_time = false
+         if (! @excl_scorer.skip_to(target))
+           @excl_scorer = nil; # exhausted
+         end
+       end
+       if (@req_scorer == nil)
+         return false
+       end
+       if (@excl_scorer == nil)
+         return @req_scorer.skip_to(target)
+       end
+       if (! @req_scorer.skip_to(target))
+         @req_scorer = nil
+         return false
+       end
+       return to_non_excluded()
+     end
+
+     def explain(doc)
+       e = Explanation.new()
+       if @excl_scorer.skip_to(doc) and @excl_scorer.doc == doc
+         e.description = "excluded"
+       else
+         e.description = "not excluded"
+         e.details << @req_scorer.explain(doc)
+       end
+       return e
+     end
+   end
+ end
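
Editor's note: to make the required/excluded interleaving above concrete, here is a minimal sketch of how a ReqExclScorer is expected to behave. The ArrayScorer stub and the document-id lists are invented for illustration and are not part of the gem; the stub's skip_to simply follows the "behaves as if written" contract documented in scorer.rb below.

    require 'ferret'

    # Hypothetical stand-in for a real Scorer: walks a fixed list of doc ids.
    class ArrayScorer < Ferret::Search::Scorer
      def initialize(docs)
        super(nil)                    # no Similarity needed for this sketch
        @docs = docs
        @pos = -1
      end
      def next?()  (@pos += 1) < @docs.size end
      def doc()    @docs[@pos] end
      def score()  1.0 end
      def skip_to(target)             # same contract as Scorer#skip_to
        begin
          return false if not next?()
        end while (target > doc())
        return true
      end
    end

    req  = ArrayScorer.new([1, 3, 5, 7])  # docs matching the required clause
    excl = ArrayScorer.new([3, 7])        # docs matching the prohibited clause
    scorer = Ferret::Search::ReqExclScorer.new(req, excl)
    docs = []
    docs << scorer.doc while scorer.next?
    docs  # => [1, 5] -- 3 and 7 are excluded
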
data/lib/ferret/search/req_opt_sum_scorer.rb
@@ -0,0 +1,70 @@
+ module Ferret::Search
+   # A Scorer for queries with a required part and an optional part.
+   # Delays skip_to() on the optional part until a score() is needed.
+   #
+   # This +Scorer+ implements Scorer#skip_to(int).
+   class ReqOptSumScorer < Scorer
+     # The scorers passed from the constructor.
+     # These are set to nil as soon as their next? or skip_to() returns false.
+     #
+     # Construct a +ReqOptScorer+.
+     # req_scorer:: The required scorer. This must match.
+     # opt_scorer:: The optional scorer. This is used for scoring only.
+     def initialize(req_scorer, opt_scorer)
+       super(nil) # No similarity used.
+       @req_scorer = req_scorer
+       @opt_scorer = opt_scorer
+
+       @first_time_opt_scorer = true
+     end
+
+
+     def next?
+       return @req_scorer.next?
+     end
+
+     def skip_to(target)
+       return @req_scorer.skip_to(target)
+     end
+
+     def doc()
+       return @req_scorer.doc()
+     end
+
+     # Returns the score of the current document matching the query.
+     # Initially invalid, until #next? is called the first time.
+     #
+     # returns:: The score of the required scorer, eventually increased by the
+     #           score of the optional scorer when it also matches the current
+     #           document.
+     def score()
+       cur_doc = @req_scorer.doc
+       req_score = @req_scorer.score
+       if @first_time_opt_scorer
+         @first_time_opt_scorer = false
+         if not @opt_scorer.skip_to(cur_doc)
+           @opt_scorer = nil
+           return req_score
+         end
+       elsif @opt_scorer.nil?
+         return req_score
+       elsif @opt_scorer.doc < cur_doc and not @opt_scorer.skip_to(cur_doc)
+         @opt_scorer = nil
+         return req_score
+       end
+       # assert (@opt_scorer != nil) and (@opt_scorer.doc() >= cur_doc)
+       return (@opt_scorer.doc == cur_doc) ? req_score + @opt_scorer.score() : req_score
+     end
+
+     # Explain the score of a document.
+     # @todo Also show the total score.
+     # See BooleanScorer.explain() on how to do this.
+     def explain(doc)
+       e = Explanation.new()
+       e.description = "required, optional"
+       e.details << @req_scorer.explain(doc)
+       e.details << @opt_scorer.explain(doc)
+       return e
+     end
+   end
+ end
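
Editor's note: continuing the same hypothetical setup, ReqOptSumScorer iterates only the required docs and adds the optional score where the two line up. The doc-id lists are again invented, and the snippet reuses the ArrayScorer stub from the previous sketch.

    # Assumes the hypothetical ArrayScorer defined in the previous sketch.
    req = ArrayScorer.new([2, 4, 6])  # must match
    opt = ArrayScorer.new([4])        # only boosts the score where it matches
    scorer = Ferret::Search::ReqOptSumScorer.new(req, opt)
    scores = {}
    scores[scorer.doc] = scorer.score while scorer.next?
    scores  # => {2=>1.0, 4=>2.0, 6=>1.0}
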
data/lib/ferret/search/score_doc.rb
@@ -0,0 +1,38 @@
+ module Ferret::Search
+   # Expert: Returned by low-level search implementations.
+   # See TopDocs
+   class ScoreDoc
+     include Comparable
+     # Expert: The score of this document for the query.
+     attr_accessor :score
+
+     # Expert: A hit document's number.
+     attr_accessor :doc
+
+     # Expert: Constructs a ScoreDoc.
+     def initialize(doc, score)
+       @doc = doc
+       @score = score
+     end
+
+     # returns a hash value for storage in a Hash
+     def hash()
+       return 100 * doc * score
+     end
+
+     # score_docA < score_docB if score_docA.score < score_docB.score or
+     # score_docA.doc > score_docB.doc
+     def <=>(other)
+       result = @score.<=>(other.score)
+       if (result == 0)
+         return other.doc.<=>(@doc)
+       else
+         return result
+       end
+     end
+
+     def to_s
+       "#{@doc} -> %0.2f" % @score
+     end
+   end
+ end
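
Editor's note: the <=> above orders hits by ascending score, breaking ties by descending document number, so sorting and reversing yields best-score-first with the lower doc id winning ties. A quick sketch (the scores and doc numbers are invented):

    require 'ferret'
    include Ferret::Search

    hits = [ScoreDoc.new(12, 0.5), ScoreDoc.new(3, 1.5), ScoreDoc.new(7, 1.5)]
    hits.sort.reverse.map { |sd| sd.to_s }
    # => ["3 -> 1.50", "7 -> 1.50", "12 -> 0.50"]
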
data/lib/ferret/search/score_doc_comparator.rb
@@ -0,0 +1,114 @@
+ module Ferret::Search
+   # Expert: Compares two ScoreDoc objects for sorting.
+   class ScoreDocComparator
+
+     # Special comparator for sorting hits according to computed relevance (score).
+     RELEVANCE = ScoreDocComparator.new()
+     class <<RELEVANCE
+       def compare(i, j)
+         return -(i.score <=> j.score)
+       end
+       def sort_value(i)
+         return i.score
+       end
+       def sort_type()
+         return SortField::SortType::SCORE
+       end
+     end
+
+
+     # Special comparator for sorting hits according to index order (number).
+     INDEX_ORDER = ScoreDocComparator.new()
+     class <<INDEX_ORDER
+       def compare(i, j)
+         return i.doc <=> j.doc
+       end
+       def sort_value(i)
+         return i.doc
+       end
+       def sort_type()
+         return SortField::SortType::DOC
+       end
+     end
+
+
+     # Compares two ScoreDoc objects and returns a result indicating their
+     # sort order.
+     # i:: First ScoreDoc
+     # j:: Second ScoreDoc
+     # returns:: +-1+ if +i+ should come before +j+
+     #           +1+ if +i+ should come after +j+
+     #           +0+ if they are equal
+     def compare(i, j)
+       return NotImplementedError
+     end
+
+
+     # Returns the value used to sort the given document. The object returned
+     # must implement the java.io.Serializable interface. This is used by
+     # multisearchers to determine how to collate results from their searchers.
+     #
+     # See FieldDoc
+     # i:: Document
+     # returns:: Serializable object
+     def sort_value(i)
+       return NotImplementedError
+     end
+
+
+     # Returns the type of sort. Should return +SortField.SCORE+,
+     # +SortField.DOC+, +SortField.STRING+, +SortField.INTEGER+,
+     # +SortField.FLOAT+ or +SortField.CUSTOM+. It is not valid to return
+     # +SortField.AUTO+.
+     # This is used by multisearchers to determine how to collate results from
+     # their searchers. returns:: One of the constants in SortField.
+     # See SortField
+     def sort_type()
+       return NotImplementedError
+     end
+   end
+
+   class SimpleFieldComparator < ScoreDocComparator
+     def initialize(index, sort_type)
+       @index = index
+       @sort_type = sort_type
+     end
+
+     def compare(j, i)
+       return @index[i.doc] <=> @index[j.doc]
+     end
+     def sort_value(i)
+       return @index[i.doc]
+     end
+     def sort_type()
+       return @sort_type
+     end
+   end
+
+   class SpecialFieldComparator < SimpleFieldComparator
+     def initialize(index, sort_type, comparator)
+       super(index, sort_type)
+       @comparator = comparator
+     end
+     def compare(j, i)
+       return @comparator.call(@index[i.doc], @index[j.doc])
+     end
+   end
+
+   class StringFieldComparator < ScoreDocComparator
+     def initialize(index)
+       @str_index = index.str_index
+       @str_map = index.str_map
+     end
+
+     def compare(i, j)
+       return @str_index[i.doc] <=> @str_index[j.doc]
+     end
+     def sort_value(i)
+       return @str_map[@str_index[i.doc]]
+     end
+     def sort_type()
+       return SortField::SortType::STRING
+     end
+   end
+ end
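
Editor's note: a short sketch of the two built-in comparators above (the ScoreDoc values are invented). RELEVANCE orders by descending score, INDEX_ORDER by ascending document number:

    require 'ferret'
    include Ferret::Search

    a = ScoreDoc.new(10, 0.9)
    b = ScoreDoc.new(2, 0.4)

    ScoreDocComparator::RELEVANCE.compare(a, b)    # => -1  (a's higher score sorts it first)
    ScoreDocComparator::INDEX_ORDER.compare(a, b)  # =>  1  (b's lower doc number sorts it first)
    ScoreDocComparator::RELEVANCE.sort_type        # => SortField::SortType::SCORE
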
data/lib/ferret/search/scorer.rb
@@ -0,0 +1,91 @@
+ module Ferret::Search
+   # Expert: Common scoring functionality for different types of queries.
+   #
+   # A +Scorer+ either iterates over documents matching a query, or provides an
+   # explanation of the score for a query for a given document.
+   #
+   # Document scores are computed using a given +Similarity+ implementation.
+   class Scorer
+     attr_reader :similarity
+     MAX_DOCS = 0x7FFFFFFF
+
+     # Constructs a Scorer.
+     # similarity:: The +Similarity+ implementation used by this scorer.
+     def initialize(similarity)
+       @similarity = similarity
+     end
+
+     # Expert: Iterates over matching all documents, yielding the document
+     # number and the score.
+     #
+     # returns:: true if more matching documents may remain.
+     def each_hit() # :yields: doc, score
+       while next?
+         yield(doc(), score())
+       end
+     end
+
+     # Expert: Iterates over matching documents in a range.
+     #
+     # max:: Do not score documents past this. Default will search all documents
+     #       avaliable.
+     # returns:: true if more matching documents may remain.
+     def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
+       while (next? and doc() < max)
+         yield(doc(), score())
+       end
+       return doc() < max
+     end
+
+     # Advances to the next document matching the query.
+     # returns:: true iff there is another document matching the query.
+     # When this method is used the #explain(int) method should not be used.
+     def next?()
+       raise NotImplementedError
+     end
+
+     # Returns the current document number matching the query.
+     # Initially invalid, until #next?() is called the first time.
+     def doc()
+       raise NotImplementedError
+     end
+
+     # Returns the score for the current document matching the query.
+     # Initially invalid, until #next?() is called the first time.
+     def score()
+       raise NotImplementedError
+     end
+
+     # Skips to the first match beyond the current whose document number is
+     # greater than or equal to a given target.
+     #
+     # When this method is used the #explain(int) method should not be used.
+     #
+     # target:: The target document number.
+     # returns:: true iff there is such a match.
+     #
+     # Behaves as if written:
+     #
+     #   def skip_to(target)
+     #     begin
+     #       return false if not next?()
+     #     end while (target > doc())
+     #     return true
+     #   end
+     #
+     # Most implementations are considerably more efficient than that.
+     def skip_to(target)
+       raise NotImplementedError
+     end
+
+     # Returns an explanation of the score for a document.
+     #
+     # When this method is used, the #next?(), #skip_to(int) and
+     # #score(HitCollector) methods should not be used.
+     #
+     # doc:: The document number for the explanation.
+     def explain(doc)
+       raise NotImplementedError
+     end
+   end
+ end
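
Editor's note: any concrete subclass that implements next?, doc and score gets each_hit for free. For instance, with the hypothetical ArrayScorer stub sketched earlier:

    # Assumes the hypothetical ArrayScorer defined in an earlier sketch.
    s = ArrayScorer.new([0, 4, 9])
    s.each_hit { |doc, score| puts "doc #{doc}: #{score}" }
    # doc 0: 1.0
    # doc 4: 1.0
    # doc 9: 1.0
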
data/lib/ferret/search/similarity.rb
@@ -0,0 +1,278 @@
+ module Ferret::Search
+   # Expert: Scoring API.
+   # Subclasses implement search scoring.
+   #
+   # The score of query *q* for document *d* is defined
+   # in terms of these methods as follows:
+   #
+   # <table cellpadding="0" cellspacing="0" border="0">
+   # <tr>
+   #   <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td>
+   #   <td valign="middle" align="center">
+   #   <big><big><big><big><big>&Sigma</big></big></big></big></big></td>
+   #   <td valign="middle"><small>
+   #   #tf(int) tf(t in d)#
+   #   #idf_term(Term,Searcher) idf(t)#
+   #   Field#getBoost getBoost(t.field in d)#
+   #   #length_norm(String,int) length_norm(t.field in d)
+   #   </small></td>
+   #   <td valign="middle" rowspan="2">&nbsp*
+   #   #coord(int,int) coord(q,d)#
+   #   #query_norm(float) query_norm(q)
+   #   </td>
+   # </tr>
+   # <tr>
+   #   <td valign="top" align="right">
+   #   <small>t in q</small>
+   #   </td>
+   # </tr>
+   # </table>
+   #
+   # See #set_default
+   # See IndexWriter#set_similarity
+   # See Searcher#set_similarity
+   class Similarity
+
+     def Similarity.byte_to_float(b)
+       if (b == 0)
+         return 0.0
+       end
+       mantissa = b & 0x07 # 0x07 = 7 = 0b00000111
+       exponent = (b >> 3) & 0x1F # 0x1f = 31 = 0b00011111
+       return [0,0,(mantissa << 5),(exponent+48)].pack("cccc").unpack("f")[0]
+     end
+
+     def Similarity.float_to_byte(f)
+       if (f <= 0.0) then return 0 end
+
+       bits = [f].pack("f").unpack("cccc")
+       mantissa = (bits[2] & 0xEf) >> 5
+       exponent = (bits[3] - 48)
+
+       if (exponent > 0x1f)
+         exponent = 0x1f # 0x1f = 31 = 0b00011111
+         mantissa = 0x07 # 0x07 = 7 = 0b00000111
+       end
+
+       if (exponent < 0)
+         exponent = 0
+         mantissa = 1
+       end
+
+       return ((exponent<<3) | mantissa)
+     end
+
+     # Cache of decoded bytes
+     NORM_TABLE = Array.new(256) { |i| Similarity.byte_to_float(i) }
+
+     # Decodes a normalization factor stored in an index.
+     # See Similarity#encode_norm(float)
+     def Similarity.decode_norm(b)
+       return NORM_TABLE[b & 0xFF]
+     end
+
+     # Decodes a normalization factor stored in an index.
+     # See Similarity#encode_norm(float)
+     def decode_norm(b)
+       return self.class.decode_norm(b)
+     end
+
+     # Computes the normalization value for a field given the total number of
+     # terms contained in a field. These values, together with field boosts, are
+     # stored in an index and multipled into scores for hits on each field by the
+     # search code.
+     #
+     # Matches in longer fields are less precise, so implemenations of this
+     # method usually return smaller values when *num_tokens* is large,
+     # and larger values when *num_tokens* is small.
+     #
+     # That these values are computed under
+     # IndexWriter#add_document and stored then using
+     # #encode_norm(float). Thus they have limited precision, and documents
+     # must be re-indexed if this method is altered.
+     #
+     # field:: the name of the field
+     # num_tokens:: the total number of tokens contained in fields named
+     #              _field_ of _doc_.
+     #
+     # See Field#set_boost
+     def length_norm
+       raise NotImplementedError
+     end
+
+     # Computes the normalization value for a query given the sum of the squared
+     # weights of each of the query terms. This value is then multipled into the
+     # weight of each query term.
+     #
+     # This does not affect ranking, but rather just attempts to make scores
+     # from different queries comparable.
+     #
+     # sum_of_squared_weights:: the sum of the squares of query term weights
+     # Return:: a normalization factor for query weights
+     def query_norm
+       raise NotImplementedError
+     end
+
+     # Encodes a normalization factor for storage in an index.
+     #
+     # The encoding uses a five-bit exponent and three-bit mantissa, thus
+     # representing values from around 7x10^9 to 2x10^-9 with about one
+     # significant decimal digit of accuracy. Zero is also represented.
+     # Negative numbers are rounded up to zero. Values too large to represent
+     # are rounded down to the largest representable value. Positive values too
+     # small to represent are rounded up to the smallest positive representable
+     # value.
+     #
+     # See Field#boost=
+     def Similarity.encode_norm(f)
+       return Similarity.float_to_byte(f)
+     end
+
+     def encode_norm(f)
+       return self.class.float_to_byte(f)
+     end
+
+     # Computes a score factor based on a term or phrase's frequency in a
+     # document. This value is multiplied by the #idf_term(Term, Searcher)
+     # factor for each term in the query and these products are then summed to
+     # form the initial score for a document.
+     #
+     # Terms and phrases repeated in a document indicate the topic of the
+     # document, so implementations of this method usually return larger values
+     # when _freq_ is large, and smaller values when _freq_
+     # is small.
+     #
+     # The default implementation calls #tf(float)
+     #
+     # freq:: the frequency of a term within a document
+     # Return:: a score factor based on a term's within-document frequency
+     def tf
+       raise NotImplementedError
+     end
+
+     # Computes the amount of a sloppy phrase match, based on an edit distance.
+     # This value is summed for each sloppy phrase match in a document to form
+     # the frequency that is passed to #tf(float).
+     #
+     # A phrase match with a small edit distance to a document passage more
+     # closely matches the document, so implementations of this method usually
+     # return larger values when the edit distance is small and smaller values
+     # when it is large.
+     #
+     # See PhraseQuery#slop(int)
+     # distance:: the edit distance of this sloppy phrase match
+     # Return:: the frequency increment for this match
+     def sloppy_freq
+       raise NotImplementedError
+     end
+
+     # Computes a score factor for a simple term.
+     #
+     # The default implementation is:
+     #   return idf(searcher.doc_freq(term), searcher.max_doc())
+     #
+     # Note that Searcher#max_doc() is used instead of
+     # IndexReader#num_docs() because it is proportional to
+     # Searcher#doc_freq(Term) , i.e., when one is inaccurate,
+     # so is the other, and in the same direction.
+     #
+     # term:: the term in question
+     # searcher:: the document collection being searched
+     # Return:: a score factor for the term
+     def idf_term(term, searcher)
+       return idf(searcher.doc_freq(term), searcher.max_doc())
+     end
+
+     # Computes a score factor for a phrase.
+     #
+     # The default implementation sums the #idf(Term,Searcher) factor
+     # for each term in the phrase.
+     #
+     # terms:: the terms in the phrase
+     # searcher:: the document collection being searched
+     # Return:: a score factor for the phrase
+     def idf_phrase(terms, searcher)
+       idf = 0.0
+       terms.each { |term| idf += idf_term(term, searcher) }
+       return idf
+     end
+
+     # Computes a score factor based on a term's document frequency (the number
+     # of documents which contain the term). This value is multiplied by the
+     # #tf(int) factor for each term in the query and these products are
+     # then summed to form the initial score for a document.
+     #
+     # Terms that occur in fewer documents are better indicators of topic, so
+     # implemenations of this method usually return larger values for rare terms,
+     # and smaller values for common terms.
+     #
+     # doc_freq:: the number of documents which contain the term
+     # num_docs:: the total number of documents in the collection
+     # Return:: a score factor based on the term's document frequency
+     def idf
+       raise NotImplementedError
+     end
+
+     # Computes a score factor based on the fraction of all query terms that a
+     # document contains. This value is multiplied into scores.
+     #
+     # The presence of a large portion of the query terms indicates a better
+     # match with the query, so implemenations of this method usually return
+     # larger values when the ratio between these parameters is large and smaller
+     # values when the ratio between them is small.
+     #
+     # overlap:: the number of query terms matched in the document
+     # max_overlap:: the total number of terms in the query
+     # Return:: a score factor based on term overlap with the query
+     def coord
+       raise NotImplementedError
+     end
+   end
+
+   # Expert: Default scoring implementation.
+   class DefaultSimilarity < Similarity
+     # See source
+     def length_norm(field, num_terms)
+       return 1.0 / Math.sqrt(num_terms)
+     end
+
+     # See source
+     def query_norm(sum_of_squared_weights)
+       return 1.0 / Math.sqrt(sum_of_squared_weights)
+     end
+
+     # See source
+     def tf(freq)
+       return Math.sqrt(freq)
+     end
+
+     # See source
+     def sloppy_freq(distance)
+       return 1.0 / (distance + 1)
+     end
+
+     # See source
+     def idf(doc_freq, num_docs)
+       return 0.0 if num_docs == 0
+       return Math.log(num_docs.to_f/(doc_freq+1)) + 1.0
+     end
+
+     # See source
+     def coord(overlap, max_overlap)
+       return overlap.to_f / max_overlap
+     end
+   end
+
+   class Similarity
+     # The Similarity implementation used by default.
+     @@default = DefaultSimilarity.new()
+
+     def Similarity.default
+       return @@default
+     end
+
+     def Similarity.default=(default)
+       @@default = default
+     end
+   end
+ end
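
Editor's note: the RDoc table above encodes the usual Lucene-style formula, score(q,d) = coord(q,d) * query_norm(q) * sum over terms t in q of tf(t in d) * idf(t) * boost(t.field in d) * length_norm(t.field in d). A rough sketch of the individual factors using the DefaultSimilarity defined above; the numbers are invented, and real scoring is driven by the Weight and Scorer classes rather than called directly like this.

    require 'ferret'
    include Ferret::Search

    sim = Similarity.default          # a DefaultSimilarity instance

    sim.tf(4)                         # => 2.0    sqrt(term frequency)
    sim.idf(10, 1000)                 # => ~5.51  log(1000 / (10 + 1)) + 1
    sim.length_norm("body", 25)       # => 0.2    1 / sqrt(number of terms in the field)
    sim.coord(2, 3)                   # => ~0.667 2 of 3 query terms matched
    sim.query_norm(0.25)              # => 2.0    1 / sqrt(sum of squared weights)

    # Norms are squeezed into one byte per document and field, so round-tripping
    # loses precision:
    Similarity.decode_norm(Similarity.encode_norm(0.2))  # => 0.1875
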