ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,294 +0,0 @@
1
- module Ferret::Search
2
- # An alternative to BooleanScorer.
3
- #
4
- # Uses ConjunctionScorer, DisjunctionScorer, ReqOptScorer and ReqExclScorer.
5
- #
6
- # Implements skip_to(), and has no limitations on the numbers of added scorers.
7
- class BooleanScorer < Scorer
8
- attr_reader :required_scorers, :coordinator
9
-
10
- class Coordinator
11
- attr_accessor :max_coord, :nr_matchers
12
-
13
- def initialize(similarity)
14
- @max_coord = 0 # to be increased for each non prohibited scorer
15
- @coord_factors = nil
16
- @similarity = similarity
17
- end
18
-
19
-
20
- def init() # use after all scorers have been added.
21
- @coord_factors = Array.new(@max_coord + 1)
22
-
23
- (@max_coord+1).times do |i|
24
- @coord_factors[i] = @similarity.coord(i, @max_coord)
25
- end
26
- end
27
-
28
-
29
- def init_doc()
30
- @nr_matchers = 0
31
- end
32
-
33
- def coord_factor()
34
- return @coord_factors[@nr_matchers]
35
- end
36
- end
37
-
38
- # The scorer to which all scoring will be delegated,
39
- # except for computing and using the coordination factor.
40
-
41
- def initialize(similarity)
42
- super(similarity)
43
- @required_scorers = []
44
- @optional_scorers = []
45
- @prohibited_scorers = []
46
- @counting_sum_scorer = nil
47
- @coordinator = Coordinator.new(similarity)
48
- end
49
-
50
- def add_scorer(scorer, occur)
51
- unless occur == BooleanClause::Occur::MUST_NOT
52
- @coordinator.max_coord += 1
53
- end
54
-
55
- case occur
56
- when BooleanClause::Occur::MUST: @required_scorers << scorer
57
- when BooleanClause::Occur::SHOULD: @optional_scorers << scorer
58
- when BooleanClause::Occur::MUST_NOT: @prohibited_scorers << scorer
59
- end
60
- end
61
-
62
- # Initialize the match counting scorer that sums all the
63
- # scores.
64
- # When "counting" is used in a name it means counting the number
65
- # of matching scorers.<br>
66
- # When "sum" is used in a name it means score value summing
67
- # over the matching scorers
68
- def init_counting_sum_scorer()
69
- @coordinator.init()
70
- @counting_sum_scorer = make_counting_sum_scorer()
71
- end
72
-
73
- # Count a scorer as a single match.
74
- class SingleMatchScorer < Scorer
75
- def initialize(parent_scorer, scorer)
76
- super(scorer.similarity)
77
- @scorer = scorer
78
- @parent_scorer = parent_scorer
79
- end
80
- def score()
81
- @parent_scorer.coordinator.nr_matchers += 1
82
- return @scorer.score
83
- end
84
- def doc()
85
- return @scorer.doc
86
- end
87
- def next?
88
- return @scorer.next?
89
- end
90
- def skip_to(doc_num)
91
- return @scorer.skip_to(doc_num)
92
- end
93
- def explain(doc_num)
94
- return @scorer.explain(doc_num)
95
- end
96
- end
97
-
98
- class CountingDisjunctionSumScorer < DisjunctionSumScorer
99
- def initialize(parent_scorer, scorers)
100
- super(scorers)
101
- @parent_scorer = parent_scorer
102
- end
103
- def score
104
- @parent_scorer.coordinator.nr_matchers += @nr_matchers
105
- return super
106
- end
107
- end
108
-
109
- def counting_disjunction_sum_scorer(scorers)
110
- # each scorer from the list counted as a single matcher
111
-
112
- return CountingDisjunctionSumScorer.new(self, scorers)
113
- end
114
-
115
- class CountingConjunctionScorer < ConjunctionScorer
116
- def initialize(parent_scorer, similarity)
117
- super(similarity)
118
- @parent_scorer = parent_scorer
119
- @required_num_matchers = parent_scorer.required_scorers.size
120
- @last_scored_doc = -1
121
- end
122
- def score
123
- if (@parent_scorer.doc() > @last_scored_doc)
124
- @last_scored_doc = @parent_scorer.doc()
125
- @parent_scorer.coordinator.nr_matchers += @required_num_matchers
126
- end
127
-
128
- return super
129
- end
130
- end
131
-
132
- def counting_conjunction_sum_scorer(required_scorers)
133
- # each scorer from the list counted as a single matcher
134
-
135
- required_num_matchers = required_scorers.size
136
- ccs = CountingConjunctionScorer.new(self, Similarity.default)
137
- @required_scorers.each do |scorer|
138
- ccs << scorer
139
- end
140
- return ccs
141
- end
142
-
143
- # Returns the scorer to be used for match counting and score summing.
144
- # Uses required_scorers, optional_scorers and prohibited_scorers.
145
- def make_counting_sum_scorer()
146
- # each scorer counted as a single matcher
147
- if @required_scorers.size == 0
148
- if @optional_scorers.size == 0
149
- return NonMatchingScorer.new # only prohibited scorers
150
- elsif @optional_scorers.size == 1
151
- return make_counting_sum_scorer2( # the only optional scorer is required
152
- SingleMatchScorer.new(self, @optional_scorers[0]),
153
- []) # no optional scorers left
154
- else # more than 1 @optional_scorers, no required scorers
155
- return make_counting_sum_scorer2( # at least one optional scorer is required
156
- counting_disjunction_sum_scorer(@optional_scorers),
157
- []) # no optional scorers left
158
- end
159
- elsif @required_scorers.size == 1 # 1 required
160
- return make_counting_sum_scorer2(
161
- SingleMatchScorer.new(self, @required_scorers[0]),
162
- @optional_scorers)
163
- else # more required scorers
164
- return make_counting_sum_scorer2(
165
- counting_conjunction_sum_scorer(@required_scorers),
166
- @optional_scorers)
167
- end
168
- end
169
-
170
- # Returns the scorer to be used for match counting and score summing.
171
- # Uses the arguments and prohibited_scorers.
172
- # required_counting_sum_scorer:: A required scorer already built.
173
- # @optional_scorers:: A list of optional scorers, possibly empty.
174
- def make_counting_sum_scorer2(required_counting_sum_scorer, optional_scorers)
175
-
176
- if (optional_scorers.size == 0)
177
- if (@prohibited_scorers.size == 0)
178
- return required_counting_sum_scorer
179
- elsif (@prohibited_scorers.size == 1)
180
- return ReqExclScorer.new(required_counting_sum_scorer,
181
- @prohibited_scorers[0])
182
- else # no optional, more than 1 prohibited
183
- return ReqExclScorer.new(
184
- required_counting_sum_scorer,
185
- DisjunctionSumScorer.new(@prohibited_scorers))
186
- end
187
- elsif (optional_scorers.size == 1)
188
- return make_counting_sum_scorer3(
189
- required_counting_sum_scorer,
190
- SingleMatchScorer.new(self, optional_scorers[0]))
191
- else # more optional
192
- return make_counting_sum_scorer3(
193
- required_counting_sum_scorer,
194
- counting_disjunction_sum_scorer(optional_scorers))
195
- end
196
- end
197
-
198
- # Returns the scorer to be used for match counting and score summing.
199
- # Uses the arguments and prohibited_scorers.
200
- # required_counting_sum_scorer:: A required scorer already built.
201
- # optional_counting_sum_scorer:: An optional scorer already built.
202
- def make_counting_sum_scorer3(required_counting_sum_scorer,
203
- optional_counting_sum_scorer)
204
- if (@prohibited_scorers.size == 0) # no prohibited
205
- return ReqOptSumScorer.new(required_counting_sum_scorer,
206
- optional_counting_sum_scorer)
207
- elsif (@prohibited_scorers.size == 1) # 1 prohibited
208
- return ReqOptSumScorer.new(
209
- ReqExclScorer.new(required_counting_sum_scorer,
210
- @prohibited_scorers[0]),
211
- optional_counting_sum_scorer)
212
- else # more prohibited
213
- return ReqOptSumScorer.new(
214
- ReqExclScorer.new(required_counting_sum_scorer,
215
- DisjunctionSumScorer.new(@prohibited_scorers)),
216
- optional_counting_sum_scorer)
217
- end
218
- end
219
-
220
- # Expert: Iterates over all matching documents, yielding the document
221
- # number and the score.
222
- #
223
- # returns:: true if more matching documents may remain.
224
- def each_hit() # :yields: doc, score
225
- if @counting_sum_scorer.nil?
226
- init_counting_sum_scorer()
227
- end
228
- while @counting_sum_scorer.next?
229
- yield(@counting_sum_scorer.doc, score())
230
- end
231
- end
232
-
233
- # Expert: Iterates over matching documents in a range.
234
- #
235
- # NOTE: that #next? needs to be called first.
236
- #
237
- # max:: Do not score documents past this. Default will search all documents
238
- # available.
239
- # returns:: true if more matching documents may remain.
240
- def each_hit_up_to(max = MAX_DOCS) # :yields: doc, score
241
- # nil pointer exception when next? was not called before:
242
- doc_num = @counting_sum_scorer.doc()
243
- while (doc_num < max)
244
- yield(doc_num, score())
245
- if not @counting_sum_scorer.next?
246
- return false
247
- end
248
- doc_num = @counting_sum_scorer.doc()
249
- end
250
- return true
251
- end
252
-
253
- def doc()
254
- return @counting_sum_scorer.doc
255
- end
256
-
257
- def next?
258
- if (@counting_sum_scorer == nil)
259
- init_counting_sum_scorer()
260
- end
261
- return @counting_sum_scorer.next?
262
- end
263
-
264
- def score()
265
- @coordinator.init_doc()
266
- sum = @counting_sum_scorer.score()
267
- return sum * @coordinator.coord_factor()
268
- end
269
-
270
- # Skips to the first match beyond the current whose document number is
271
- # greater than or equal to a given target.
272
- #
273
- # When this method is used the #explain(int) method should not be used.
274
- #
275
- # target:: The target document number.
276
- # returns:: true iff there is such a match.
277
- def skip_to(target)
278
- if (@counting_sum_scorer == nil)
279
- init_counting_sum_scorer()
280
- end
281
- return @counting_sum_scorer.skip_to(target)
282
- end
283
-
284
- # TODO: Implement an explanation of the coordination factor.
285
- # doc:: The document number for the explanation.
286
- # raises:: NotImplementedError
287
- def explain(doc)
288
- raise NotImplementedError
289
- # How to explain the coordination factor?
290
- #init_counting_sum_scorer()
291
- #return @counting_sum_scorer.explain(doc); # misses coord factor.
292
- end
293
- end
294
- end
@@ -1,40 +0,0 @@
1
- module Ferret::Search
2
- require 'monitor'
3
-
4
- # Wraps another filter's result and caches it. The caching
5
- # behavior is like QueryFilter. The purpose is to allow
6
- # filters to simply filter, and then wrap with this class to add
7
- # caching, keeping the two concerns decoupled yet composable.
8
- class CachingWrapperFilter < Filter
9
- # filter:: Filter to cache results of
10
- def initialize(filter)
11
- @filter = filter
12
- @cache = nil
13
- end
14
-
15
- def bits(reader)
16
- if (@cache == nil)
17
- @cache = Ferret::Utils::WeakKeyHash.new
18
- end
19
-
20
- @cache.synchronize() do # check cache
21
- bits = @cache[reader]
22
- if bits
23
- return bits
24
- end
25
- end
26
-
27
- bits = @filter.bits(reader)
28
-
29
- @cache.synchronize() do # update cache
30
- @cache[reader] = bits
31
- end
32
-
33
- return bits
34
- end
35
-
36
- def to_s()
37
- return "CachingWrapperFilter(#{@filter})"
38
- end
39
- end
40
- end
@@ -1,99 +0,0 @@
1
- require 'set'
2
- module Ferret::Search
3
- # Scorer for conjunctions, sets of queries, all of which are required.
4
- class ConjunctionScorer < Scorer
5
-
6
- def initialize(similarity)
7
- super
8
- @scorers = []
9
- @first_time = true
10
- @more = true
11
- end
12
-
13
- def add(scorer)
14
- @scorers << scorer
15
- end
16
- alias :<< :add
17
-
18
- def first()
19
- return @scorers.first
20
- end
21
-
22
- def last()
23
- return @scorers.last
24
- end
25
-
26
- def doc()
27
- return first().doc()
28
- end
29
-
30
- def next?()
31
- if (@first_time)
32
- init(true)
33
- elsif (@more)
34
- @more = last().next? # trigger further scanning
35
- end
36
- return do_next()
37
- end
38
-
39
- def do_next()
40
- while @more and first().doc < last().doc # find doc w/ all clauses
41
- @more = first().skip_to(last().doc) # skip first upto last
42
- @scorers << @scorers.shift # move first to last
43
- end
44
- return @more # found a doc with all clauses
45
- end
46
-
47
- def skip_to(target)
48
- if(@first_time)
49
- init(false)
50
- end
51
-
52
- @scorers.each do |scorer|
53
- break if not @more
54
- @more = scorer.skip_to(target)
55
- end
56
-
57
- sort_scorers() if @more # resort the scorers
58
-
59
- return do_next()
60
- end
61
-
62
- # Sums the scores of all of the scorers for the current document.
63
- def score()
64
- score = 0.0 # sum scores
65
- @scorers.each do |scorer|
66
- score += scorer.score
67
- end
68
- score *= @coord
69
- return score
70
- end
71
-
72
- def init(init_scorers)
73
- # compute coord factor
74
- @coord = similarity().coord(@scorers.size(), @scorers.size())
75
-
76
- @more = @scorers.size() > 0
77
-
78
- if init_scorers
79
- # move each scorer to its first entry
80
- @scorers.each do |scorer|
81
- break if not @more
82
- @more = scorer.next?
83
- end
84
- sort_scorers() if @more
85
- end
86
-
87
- @first_time = false
88
- end
89
-
90
- def sort_scorers()
91
- # move @scorers to an array
92
- @scorers.sort! {|a,b| a.doc <=> b.doc }
93
- end
94
-
95
- def explain(doc)
96
- raise NotImplementedError
97
- end
98
- end
99
- end
@@ -1,205 +0,0 @@
1
- module Ferret::Search
2
- # A Scorer for OR like queries, counterpart of Lucene's +ConjunctionScorer+.
3
- # This Scorer implements Scorer#skip_to(int) and uses skip_to() on the given Scorers.
4
- class DisjunctionSumScorer < Scorer
5
- # the sub-scorers
6
- attr_accessor :sub_scorers
7
-
8
- # Construct a +DisjunctionScorer+.
9
- # sub_scorers:: A collection of at least two subscorers.
10
- #
11
- # minimum_nr_matchers:: The positive minimum number of subscorers that
12
- # should match to match this query.
13
- #
14
- # When +@minimum_nr_matchers+ is bigger than the number
15
- # of +sub_scorers+, no matches will be produced.
16
- #
17
- # When @minimum_nr_matchers equals the number of
18
- # sub_scorers, it is more efficient to use
19
- # +ConjunctionScorer+.
20
- def initialize(sub_scorers, minimum_nr_matchers = 1)
21
- super(nil)
22
-
23
- # The number of subscorers.
24
- @nr_scorers = sub_scorers.size
25
-
26
- # The document number of the current match.
27
- @current_doc = -1
28
- @curret_score = nil
29
- # The number of subscorers that provide the current match.
30
- @nr_matchers = -1
31
-
32
- if (minimum_nr_matchers <= 0)
33
- raise ArgumentError, "Minimum nr of matchers must be positive"
34
- end
35
- if (@nr_scorers <= 1)
36
- raise ArgumentError, "There must be at least 2 sub_scorers"
37
- end
38
-
39
- @minimum_nr_matchers = minimum_nr_matchers
40
- @sub_scorers = sub_scorers
41
-
42
- # The @scorer_queue contains all subscorers ordered by their current
43
- # doc, with the minimum at the top.
44
- #
45
- # The @scorer_queue is initialized the first time next? or skip_to() is
46
- # called.
47
- #
48
- # An exhausted scorer is immediately removed from the @scorer_queue.
49
- #
50
- # If less than the @minimum_nr_matchers scorers remain in the
51
- # @scorer_queue next? and skip_to() return false.
52
- #
53
- # After each call to next? or skip_to()
54
- # +currentSumScore+ is the total score of the current matching doc,
55
- # +@nr_matchers+ is the number of matching scorers,
56
- # and all scorers are after the matching doc, or are exhausted.
57
- @scorer_queue = nil
58
- end
59
-
60
- # Called the first time next? or skip_to() is called to
61
- # initialize +@scorer_queue+.
62
- def init_scorer_queue()
63
- @scorer_queue = ScorerQueue.new(@nr_scorers)
64
- @sub_scorers.each do |sub_scorer|
65
- if (sub_scorer.next?) # doc() method will be used in @scorer_queue.
66
- @scorer_queue.insert(sub_scorer)
67
- end
68
- end
69
- end
70
-
71
- # A +PriorityQueue+ that orders by Scorer#doc().
72
- class ScorerQueue < Ferret::Utils::PriorityQueue
73
- def less_than(scorer1, scorer2)
74
- return scorer1.doc < scorer2.doc
75
- end
76
- end
77
-
78
- def next?
79
- if (@scorer_queue == nil)
80
- init_scorer_queue()
81
- end
82
-
83
- if (@scorer_queue.size < @minimum_nr_matchers)
84
- return false
85
- else
86
- return advance_after_current()
87
- end
88
- end
89
-
90
-
91
- # Advance all subscorers after the current document determined by the
92
- # top of the +@scorer_queue+.
93
- # Repeat until at least the minimum number of subscorers match on the same
94
- # document and all subscorers are after that document or are exhausted.
95
- #
96
- # On entry the +@scorer_queue+ has at least +@minimum_nr_matchers+
97
- # available. At least the scorer with the minimum document number will be advanced.
98
- # returns:: true iff there is a match.
99
- #
100
- # In case there is a match, +@current_doc+, +currentSumScore+,
101
- # and +@nr_matchers+ describe the match.
102
- #
103
- # TODO Investigate whether it is possible to use skip_to() when
104
- # the minimum number of matchers is bigger than one, ie. begin and use the
105
- # character of ConjunctionScorer for the minimum number of matchers.
106
- def advance_after_current()
107
- begin # repeat until minimum nr of matchers
108
- top = @scorer_queue.top
109
- @current_doc = top.doc
110
- @current_score = top.score
111
- @nr_matchers = 1
112
- begin # Until all subscorers are after @current_doc
113
- if top.next?
114
- @scorer_queue.adjust_top()
115
- else
116
- @scorer_queue.pop()
117
- if (@scorer_queue.size < (@minimum_nr_matchers - @nr_matchers))
118
- # Not enough subscorers left for a match on this document,
119
- # and also no more chance of any further match.
120
- return false
121
- end
122
- if (@scorer_queue.size == 0)
123
- break # nothing more to advance, check for last match.
124
- end
125
- end
126
- top = @scorer_queue.top
127
- if top.doc != @current_doc
128
- break # All remaining subscorers are after @current_doc.
129
- else
130
- @current_score += top.score
131
- @nr_matchers += 1
132
- end
133
- end while (true)
134
-
135
- if (@nr_matchers >= @minimum_nr_matchers)
136
- return true
137
- elsif (@scorer_queue.size < @minimum_nr_matchers)
138
- return false
139
- end
140
- end while (true)
141
- end
142
-
143
- # Returns the score of the current document matching the query.
144
- # Initially invalid, until #next? is called the first time.
145
- def score()
146
- return @current_score
147
- end
148
-
149
- # Returns the document number of the current document matching the query.
150
- # Initially invalid, until #next? is called the first time.
151
- def doc()
152
- return @current_doc
153
- end
154
-
155
- # Returns the number of subscorers matching the current document.
156
- # Initially invalid, until #next? is called the first time.
157
- def number_of_matchers()
158
- return @nr_matchers
159
- end
160
-
161
- # Skips to the first match beyond the current whose document number is
162
- # greater than or equal to a given target.
163
- #
164
- # When this method is used the #explain(int) method should not be used.
165
- #
166
- # The implementation uses the skip_to() method on the subscorers.
167
- # target:: The target document number.
168
- # returns:: true iff there is such a match.
169
- def skip_to(target)
170
- if @scorer_queue.nil?
171
- init_scorer_queue()
172
- end
173
- if @scorer_queue.size < @minimum_nr_matchers
174
- return false
175
- end
176
- if target <= @current_doc
177
- target = @current_doc + 1
178
- end
179
- begin
180
- top = @scorer_queue.top
181
- if top.doc >= target
182
- return advance_after_current()
183
- elsif top.skip_to(target)
184
- @scorer_queue.adjust_top()
185
- else
186
- @scorer_queue.pop()
187
- if (@scorer_queue.size < @minimum_nr_matchers)
188
- return false
189
- end
190
- end
191
- end while (true)
192
- end
193
-
194
- # Gives an explanation for the score of a given document.
195
- # TODO Show the resulting score. See BooleanScorer.explain() on how to do this.
196
- def explain(doc)
197
- e = Explanation.new()
198
- e.description = "At least " + @minimum_nr_matchers + " of"
199
- @sub_scorers.each do |sub_scorer|
200
- e.details << sub_scorer.explain(doc)
201
- end
202
- return e
203
- end
204
- end
205
- end