ferret 0.9.6 → 0.10.0

Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/search/phrase_scorer.rb (deleted)
@@ -1,152 +0,0 @@
- module Ferret::Search
-   class PhraseScorer < Scorer
-     attr_reader :first, :last
-     protected :first, :last
-
-     def initialize(weight, tps, positions, similarity, norms)
-       super(similarity)
-       @norms = norms
-       @weight = weight
-       @value = weight.value
-       @first_time = true
-       @more = true
-
-       # convert tps to a list
-       tps.length.times do |i|
-         pp = PhrasePositions.new(tps[i], positions[i])
-         if (@last != nil) # add next to end of list
-           @last.next = pp
-         else
-           @first = pp
-         end
-         @last = pp
-       end
-
-       @pq = PhraseQueue.new(tps.length) # construct empty pq
-     end
-
-     def doc()
-       return @first.doc
-     end
-
-     def next?
-       if (@first_time)
-         init()
-         @first_time = false
-       elsif (@more)
-         @more = @last.next? # trigger further scanning
-       end
-       return do_next()
-     end
-
-     # next without initial increment
-     def do_next()
-       while (@more)
-         while (@more and @first.doc < @last.doc) # find doc w/ all the terms
-           @more = @first.skip_to(@last.doc)      # skip first upto last
-           first_to_last()                        # and move it to the end
-         end
-
-         if (@more)
-           # found a doc with all of the terms
-           @freq = phrase_freq() # check for phrase
-           if (@freq == 0.0)     # no match
-             @more = @last.next? # trigger further scanning
-           else
-             return true         # found a match
-           end
-         end
-       end
-       return false # no more matches
-     end
-
-     def each()
-       pp = @first
-       while (pp != nil)
-         yield pp
-         pp = pp.next
-       end
-     end
-
-     def score()
-       raw = similarity().tf(@freq) * @value # raw score
-       return raw * Similarity.decode_norm(@norms[@first.doc]) # normalize
-     end
-
-     def skip_to(target)
-       each() { |pp| break if not @more = pp.skip_to(target) }
-       sort() if @more # re-sort
-       return do_next()
-     end
-
-     def phrase_freq()
-       raise NotImplementedError
-     end
-
-     def init()
-       each do |pp|
-         break if not @more = pp.next?
-       end
-       if @more
-         sort()
-       end
-     end
-
-     def sort()
-       @pq.clear()
-       each() do |pp|
-         @pq.push(pp)
-       end
-       pq_to_list()
-     end
-
-     def pq_to_list()
-       @last = @first = nil
-       while (@pq.top() != nil)
-         pp = @pq.pop()
-         if (@last != nil) # add next to end of list
-           @last.next = pp
-         else
-           @first = pp
-         end
-         @last = pp
-         pp.next = nil
-       end
-     end
-
-     def first_to_last()
-       @last.next = @first # move first to end of list
-       @last = @first
-       @first = @first.next
-       @last.next = nil
-     end
-
-     def explain(doc)
-       tf_explanation = Explanation.new()
-
-       while (next? and doc() < doc)
-       end
-
-       phrase_freq = (doc() == doc) ? @freq : 0.0
-       tf_explanation.value = @similarity.tf(phrase_freq)
-       tf_explanation.description = "tf(phrase_freq=#{phrase_freq})"
-
-       return tf_explanation
-     end
-
-     def to_s() return "phrase_scorer(#{@weight})" end
-
-   end
-
-
-   class PhraseQueue < Ferret::Utils::PriorityQueue
-     def less_than(pp1, pp2)
-       if (pp1.doc == pp2.doc)
-         return pp1.position < pp2.position
-       else
-         return pp1.doc < pp2.doc
-       end
-     end
-   end
-
- end
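
Note: the removed PhraseQueue orders PhrasePositions by document first and by position within the document second, which is what lets PhraseScorer walk term positions in order. A minimal sketch of that comparator's effect, using a hypothetical Struct stand-in rather than a real PhrasePositions object:

    # Hypothetical stand-in exposing the same doc/position readers as PhrasePositions.
    PP = Struct.new(:doc, :position)

    # Mirrors PhraseQueue#less_than from the removed file: order by doc, then position.
    def less_than(pp1, pp2)
      pp1.doc == pp2.doc ? pp1.position < pp2.position : pp1.doc < pp2.doc
    end

    less_than(PP.new(3, 7), PP.new(3, 9))  # => true  (same doc, earlier position wins)
    less_than(PP.new(3, 7), PP.new(2, 0))  # => false (doc 3 sorts after doc 2)
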
data/lib/ferret/search/prefix_query.rb (deleted)
@@ -1,54 +0,0 @@
- module Ferret::Search
-   # A Query that matches documents containing terms with a specified prefix. A
-   # PrefixQuery is built by QueryParser for input like +app*+.
-   class PrefixQuery < Query
-     attr_reader :prefix
-     # Constructs a query for terms starting with +prefix+.
-     def initialize(prefix)
-       super()
-       @prefix = prefix
-     end
-
-     def rewrite(reader)
-       bq = BooleanQuery.new(true)
-       enumerator = reader.terms_from(@prefix)
-       begin
-         prefix_text = @prefix.text
-         prefix_length = prefix_text.length
-         prefix_field = @prefix.field
-         begin
-           term = enumerator.term
-           if (term.nil? or
-               term.field != prefix_field or
-               term.text[0,prefix_length] != prefix_text)
-             break
-           end
-           tq = TermQuery.new(term)                       # found a match
-           tq.boost = boost()                             # set the boost
-           bq.add_query(tq, BooleanClause::Occur::SHOULD) # add to query
-           #puts("added " + term)
-         end while (enumerator.next?)
-       ensure
-         enumerator.close()
-       end
-       return bq
-     end
-
-     # Prints a user-readable version of this query.
-     def to_s(f)
-       buffer = ""
-       buffer << "#{@prefix.field}:" if @prefix.field != f
-       buffer << "#{@prefix.text}*"
-       buffer << "^#{boost()}" if boost() != 1.0
-       return buffer
-     end
-
-     def eql?(o)
-       (@prefix == o.prefix and boost() == o.boost)
-     end
-
-     def hash()
-       boost().hash ^ @prefix.hash
-     end
-   end
- end
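
Note: the doc comment above describes how a PrefixQuery expands during rewrite: every indexed term starting with the prefix becomes a TermQuery ORed into a BooleanQuery. A hedged sketch of how the pre-0.10 Ruby API shown in this removed file might have been used; the field name, indexed terms, and the index_reader variable are illustrative assumptions, not part of the diff:

    include Ferret::Search
    include Ferret::Index

    # Match any term in the "content" field beginning with "app"
    # (the query parser would build the same thing for input like app*).
    query = PrefixQuery.new(Term.new("content", "app"))

    # Rewriting against an open IndexReader (assumed here) produces an OR of
    # TermQuerys, e.g. content:apple OR content:application, as in #rewrite above.
    rewritten = query.rewrite(index_reader)
    puts rewritten.to_s("content")
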
data/lib/ferret/search/query.rb (deleted)
@@ -1,140 +0,0 @@
- module Ferret::Search
-   # The abstract base class for queries.
-   # Instantiable subclasses are:
-   # * TermQuery
-   # * MultiTermQuery
-   # * BooleanQuery
-   # * WildcardQuery
-   # * PhraseQuery
-   # * PrefixQuery
-   # * MultiPhraseQuery
-   # * FuzzyQuery
-   # * RangeQuery
-   # * Span::SpanQuery
-   #
-   # A parser for queries is contained in:
-   # * Ferret::QueryParser::QueryParser
-   #
-   class Query
-     # documents matching this query clause will (in addition to the normal
-     # weightings) have their score multiplied by the boost factor. It is
-     # 1.0 be default.
-     attr_accessor :boost
-
-     def initialize()
-       @boost = 1.0
-     end
-
-     # Prints a query to a string, with +field+ as the default field for
-     # terms. The representation used is one that is supposed to be readable
-     # by Ferret::QueryParser::QueryParser. However, there are the following
-     # limitations:
-     # * If the query was created by the parser, the printed representation
-     #   may not be exactly what was parsed. For example, characters that need
-     #   to be escaped will be represented without the required backslash.
-     # * Some of the more complicated queries (e.g. span queries)
-     #   don't have a representation that can be parsed by QueryParser.
-     def to_s(field=nil)
-       raise NotImplementedError
-     end
-
-     # Expert: Constructs an appropriate Weight implementation for this query.
-     #
-     # Only implemented by primitive queries, which re-write to themselves.
-     def create_weight(searcher)
-       raise NotImplementedError
-     end
-
-     # Expert: Constructs and initializes a Weight for a top-level query.
-     def weight(searcher)
-       query = searcher.rewrite(self)
-       weight = query.create_weight(searcher)
-       sum = weight.sum_of_squared_weights()
-       norm = similarity(searcher).query_norm(sum)
-       weight.normalize(norm)
-       return weight
-     end
-
-     # Expert: called to re-write queries into primitive queries.
-     def rewrite(reader)
-       return self
-     end
-
-     # Expert: called when re-writing queries under MultiSearcher.
-     #
-     # Create a single query suitable for use by all subsearchers (in 1-1
-     # correspondence with queries). This is an optimization of the OR of
-     # all queries. We handle the common optimization cases of equal
-     # queries and overlapping clauses of boolean OR queries (as generated
-     # by MultiTermQuery.rewrite() and RangeQuery.rewrite()).
-     # Be careful overriding this method as queries[0] determines which
-     # method will be called and is not necessarily of the same type as
-     # the other queries.
-     def combine(queries)
-       uniques = Set.new
-       queries.each do |query|
-         clauses = []
-         # check if we can split the query into clauses
-         splittable = query.respond_to? :clauses
-         if splittable
-           splittable = query.coord_disabled?
-           clauses = query.clauses
-           clauses.each do |clause|
-             splittable = clause.occur == BooleanClause::Occur::SHOULD
-             break unless splittable
-           end
-         end
-         if splittable
-           clauses.each { |clause| uniques << clause.query }
-         else
-           uniques << query
-         end
-       end
-       # optimization: if we have just one query, just return it
-       if uniques.size == 1
-         uniques.each { |query| return query }
-       end
-
-       result = BooleanQuery.new(true)
-       uniques.each do |query|
-         result.add_query(query, BooleanClause::Occur::SHOULD)
-       end
-       return result
-     end
-
-     # Expert: adds all terms occuring in this query to the terms set
-     def extract_terms(terms)
-       raise NotImplementedError
-     end
-
-
-     # Expert: merges the clauses of a set of BooleanQuery's into a single
-     # BooleanQuery.
-     #
-     # A utility for use by #combine() implementations.
-     def merge_boolean_queries(queries)
-       all_clauses = Set.new
-       queries.each do |query|
-         query.clauses.each do |clause|
-           all_clauses << clause
-         end
-       end
-
-       coord_disabled = queries.size==0 ? false : queries[0].coord_disabled?
-       result = BooleanQuery.new(coord_disabled)
-       all_clauses.each do |clause|
-         result << clause
-       end
-       return result
-     end
-
-     # Expert: Returns the Similarity implementation to be used for this
-     # query. Subclasses may override this method to specify their own
-     # Similarity implementation, perhaps one that delegates through that of
-     # the Searcher. By default the Searcher's Similarity implementation is
-     # returned.
-     def similarity(searcher)
-       return searcher.similarity
-     end
-   end
- end
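
Note: Query#combine above deduplicates equal queries and pools the SHOULD clauses of coord-disabled boolean OR queries when merging results from several subsearchers. A small hedged illustration of the two cases it handles, built only from the 0.9.x classes visible in this diff; the terms are made up, and it assumes TermQuery does not expose #clauses:

    include Ferret::Search
    include Ferret::Index

    t1 = TermQuery.new(Term.new("content", "ferret"))
    t2 = TermQuery.new(Term.new("content", "lucene"))

    # Case 1: identical queries from every subsearcher collapse to one query.
    t1.combine([t1, t1])              # => t1 itself

    # Case 2: coord-disabled OR queries have their SHOULD clauses pooled into
    # a single BooleanQuery covering all subsearchers.
    bq1 = BooleanQuery.new(true)
    bq1.add_query(t1, BooleanClause::Occur::SHOULD)
    bq2 = BooleanQuery.new(true)
    bq2.add_query(t2, BooleanClause::Occur::SHOULD)
    bq1.combine([bq1, bq2])           # => BooleanQuery ORing t1 and t2
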
data/lib/ferret/search/query_filter.rb (deleted)
@@ -1,51 +0,0 @@
- module Ferret::Search
-   require 'monitor'
-   # Constrains search results to only match those which also match a provided
-   # query. Results are cached, so that searches after the first on the same
-   # index using this filter are much faster.
-   #
-   # This could be used, for example, with a RangeQuery on a suitably formatted
-   # date field to implement date filtering. One could re-use a single
-   # QueryFilter that matches, e.g., only documents modified within the last
-   # week. The QueryFilter and RangeQuery would only need to be reconstructed
-   # once per day.
-   class QueryFilter < Filter
-
-     # Constructs a filter which only matches documents matching
-     # +query+.
-     def initialize(query)
-       @query = query
-       @cache = nil
-     end
-
-     def bits(reader)
-
-       if (@cache == nil)
-         @cache = Ferret::Utils::WeakKeyHash.new
-       end
-
-       @cache.synchronize() do # check cache
-         bits = @cache[reader]
-         if bits
-           return bits
-         end
-       end
-
-       bits = Ferret::Utils::BitVector.new()
-
-       IndexSearcher.new(reader).search_each(@query) do |doc, score|
-         bits.set(doc) # set bit for hit
-       end
-
-       @cache.synchronize() do # update cache
-         @cache[reader] = bits
-       end
-
-       return bits
-     end
-
-     def to_s()
-       return "QueryFilter(#{@query})"
-     end
-   end
- end
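
Note: the doc comment above suggests the intended usage pattern: wrap a relatively expensive query in one QueryFilter and reuse it, so every call after the first on the same reader is served from the cached BitVector. A minimal hedged sketch against the removed API; recent_docs_query and reader are assumed stand-ins (e.g. a RangeQuery over a date field and an open IndexReader), not names from the diff:

    include Ferret::Search

    # Reusable filter around any query; here a hypothetical "recent documents" query.
    filter = QueryFilter.new(recent_docs_query)

    bits = filter.bits(reader)  # first call runs an IndexSearcher over the query and caches
    bits = filter.bits(reader)  # same reader: returned straight from the WeakKeyHash cache
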
data/lib/ferret/search/range_filter.rb (deleted)
@@ -1,103 +0,0 @@
- module Ferret::Search
-   # A Filter that restricts search results to a range of values in a given
-   # field.
-   #
-   # This code borrows heavily from RangeQuery, but is implemented as a Filter.
-   class RangeFilter < Filter
-     include Ferret::Index
-
-     # field_name::    The field this range applies to
-     # lower_term::    The lower bound on this range
-     # upper_term::    The upper bound on this range
-     # include_lower:: Does this range include the lower bound?
-     # include_upper:: Does this range include the upper bound?
-     def initialize(field_name, lower_term, upper_term, include_lower, include_upper)
-       @field_name = field_name
-       @lower_term = lower_term
-       @upper_term = upper_term
-       @include_lower = include_lower
-       @include_upper = include_upper
-
-       if (lower_term.nil? and upper_term.nil?)
-         raise ArgumentError, "At least one value must be non-nil"
-       end
-       if (include_lower and lower_term.nil?)
-         raise ArgumentError, "The lower bound must be non-nil to be inclusive"
-       end
-       if (include_upper and upper_term.nil?)
-         raise ArgumentError, "The upper bound must be non-nil to be inclusive"
-       end
-       if (upper_term and lower_term and upper_term < lower_term)
-         raise ArgumentError, "The lower bound must less than the upper bound"
-       end
-     end
-
-     # Constructs a filter for field +field_name+ matching less than or equal to
-     # +upper_term+.
-     def RangeFilter.new_less(field_name, upper_term, include_upper = true)
-       return RangeFilter.new(field_name, nil, upper_term, false, include_upper)
-     end
-
-     # Constructs a filter for field +field_name+ matching greater than or equal
-     # to +lower_term+.
-     def RangeFilter.new_more(field_name, lower_term, include_lower = true)
-       return RangeFilter.new(field_name, lower_term, nil, include_lower, false)
-     end
-
-     # Returns a BitVector with true for documents which should be permitted in
-     # search results, and false for those that should not.
-     def bits(reader)
-       bits = Ferret::Utils::BitVector.new()
-       term_enum = reader.terms_from(Term.new(@field_name, @lower_term||""))
-
-       begin
-         if (term_enum.term() == nil)
-           return bits
-         end
-         check_lower = !@include_lower # make adjustments to set to exclusive
-
-         term_docs = reader.term_docs
-         begin
-           begin
-             term = term_enum.term()
-             break if (term.nil? or term.field != @field_name)
-
-             if (!check_lower or @lower_term.nil? or term.text > @lower_term)
-               check_lower = false
-               if @upper_term
-                 compare = @upper_term <=> term.text
-                 # if beyond the upper term, or is exclusive and
-                 # this is equal to the upper term, break out
-                 if ((compare < 0) or (!@include_upper and compare == 0))
-                   break
-                 end
-               end
-               # we have a good term, find the docs
-
-               term_docs.seek(term_enum)
-               while term_docs.next?
-                 bits.set(term_docs.doc)
-               end
-             end
-           end while term_enum.next?
-         ensure
-           term_docs.close()
-         end
-       ensure
-         term_enum.close()
-       end
-
-       return bits
-     end
-
-     def to_s()
-       buffer = "#{@field_name}:"
-       buffer << "[" if @include_lower
-       buffer << @lower_term if @lower_term
-       buffer << "-"
-       buffer << @upper_term if @upper_term
-       buffer << @include_upper ? "]" : "end"
-       return buffer
-     end
-   end
- end
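
Note: RangeFilter.new_less and RangeFilter.new_more above are convenience constructors for open-ended ranges. A brief hedged sketch of how the removed API could be combined; the field name and date strings are illustrative, and it assumes dates were stored as lexicographically sortable strings, as the class comment implies:

    include Ferret::Search

    # All documents with "date" <= "20060630" (inclusive upper bound, no lower bound).
    upto_june  = RangeFilter.new_less("date", "20060630")

    # All documents with "date" >= "20060101" (inclusive lower bound, no upper bound).
    from_jan   = RangeFilter.new_more("date", "20060101")

    # Equivalent closed range using the full constructor.
    first_half = RangeFilter.new("date", "20060101", "20060630", true, true)

    bits = first_half.bits(reader)  # BitVector with a bit set for every matching doc
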