ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,152 +0,0 @@
1
- module Ferret::Search
2
- class PhraseScorer < Scorer
3
- attr_reader :first, :last
4
- protected :first, :last
5
-
6
- def initialize(weight, tps, positions, similarity, norms)
7
- super(similarity)
8
- @norms = norms
9
- @weight = weight
10
- @value = weight.value
11
- @first_time = true
12
- @more = true
13
-
14
- # convert tps to a list
15
- tps.length.times do |i|
16
- pp = PhrasePositions.new(tps[i], positions[i])
17
- if (@last != nil) # add next to end of list
18
- @last.next = pp
19
- else
20
- @first = pp
21
- end
22
- @last = pp
23
- end
24
-
25
- @pq = PhraseQueue.new(tps.length) # construct empty pq
26
- end
27
-
28
- def doc()
29
- return @first.doc
30
- end
31
-
32
- def next?
33
- if (@first_time)
34
- init()
35
- @first_time = false
36
- elsif (@more)
37
- @more = @last.next? # trigger further scanning
38
- end
39
- return do_next()
40
- end
41
-
42
- # next without initial increment
43
- def do_next()
44
- while (@more)
45
- while (@more and @first.doc < @last.doc) # find doc w/ all the terms
46
- @more = @first.skip_to(@last.doc) # skip first upto last
47
- first_to_last() # and move it to the end
48
- end
49
-
50
- if (@more)
51
- # found a doc with all of the terms
52
- @freq = phrase_freq() # check for phrase
53
- if (@freq == 0.0) # no match
54
- @more = @last.next? # trigger further scanning
55
- else
56
- return true # found a match
57
- end
58
- end
59
- end
60
- return false # no more matches
61
- end
62
-
63
- def each()
64
- pp = @first
65
- while (pp != nil)
66
- yield pp
67
- pp = pp.next
68
- end
69
- end
70
-
71
- def score()
72
- raw = similarity().tf(@freq) * @value # raw score
73
- return raw * Similarity.decode_norm(@norms[@first.doc]) # normalize
74
- end
75
-
76
- def skip_to(target)
77
- each() { |pp| break if not @more = pp.skip_to(target) }
78
- sort() if @more # re-sort
79
- return do_next()
80
- end
81
-
82
- def phrase_freq()
83
- raise NotImplementedError
84
- end
85
-
86
- def init()
87
- each do |pp|
88
- break if not @more = pp.next?
89
- end
90
- if @more
91
- sort()
92
- end
93
- end
94
-
95
- def sort()
96
- @pq.clear()
97
- each() do |pp|
98
- @pq.push(pp)
99
- end
100
- pq_to_list()
101
- end
102
-
103
- def pq_to_list()
104
- @last = @first = nil
105
- while (@pq.top() != nil)
106
- pp = @pq.pop()
107
- if (@last != nil) # add next to end of list
108
- @last.next = pp
109
- else
110
- @first = pp
111
- end
112
- @last = pp
113
- pp.next = nil
114
- end
115
- end
116
-
117
- def first_to_last()
118
- @last.next = @first # move first to end of list
119
- @last = @first
120
- @first = @first.next
121
- @last.next = nil
122
- end
123
-
124
- def explain(doc)
125
- tf_explanation = Explanation.new()
126
-
127
- while (next? and doc() < doc)
128
- end
129
-
130
- phrase_freq = (doc() == doc) ? @freq : 0.0
131
- tf_explanation.value = @similarity.tf(phrase_freq)
132
- tf_explanation.description = "tf(phrase_freq=#{phrase_freq})"
133
-
134
- return tf_explanation
135
- end
136
-
137
- def to_s() return "phrase_scorer(#{@weight})" end
138
-
139
- end
140
-
141
-
142
- class PhraseQueue < Ferret::Utils::PriorityQueue
143
- def less_than(pp1, pp2)
144
- if (pp1.doc == pp2.doc)
145
- return pp1.position < pp2.position
146
- else
147
- return pp1.doc < pp2.doc
148
- end
149
- end
150
- end
151
-
152
- end
@@ -1,54 +0,0 @@
1
- module Ferret::Search
2
- # A Query that matches documents containing terms with a specified prefix. A
3
- # PrefixQuery is built by QueryParser for input like +app*+.
4
- class PrefixQuery < Query
5
- attr_reader :prefix
6
- # Constructs a query for terms starting with +prefix+.
7
- def initialize(prefix)
8
- super()
9
- @prefix = prefix
10
- end
11
-
12
- def rewrite(reader)
13
- bq = BooleanQuery.new(true)
14
- enumerator = reader.terms_from(@prefix)
15
- begin
16
- prefix_text = @prefix.text
17
- prefix_length = prefix_text.length
18
- prefix_field = @prefix.field
19
- begin
20
- term = enumerator.term
21
- if (term.nil? or
22
- term.field != prefix_field or
23
- term.text[0,prefix_length] != prefix_text)
24
- break
25
- end
26
- tq = TermQuery.new(term) # found a match
27
- tq.boost = boost() # set the boost
28
- bq.add_query(tq, BooleanClause::Occur::SHOULD) # add to query
29
- #puts("added " + term)
30
- end while (enumerator.next?)
31
- ensure
32
- enumerator.close()
33
- end
34
- return bq
35
- end
36
-
37
- # Prints a user-readable version of this query.
38
- def to_s(f)
39
- buffer = ""
40
- buffer << "#{@prefix.field}:" if @prefix.field != f
41
- buffer << "#{@prefix.text}*"
42
- buffer << "^#{boost()}" if boost() != 1.0
43
- return buffer
44
- end
45
-
46
- def eql?(o)
47
- (@prefix == o.prefix and boost() == o.boost)
48
- end
49
-
50
- def hash()
51
- boost().hash ^ @prefix.hash
52
- end
53
- end
54
- end
@@ -1,140 +0,0 @@
1
- module Ferret::Search
2
- # The abstract base class for queries.
3
- # Instantiable subclasses are:
4
- # * TermQuery
5
- # * MultiTermQuery
6
- # * BooleanQuery
7
- # * WildcardQuery
8
- # * PhraseQuery
9
- # * PrefixQuery
10
- # * MultiPhraseQuery
11
- # * FuzzyQuery
12
- # * RangeQuery
13
- # * Span::SpanQuery
14
- #
15
- # A parser for queries is contained in:
16
- # * Ferret::QueryParser::QueryParser
17
- #
18
- class Query
19
- # documents matching this query clause will (in addition to the normal
20
- # weightings) have their score multiplied by the boost factor. It is
21
- # 1.0 be default.
22
- attr_accessor :boost
23
-
24
- def initialize()
25
- @boost = 1.0
26
- end
27
-
28
- # Prints a query to a string, with +field+ as the default field for
29
- # terms. The representation used is one that is supposed to be readable
30
- # by Ferret::QueryParser::QueryParser. However, there are the following
31
- # limitations:
32
- # * If the query was created by the parser, the printed representation
33
- # may not be exactly what was parsed. For example, characters that need
34
- # to be escaped will be represented without the required backslash.
35
- # * Some of the more complicated queries (e.g. span queries)
36
- # don't have a representation that can be parsed by QueryParser.
37
- def to_s(field=nil)
38
- raise NotImplementedError
39
- end
40
-
41
- # Expert: Constructs an appropriate Weight implementation for this query.
42
- #
43
- # Only implemented by primitive queries, which re-write to themselves.
44
- def create_weight(searcher)
45
- raise NotImplementedError
46
- end
47
-
48
- # Expert: Constructs and initializes a Weight for a top-level query.
49
- def weight(searcher)
50
- query = searcher.rewrite(self)
51
- weight = query.create_weight(searcher)
52
- sum = weight.sum_of_squared_weights()
53
- norm = similarity(searcher).query_norm(sum)
54
- weight.normalize(norm)
55
- return weight
56
- end
57
-
58
- # Expert: called to re-write queries into primitive queries.
59
- def rewrite(reader)
60
- return self
61
- end
62
-
63
- # Expert: called when re-writing queries under MultiSearcher.
64
- #
65
- # Create a single query suitable for use by all subsearchers (in 1-1
66
- # correspondence with queries). This is an optimization of the OR of
67
- # all queries. We handle the common optimization cases of equal
68
- # queries and overlapping clauses of boolean OR queries (as generated
69
- # by MultiTermQuery.rewrite() and RangeQuery.rewrite()).
70
- # Be careful overriding this method as queries[0] determines which
71
- # method will be called and is not necessarily of the same type as
72
- # the other queries.
73
- def combine(queries)
74
- uniques = Set.new
75
- queries.each do |query|
76
- clauses = []
77
- # check if we can split the query into clauses
78
- splittable = query.respond_to? :clauses
79
- if splittable
80
- splittable = query.coord_disabled?
81
- clauses = query.clauses
82
- clauses.each do |clause|
83
- splittable = clause.occur == BooleanClause::Occur::SHOULD
84
- break unless splittable
85
- end
86
- end
87
- if splittable
88
- clauses.each { |clause| uniques << clause.query }
89
- else
90
- uniques << query
91
- end
92
- end
93
- # optimization: if we have just one query, just return it
94
- if uniques.size == 1
95
- uniques.each { |query| return query }
96
- end
97
-
98
- result = BooleanQuery.new(true)
99
- uniques.each do |query|
100
- result.add_query(query, BooleanClause::Occur::SHOULD)
101
- end
102
- return result
103
- end
104
-
105
- # Expert: adds all terms occuring in this query to the terms set
106
- def extract_terms(terms)
107
- raise NotImplementedError
108
- end
109
-
110
-
111
- # Expert: merges the clauses of a set of BooleanQuery's into a single
112
- # BooleanQuery.
113
- #
114
- # A utility for use by #combine() implementations.
115
- def merge_boolean_queries(queries)
116
- all_clauses = Set.new
117
- queries.each do |query|
118
- query.clauses.each do |clause|
119
- all_clauses << clause
120
- end
121
- end
122
-
123
- coord_disabled = queries.size==0 ? false : queries[0].coord_disabled?
124
- result = BooleanQuery.new(coord_disabled)
125
- all_clauses.each do |clause|
126
- result << clause
127
- end
128
- return result
129
- end
130
-
131
- # Expert: Returns the Similarity implementation to be used for this
132
- # query. Subclasses may override this method to specify their own
133
- # Similarity implementation, perhaps one that delegates through that of
134
- # the Searcher. By default the Searcher's Similarity implementation is
135
- # returned.
136
- def similarity(searcher)
137
- return searcher.similarity
138
- end
139
- end
140
- end
@@ -1,51 +0,0 @@
1
- module Ferret::Search
2
- require 'monitor'
3
- # Constrains search results to only match those which also match a provided
4
- # query. Results are cached, so that searches after the first on the same
5
- # index using this filter are much faster.
6
- #
7
- # This could be used, for example, with a RangeQuery on a suitably formatted
8
- # date field to implement date filtering. One could re-use a single
9
- # QueryFilter that matches, e.g., only documents modified within the last
10
- # week. The QueryFilter and RangeQuery would only need to be reconstructed
11
- # once per day.
12
- class QueryFilter < Filter
13
-
14
- # Constructs a filter which only matches documents matching
15
- # +query+.
16
- def initialize(query)
17
- @query = query
18
- @cache = nil
19
- end
20
-
21
- def bits(reader)
22
-
23
- if (@cache == nil)
24
- @cache = Ferret::Utils::WeakKeyHash.new
25
- end
26
-
27
- @cache.synchronize() do # check cache
28
- bits = @cache[reader]
29
- if bits
30
- return bits
31
- end
32
- end
33
-
34
- bits = Ferret::Utils::BitVector.new()
35
-
36
- IndexSearcher.new(reader).search_each(@query) do |doc, score|
37
- bits.set(doc) # set bit for hit
38
- end
39
-
40
- @cache.synchronize() do # update cache
41
- @cache[reader] = bits
42
- end
43
-
44
- return bits
45
- end
46
-
47
- def to_s()
48
- return "QueryFilter(#{@query})"
49
- end
50
- end
51
- end
@@ -1,103 +0,0 @@
1
- module Ferret::Search
2
- # A Filter that restricts search results to a range of values in a given
3
- # field.
4
- #
5
- # This code borrows heavily from RangeQuery, but is implemented as a Filter.
6
- class RangeFilter < Filter
7
- include Ferret::Index
8
-
9
- # field_name:: The field this range applies to
10
- # lower_term:: The lower bound on this range
11
- # upper_term:: The upper bound on this range
12
- # include_lower:: Does this range include the lower bound?
13
- # include_upper:: Does this range include the upper bound?
14
- def initialize(field_name, lower_term, upper_term, include_lower, include_upper)
15
- @field_name = field_name
16
- @lower_term = lower_term
17
- @upper_term = upper_term
18
- @include_lower = include_lower
19
- @include_upper = include_upper
20
-
21
- if (lower_term.nil? and upper_term.nil?)
22
- raise ArgumentError, "At least one value must be non-nil"
23
- end
24
- if (include_lower and lower_term.nil?)
25
- raise ArgumentError, "The lower bound must be non-nil to be inclusive"
26
- end
27
- if (include_upper and upper_term.nil?)
28
- raise ArgumentError, "The upper bound must be non-nil to be inclusive"
29
- end
30
- if (upper_term and lower_term and upper_term < lower_term)
31
- raise ArgumentError, "The lower bound must less than the upper bound"
32
- end
33
- end
34
-
35
- # Constructs a filter for field +field_name+ matching less than or equal to
36
- # +upper_term+.
37
- def RangeFilter.new_less(field_name, upper_term, include_upper = true)
38
- return RangeFilter.new(field_name, nil, upper_term, false, include_upper)
39
- end
40
-
41
- # Constructs a filter for field +field_name+ matching greater than or equal
42
- # to +lower_term+.
43
- def RangeFilter.new_more(field_name, lower_term, include_lower = true)
44
- return RangeFilter.new(field_name, lower_term, nil, include_lower, false)
45
- end
46
-
47
- # Returns a BitVector with true for documents which should be permitted in
48
- # search results, and false for those that should not.
49
- def bits(reader)
50
- bits = Ferret::Utils::BitVector.new()
51
- term_enum = reader.terms_from(Term.new(@field_name, @lower_term||""))
52
-
53
- begin
54
- if (term_enum.term() == nil)
55
- return bits
56
- end
57
- check_lower = !@include_lower # make adjustments to set to exclusive
58
-
59
- term_docs = reader.term_docs
60
- begin
61
- begin
62
- term = term_enum.term()
63
- break if (term.nil? or term.field != @field_name)
64
-
65
- if (!check_lower or @lower_term.nil? or term.text > @lower_term)
66
- check_lower = false
67
- if @upper_term
68
- compare = @upper_term <=> term.text
69
- # if beyond the upper term, or is exclusive and
70
- # this is equal to the upper term, break out
71
- if ((compare < 0) or (!@include_upper and compare == 0))
72
- break
73
- end
74
- end
75
- # we have a good term, find the docs
76
-
77
- term_docs.seek(term_enum)
78
- while term_docs.next?
79
- bits.set(term_docs.doc)
80
- end
81
- end
82
- end while term_enum.next?
83
- ensure
84
- term_docs.close()
85
- end
86
- ensure
87
- term_enum.close()
88
- end
89
-
90
- return bits
91
- end
92
-
93
- def to_s()
94
- buffer = "#{@field_name}:"
95
- buffer << "[" if @include_lower
96
- buffer << @lower_term if @lower_term
97
- buffer << "-"
98
- buffer << @upper_term if @upper_term
99
- buffer << @include_upper ? "]" : "end"
100
- return buffer
101
- end
102
- end
103
- end