ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/search.rb DELETED
@@ -1,50 +0,0 @@
1
- require 'ferret/search/similarity.rb'
2
- require 'ferret/search/boolean_clause.rb'
3
- require 'ferret/search/scorer.rb'
4
- require 'ferret/search/score_doc.rb'
5
- require 'ferret/search/score_doc_comparator.rb'
6
- require 'ferret/search/weight.rb'
7
- require 'ferret/search/query.rb'
8
- require 'ferret/search/term_query.rb'
9
- require 'ferret/search/term_scorer.rb'
10
- require 'ferret/search/top_docs.rb'
11
- require 'ferret/search/boolean_query.rb'
12
- require 'ferret/search/conjunction_scorer.rb'
13
- require 'ferret/search/disjunction_sum_scorer.rb'
14
- require 'ferret/search/multi_term_query.rb'
15
- require 'ferret/search/phrase_query.rb'
16
- require 'ferret/search/multi_phrase_query.rb'
17
- require 'ferret/search/prefix_query.rb'
18
- require 'ferret/search/range_query.rb'
19
- require 'ferret/search/filtered_term_enum.rb'
20
- require 'ferret/search/wildcard_term_enum.rb'
21
- require 'ferret/search/wildcard_query.rb'
22
- require 'ferret/search/fuzzy_term_enum.rb'
23
- require 'ferret/search/fuzzy_query.rb'
24
- require 'ferret/search/phrase_positions.rb'
25
- require 'ferret/search/phrase_scorer.rb'
26
- require 'ferret/search/exact_phrase_scorer.rb'
27
- require 'ferret/search/sloppy_phrase_scorer.rb'
28
- require 'ferret/search/boolean_scorer.rb'
29
- require 'ferret/search/explanation.rb'
30
- require 'ferret/search/field_doc.rb'
31
- require 'ferret/search/hit_collector.rb'
32
- require 'ferret/search/hit_queue.rb'
33
- require 'ferret/search/non_matching_scorer.rb'
34
- require 'ferret/search/req_excl_scorer.rb'
35
- require 'ferret/search/req_opt_sum_scorer.rb'
36
- require 'ferret/search/score_doc.rb'
37
- require 'ferret/search/score_doc_comparator.rb'
38
- require 'ferret/search/sort_field.rb'
39
- require 'ferret/search/sort.rb'
40
- require 'ferret/search/field_cache.rb'
41
- require 'ferret/search/field_sorted_hit_queue.rb'
42
- require 'ferret/search/filter.rb'
43
- require 'ferret/search/range_filter.rb'
44
- require 'ferret/search/query_filter.rb'
45
- require 'ferret/search/caching_wrapper_filter.rb'
46
- require 'ferret/search/filtered_query.rb'
47
- require 'ferret/search/match_all_query.rb'
48
- require 'ferret/search/spans.rb'
49
- require 'ferret/search/index_searcher.rb'
50
- require 'ferret/search/multi_searcher.rb'
@@ -1,100 +0,0 @@
1
-
2
- module Ferret::Search
3
-
4
- # A clause in a BooleanQuery.
5
- class BooleanClause
6
-
7
- class Occur < Ferret::Utils::Parameter
8
-
9
- def to_s()
10
- return "+" if (self == MUST)
11
- return "-" if (self == MUST_NOT)
12
- return ""
13
- end
14
-
15
- # Use this operator for terms that _must_ appear in the matching
16
- # documents.
17
- MUST = Occur.new("MUST")
18
-
19
- # Use this operator for terms that _should_ appear in the matching
20
- # documents. For a BooleanQuery with two +SHOULD+ subqueries, at
21
- # least one of the queries must appear in the matching documents.
22
- SHOULD = Occur.new("SHOULD")
23
-
24
- # Use this operator for terms that _must not_ appear in the matching
25
- # documents. Note that it is not possible to search for queries that
26
- # only consist of a +MUST_NOT+ query.
27
- MUST_NOT = Occur.new("MUST_NOT")
28
- end
29
-
30
- # The query whose matching documents are combined by the boolean query.
31
- attr_accessor :query
32
-
33
- # If true, documents which _do not_ match this sub-query will
34
- # _not_ match the boolean query.
35
- attr_writer :required
36
- def required?
37
- @required
38
- end
39
-
40
- # If true, documents which _do_ match this sub-query will _not_
41
- # match the boolean query.
42
- attr_writer :prohibited
43
- def prohibited?
44
- @prohibited
45
- end
46
-
47
- # See BooleanClause::Occur for values for this attribute
48
- attr_reader :occur
49
- def occur=(occur)
50
- @occur = occur
51
- set_fields(occur)
52
- end
53
-
54
- # Constructs a BooleanClause. Default value for occur is Occur::SHOULD
55
- def initialize(query, occur = Occur::SHOULD)
56
- @query = query
57
- @occur = occur
58
- set_fields(occur)
59
- end
60
-
61
-
62
- # Returns true iff +other+ is equal to this.
63
- def eql?(other)
64
- if not other.instance_of?(BooleanClause)
65
- return false
66
- end
67
- return (@query == other.query and
68
- @required == other.required? and
69
- @prohibited == other.prohibited?)
70
- end
71
- alias :== :eql?
72
-
73
- # Returns a hash code value for this object.
74
- def hash()
75
- return @query.hash() ^ (@required ? 1 : 0) ^ (@prohibited ? 2 : 0)
76
- end
77
-
78
- # represent a boolean clause as a string
79
- def to_s()
80
- return @occur.to_s() + @query.to_s()
81
- end
82
-
83
- private
84
-
85
- def set_fields(occur)
86
- if (occur == Occur::MUST)
87
- @required = true
88
- @prohibited = false
89
- elsif (occur == Occur::SHOULD)
90
- @required = false
91
- @prohibited = false
92
- elsif (occur == Occur::MUST_NOT)
93
- @required = false
94
- @prohibited = true
95
- else
96
- raise ArgumentError, "Unknown operator " + occur
97
- end
98
- end
99
- end
100
- end
@@ -1,299 +0,0 @@
1
- module Ferret::Search
2
- # A Query that matches documents matching boolean combinations of other
3
- # queries, e.g. TermQuerys, PhraseQuerys or other BooleanQuerys.
4
- class BooleanQuery < Query
5
-
6
- # The maximum number of clauses permitted. Default value is 1024.
7
- #
8
- # TermQuery clauses are generated from, for example, prefix queries and
9
- # fuzzy queries. Each TermQuery needs some buffer space during search,
10
- # so this parameter indirectly controls the maximum buffer requirements
11
- # for query search.
12
- #
13
- # When this parameter becomes a bottleneck for a Query one can use a
14
- # Filter. For example instead of a RangeQuery one can use a RangeFilter.
15
- #
16
- # Attempts to add more than the permitted number of clauses cause
17
- # TooManyClauses to be raised.
18
- attr_accessor :max_clause_count
19
- attr_accessor :clauses
20
- DEFAULT_MAX_CLAUSE_COUNT = 1024
21
-
22
- @@max_clause_count = DEFAULT_MAX_CLAUSE_COUNT
23
- def BooleanQuery.max_clause_count
24
- return @@max_clause_count
25
- end
26
- def BooleanQuery.max_clause_count=(mcc)
27
- @@max_clause_count = mcc
28
- end
29
-
30
- # Thrown when an attempt is made to add more than #max_clause_count()
31
- # clauses. This typically happens if a PrefixQuery, FuzzyQuery,
32
- # WildcardQuery, or RangeQuery is expanded to many terms during search.
33
- class TooManyClauses < Exception
34
- end
35
-
36
- # Constructs an empty boolean query.
37
- #
38
- # Similarity#coord(int,int) may be disabled in scoring, as appropriate.
39
- # For example, this score factor does not make sense for most automatically
40
- # generated queries, like WildcardQuery and FuzzyQuery.
41
- #
42
- # coord_disabled:: disables Similarity#coord(int,int) in scoring.
43
- def initialize(coord_disabled = false)
44
- super()
45
- @coord_disabled = coord_disabled
46
- @clauses = []
47
- end
48
-
49
- # Returns true iff Similarity#coord(int,int) is disabled in scoring for
50
- # this query instance.
51
- # See #initialize(coord_disabled)
52
- def coord_disabled?()
53
- return @coord_disabled
54
- end
55
-
56
- def similarity(searcher)
57
- sim = super
58
- if (@coord_disabled) # disable coord as requested
59
- class <<sim
60
- def coord(overlap, max_overlap)
61
- return 1.0
62
- end
63
- end
64
- end
65
- return sim
66
- end
67
-
68
- # Adds a clause to a boolean query. Clauses may be:
69
- #
70
- # required:: which means that documents which _do not_ match this
71
- # sub-query will _not_ match the boolean query
72
- # prohibited:: which means that documents which _do_ match this
73
- # sub-query will _not_ match the boolean query; or
74
- # neither:: in which case matched documents are neither prohibited
75
- # from nor required to match the sub-query. However, a
76
- # document must match at least 1 sub-query to match the
77
- # boolean query.
78
- #
79
- # * For +required+ use add_query(query, BooleanClause::Occur::MUST)
80
- # * For +prohibited+ use add_query(query, BooleanClause::Occur::MUST_NOT)
81
- # * For +neither+ use add_query(query, BooleanClause::Occur::SHOULD)
82
- #
83
- # raises:: TooManyClauses if the new number of clauses exceeds the
84
- # maximum clause number #max_clause_count()
85
- def add_query(query, occur=BooleanClause::Occur::SHOULD)
86
- add_clause(BooleanClause.new(query, occur))
87
- end
88
-
89
- # Adds a clause to a boolean query.
90
- # raises:: TooManyClauses if the new number of clauses exceeds the
91
- # maximum clause number. See #max_clause_count()
92
- def add_clause(clause)
93
- if @clauses.size >= @@max_clause_count
94
- raise TooManyClauses
95
- end
96
-
97
- @clauses << clause
98
- self
99
- end
100
- alias :<< :add_clause
101
-
102
- class BooleanWeight < Weight
103
- attr_accessor :similarity
104
- attr_accessor :weights
105
- attr_reader :query
106
-
107
- def initialize(query, searcher)
108
- @query = query
109
- @weights = []
110
-
111
- @similarity = query.similarity(searcher)
112
- query.clauses.each do |clause|
113
- @weights << clause.query.create_weight(searcher)
114
- end
115
- end
116
-
117
- def value()
118
- return @query.boost()
119
- end
120
-
121
- def sum_of_squared_weights()
122
- sum = 0
123
- @weights.each_with_index do |weight, i|
124
- clause = @query.clauses[i]
125
- if not clause.prohibited?
126
- sum += weight.sum_of_squared_weights() # sum sub weights
127
- end
128
- end
129
-
130
- sum *= @query.boost() * @query.boost() # boost each sub-weight
131
-
132
- return sum
133
- end
134
-
135
-
136
- def normalize(norm)
137
- norm *= @query.boost()
138
- @weights.each_with_index do |weight, i|
139
- clause = @query.clauses[i]
140
- if not clause.prohibited?
141
- weight.normalize(norm)
142
- end
143
- end
144
- end
145
-
146
- # returns:: An alternative Scorer that uses and provides skip_to(),
147
- # and scores documents in document number order.
148
- def scorer(reader)
149
- result = BooleanScorer.new(@similarity)
150
-
151
- @weights.each_with_index do |weight, i|
152
- clause = @query.clauses[i]
153
- sub_scorer = weight.scorer(reader)
154
- if (sub_scorer != nil)
155
- result.add_scorer(sub_scorer, clause.occur)
156
- elsif (clause.required?())
157
- return nil
158
- end
159
- end
160
-
161
- return result
162
- end
163
-
164
- def explain(reader, doc)
165
-
166
- sum_expl = Explanation.new()
167
- sum_expl.description = "sum of:"
168
- coord = 0
169
- max_coord = 0
170
- sum = 0.0
171
-
172
- @weights.each_with_index do |weight, i|
173
- clause = @query.clauses[i]
174
- explanation = weight.explain(reader, doc)
175
- max_coord += 1 if not clause.prohibited?
176
- if explanation.value > 0
177
- if not clause.prohibited?
178
- sum_expl << explanation
179
- sum += explanation.value
180
- coord += 1
181
- else
182
- return Explanation.new(0.0, "match prohibited")
183
- end
184
- elsif clause.required?
185
- return Explanation.new(0.0, "match required")
186
- end
187
- end
188
- sum_expl.value = sum
189
-
190
- if (coord == 1) # only one clause matched
191
- sum_expl = sum_expl.details[0] # eliminate wrapper
192
- end
193
-
194
- coord_factor = @similarity.coord(coord, max_coord)
195
- if (coord_factor == 1.0) # coord is no-op
196
- return sum_expl # eliminate wrapper
197
- else
198
- result = Explanation.new()
199
- result.description = "product of:"
200
- result << sum_expl
201
- result << Explanation.new(coord_factor, "coord(#{coord}/#{max_coord})")
202
- result.value = sum * coord_factor
203
- return result
204
- end
205
- end
206
- end #end BooleanWeight
207
-
208
- def create_weight(searcher)
209
- return BooleanWeight.new(self, searcher)
210
- end
211
-
212
- def rewrite(reader)
213
- if @clauses.size == 1 # optimize 1-clause queries
214
- clause = @clauses[0]
215
- if not clause.prohibited? # just return clause
216
-
217
- query = clause.query.rewrite(reader) # rewrite first
218
-
219
- if boost() != 1.0 # incorporate boost
220
- if query == clause.query # if rewrite was no-op
221
- query = query.clone # then clone before boost
222
- end
223
- query.boost = boost() * query.boost()
224
- end
225
-
226
- return query
227
- end
228
- end
229
-
230
- clone = nil # recursively rewrite
231
- @clauses.each_with_index do |clause, i|
232
- query = clause.query().rewrite(reader)
233
- if query != clause.query() # clause rewrote: must clone
234
- clone ||= clone()
235
- clone.clauses[i] = BooleanClause.new(query, clause.occur)
236
- end
237
- end
238
- if (clone != nil)
239
- return clone # some clauses rewrote
240
- else
241
- return self # no clauses rewrote
242
- end
243
- end
244
-
245
- def extract_terms(terms)
246
- @clauses.each do |clause|
247
- clause.query.extract_terms(terms)
248
- end
249
- end
250
-
251
- def initialize_copy(o)
252
- super
253
- @clauses = o.clauses.clone
254
- end
255
-
256
- # Prints a user-readable version of this query.
257
- def to_s(field = nil)
258
- buffer = ""
259
- buffer << "(" if boost != 1.0
260
-
261
- @clauses.each_with_index do |clause, i|
262
- if clause.prohibited?
263
- buffer << "-"
264
- elsif clause.required?
265
- buffer << "+"
266
- end
267
-
268
- sub_query = clause.query
269
- if sub_query.instance_of? BooleanQuery # wrap sub-bools in parens
270
- buffer << "(#{clause.query.to_s(field)})"
271
- else
272
- buffer << clause.query.to_s(field)
273
- end
274
-
275
- if i != (@clauses.size - 1)
276
- buffer << " "
277
- end
278
- end
279
-
280
- buffer << ")^#{boost}" if boost() != 1.0
281
-
282
- return buffer
283
- end
284
-
285
- # Returns true iff +other+ is equal to this.
286
- def eql?(other)
287
- if not other.instance_of?(BooleanQuery)
288
- return false
289
- end
290
- return (boost() == other.boost() and @clauses == other.clauses)
291
- end
292
- alias :== :eql?
293
-
294
- # Returns a hash code value for this object.
295
- def hash()
296
- return boost().hash ^ @clauses.hash
297
- end
298
- end
299
- end