ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,261 +0,0 @@
1
- module Ferret::Search
2
-
3
- # Implements searching multiple IndexSearchers at once
4
- #
5
- # Applications usually need only call the @link #search(Query)
6
- # or @link #search(Query,Filter) methods. For performance reasons it is
7
- # recommended to open only one Searcher and use it for all of your searches.
8
- class MultiSearcher
9
- include Ferret::Index
10
-
11
- attr_accessor :similarity, :searchers
12
-
13
- # Creates a MultiSearcher searching across all the searchers
14
- # in the provided array.
15
- #
16
- def initialize(args)
17
- @searchers = Array.new(args)
18
- @similarity = Similarity.default
19
-
20
- # initialize reader lookup array
21
- @max_doc = 0
22
- @starts = Array.new(@searchers.size + 1)
23
- @searchers.each_with_index { |searcher, i|
24
- @starts[i] = @max_doc
25
- @max_doc += searcher.max_doc
26
- }
27
- @starts[@searchers.size] = @max_doc
28
- end
29
-
30
- # closes all underlying Searchers
31
- def close()
32
- @searchers.each { |searcher| searcher.close() }
33
- end
34
-
35
- # Expert: Returns the number of documents containing +term+.
36
- # Called by search code to compute term weights.
37
- # See IndexReader#doc_freq
38
- def doc_freq(term)
39
- return @searchers.inject(0) { |df, searcher|
40
- df + searcher.doc_freq(term)
41
- }
42
- end
43
-
44
- # Expert: For each term in the terms array, calculates the number of
45
- # documents containing +term+. Returns an array with these
46
- # document frequencies. Used to minimize number of remote calls.
47
- def doc_freqs(terms)
48
- result = Array.new
49
- terms.each {|term, i| result << doc_freq(term)}
50
- return result
51
- end
52
-
53
- # Expert: Returns the stored fields of document +n+.
54
- #
55
- # See IndexReader#get_document
56
- def doc(n)
57
- i = sub_searcher(n)
58
- return @searchers[i].doc(n - @starts[i])
59
- end
60
-
61
- # Returns index of the searcher for document <code>n</code> in the
62
- # array used to construct this searcher.
63
- def sub_searcher(n)
64
- lo = 0 # search starts array
65
- hi = @searchers.size - 1 # for first element less
66
- # than n, return its index
67
- while hi >= lo do
68
- mid = (lo + hi) >> 1
69
- midValue = @starts[mid]
70
- if n < midValue
71
- hi = mid - 1;
72
- elsif n > midValue
73
- lo = mid + 1;
74
- else # found a match
75
- while mid+1 < @searchers.size && @starts[mid+1] == midValue do
76
- mid += 1 # scan to last match
77
- end
78
- return mid
79
- end
80
- end
81
- return hi
82
- end
83
-
84
- # Returns the document number of document <code>n</code> within its
85
- # sub-index.
86
- def sub_doc(n)
87
- return n - @starts[sub_searcher(n)]
88
- end
89
-
90
- # Expert: Returns one greater than the largest possible document number.
91
- # Called by search code to compute term weights.
92
- # See IndexReader#max_doc
93
- def max_doc
94
- return @max_doc
95
- end
96
-
97
- # Create weight in multiple index scenario.
98
- #
99
- # Distributed query processing is done in the following steps:
100
- # 1. rewrite query
101
- # 2. extract necessary terms
102
- # 3. collect dfs for these terms from the Searchables
103
- # 4. create query weight using aggregate dfs.
104
- # 5. distribute that weight to Searchables
105
- # 6. merge results
106
- #
107
- # Steps 1-4 are done here, 5+6 in the search() methods
108
- def create_weight(query)
109
- # step 1
110
- rewritten_query = self.rewrite(query)
111
-
112
- # step 2
113
- terms = Set.new
114
- rewritten_query.extract_terms(terms)
115
-
116
- # step 3
117
- aggregated_dfs = Array.new(terms.size, 0)
118
- @searchers.each { |searcher|
119
- dfs = searcher.doc_freqs(terms)
120
- dfs.each_with_index { |df,i|
121
- aggregated_dfs[i] += df
122
- }
123
- }
124
-
125
- df_map = Hash.new
126
- terms.each_with_index { |term,i|
127
- df_map[term] = aggregated_dfs[i]
128
- }
129
-
130
- # step 4
131
- cache_sim = CachedDfSource.new(df_map, self.max_doc, self.similarity)
132
-
133
- return rewritten_query.weight(cache_sim)
134
- end
135
-
136
-
137
- def search(query, options = {})
138
- filter = options[:filter]
139
- first_doc = options[:first_doc]||0
140
- num_docs = options[:num_docs]||10
141
- max_size = first_doc + num_docs
142
- sort = options[:sort]
143
-
144
- if (num_docs <= 0)
145
- raise ArgumentError, "num_docs must be > 0 to run a search"
146
- end
147
-
148
- if (first_doc < 0)
149
- raise ArgumentError, "first_doc must be >= 0 to run a search"
150
- end
151
-
152
-
153
- if (sort)
154
- raise NotImplementedError
155
- #fields = sort.is_a?(Array) ? sort : sort.fields
156
- #hq = FieldDocSortedHitQueue.new(fields, max_size)
157
- else
158
- hq = HitQueue.new(max_size)
159
- end
160
-
161
- total_hits = 0
162
- weight = create_weight(query)
163
- @searchers.each_with_index { |searcher,i| # search each searcher
164
- docs = searcher.search(weight,
165
- :filter => filter,
166
- #:sort => sort,
167
- :num_docs => max_size,
168
- :first_doc => 0)
169
- total_hits += docs.total_hits # update total_hits
170
- docs.score_docs.each { |score_doc|
171
- score_doc.doc += @starts[i] # convert doc
172
- break unless hq.insert(score_doc) # no more scores > min_score
173
- }
174
- }
175
-
176
- score_docs = []
177
- if (hq.size > first_doc)
178
- if (hq.size - first_doc) < num_docs
179
- num_docs = hq.size - first_doc
180
- end
181
- num_docs.times do
182
- score_docs.unshift(hq.pop)
183
- end
184
- end
185
- hq.clear
186
-
187
- return TopDocs.new(total_hits, score_docs)
188
- end
189
-
190
- def search_each(query, filter = nil, &block)
191
- weight = create_weight(query)
192
- @searchers.each { |searcher| # search each searcher
193
- searcher.search_each(weight, filter, &block)
194
- }
195
- end
196
-
197
- # rewrites the query into a query that can be processed by the search
198
- # methods. For example, a Fuzzy query is turned into a massive boolean
199
- # query.
200
- #
201
- # original:: The original query to be rewritten.
202
- def rewrite(original)
203
- #print "multi_searcher#rewrite: #{original}\n"
204
- queries = []
205
- @searchers.each { |searcher|
206
- queries << searcher.rewrite(original)
207
- }
208
- return queries.first.combine(queries)
209
- end
210
-
211
- # Returns an Explanation that describes how +doc+ scored against
212
- # +query+.
213
- #
214
- # This is intended to be used in developing Similarity implementations,
215
- # and, for good performance, should not be displayed with every hit.
216
- # Computing an explanation is as expensive as executing the query over the
217
- # entire index.
218
- def explain(query, doc)
219
- i = sub_searcher(doc)
220
- return @searchers[i].explain(create_weight(query), doc-@starts[i])
221
- end
222
-
223
- end
224
-
225
-
226
- # Document Frequency cache acting as a Dummy-Searcher.
227
- # This class is no full-fledged Searcher, but only supports
228
- # the methods necessary to initialize Weights.
229
- class CachedDfSource
230
-
231
- attr_reader :max_doc, :similarity
232
-
233
- def initialize(df_map, max_doc, similarity)
234
- @df_map = df_map
235
- @max_doc = max_doc
236
- @similarity = similarity
237
- end
238
-
239
- def doc_freq(term)
240
- return @df_map[term]
241
- end
242
-
243
- def doc_freqs(terms)
244
- result = Array.new
245
- terms.each { |term|
246
- result << doc_freq(term)
247
- }
248
- return result
249
- end
250
-
251
- def rewrite(query)
252
- # this is a bit of a hack. We know that a query which
253
- # creates a Weight based on this Dummy-Searcher is
254
- # always already rewritten (see preparedWeight()).
255
- # Therefore we just return the unmodified query here
256
- return query
257
- end
258
-
259
- end
260
-
261
- end
@@ -1,65 +0,0 @@
1
- module Ferret::Search
2
- # A Query that matches documents containing a subset of terms provided
3
- # by a FilteredTermEnum enumeration.
4
- #
5
- # +MultiTermQuery+ is not designed to be used by itself. The reason being
6
- # that it is not intialized with a FilteredTermEnum enumeration. A
7
- # FilteredTermEnum enumeration needs to be provided.
8
- #
9
- # For example, WildcardQuery and FuzzyQuery extend +MultiTermQuery+ to
10
- # provide WildcardTermEnum and FuzzyTermEnum, respectively.
11
- class MultiTermQuery < Query
12
- attr_reader :term
13
-
14
- # Constructs a query for terms matching +term+.
15
- def initialize(term)
16
- super()
17
- @term = term
18
- end
19
-
20
- # Construct the enumeration to be used, expanding the pattern term.
21
- def get_term_enum(reader)
22
- raise NotImplementedError
23
- end
24
-
25
-
26
- def rewrite(reader)
27
- enumerator = get_term_enum(reader)
28
- bq = BooleanQuery.new(true)
29
- begin
30
- begin
31
- t = enumerator.term()
32
- if (t != nil)
33
- tq = TermQuery.new(t) # found a match
34
- tq.boost = boost() * enumerator.difference() # set the boost
35
- bq.add_query(tq, BooleanClause::Occur::SHOULD) # add to query
36
- end
37
- end while enumerator.next?
38
- ensure
39
- enumerator.close()
40
- end
41
- return bq
42
- end
43
-
44
- # Prints a user-readable version of this query.
45
- def to_s(field = nil)
46
- buffer = ""
47
- buffer << "#{@term.field}:" if @term.field != field
48
- buffer << @term.text
49
- buffer << "^#{boost()}" if (boost() != 1.0)
50
- return buffer
51
- end
52
-
53
- def eql?(o)
54
- if not o.instance_of? MultiTermQuery
55
- return false
56
- end
57
- return term == o.term
58
- end
59
- alias :== :eql?
60
-
61
- def hash()
62
- return term.hash()
63
- end
64
- end
65
- end
@@ -1,22 +0,0 @@
1
- module Ferret::Search
2
- # A scorer that matches no document at all.
3
- class NonMatchingScorer < Scorer
4
- def initialize()
5
- super(nil) # no similarity used
6
- end
7
-
8
- def next?
9
- return false
10
- end
11
-
12
- def skip_to(target)
13
- return false
14
- end
15
-
16
- def explain(doc)
17
- e = Explanation.new()
18
- e.description = "No document matches."
19
- return e
20
- end
21
- end
22
- end
@@ -1,55 +0,0 @@
1
- module Ferret::Search
2
- class PhrasePositions
3
- attr_reader :doc, :position
4
- attr_accessor :next
5
-
6
- def initialize(tp_enum, offset)
7
- @tp_enum = tp_enum
8
- @offset = offset
9
- @count = @position = @doc = -1
10
- @next = nil
11
- end
12
-
13
- def next?()
14
- if not @tp_enum.next?
15
- @tp_enum.close() # close stream
16
- @doc = Scorer::MAX_DOCS # sentinel value
17
- return false
18
- end
19
- @doc = @tp_enum.doc
20
- @position = 0
21
- return true
22
- end
23
-
24
- def skip_to(target)
25
- if not @tp_enum.skip_to(target)
26
- @tp_enum.close() # close stream
27
- @doc = Scorer::MAX_DOCS # sentinel value
28
- return false
29
- end
30
- @doc = @tp_enum.doc
31
- @position = 0
32
- return true
33
- end
34
-
35
-
36
- def first_position()
37
- @count = @tp_enum.freq # read first pos
38
- next_position()
39
- end
40
-
41
- def next_position()
42
- @count -= 1
43
- if @count >= 0 # read subsequent pos's
44
- @position = @tp_enum.next_position() - @offset
45
- return true
46
- else
47
- return false
48
- end
49
- end
50
-
51
- def to_s
52
- "pp->(doc => #{@doc}, position => #{position})"
53
- end
54
- end
55
- end
@@ -1,214 +0,0 @@
1
- module Ferret::Search
2
- # A Query that matches documents containing a particular sequence of terms.
3
- # A PhraseQuery is built by QueryParser for input like +"new york"+.
4
- #
5
- # This query may be combined with other terms or queries with a BooleanQuery.
6
- class PhraseQuery < Query
7
- def initialize()
8
- super
9
- @slop = 0
10
- @terms = []
11
- @positions = []
12
- @field = nil
13
- end
14
-
15
- # Sets the number of other words permitted between words in query phrase.
16
- # If zero, then this is an exact phrase search. For larger values this
17
- # works like a +WITHIN+ or +NEAR+ operator.
18
- #
19
- # The slop is in fact an edit-distance, where the units correspond to
20
- # moves of terms in the query phrase out of position. For example, to
21
- # switch the order of two words requires two moves (the first move places
22
- # the words atop one another), so to permit re-orderings of phrases, the
23
- # slop must be at least two.
24
- #
25
- # More exact matches are scored higher than sloppier matches, thus search
26
- # results are sorted by exactness.
27
- #
28
- # The slop is zero by default, requiring exact matches.
29
- attr_accessor :slop
30
- attr_reader :terms, :positions, :field
31
-
32
- # Adds a term to the end of the query phrase.
33
- #
34
- # The relative position of the term is the one immediately after the last
35
- # term added, unless explicitly specified. By specifying explicitly,
36
- # you can have phrases with more than one term at the same position or
37
- # phrases with gaps (e.g. in connection with stopwords).
38
- #
39
- # term:: the term to search for
40
- # position:: the relative position of the term to the rest of the terms
41
- # int the query.
42
- def add(term, position = nil, pos_inc = 1)
43
- if position.nil?
44
- position = (@positions.size > 0) ? (@positions[-1] + pos_inc) : 0
45
- end
46
-
47
- if @terms.size == 0
48
- @field = term.field
49
- elsif (term.field != @field)
50
- raise ArgumentError, "All phrase terms must be in the same field: #{term}"
51
- end
52
-
53
- @terms << term
54
- @positions << position
55
- end
56
-
57
- def <<(term)
58
- add(term)
59
- return self
60
- end
61
-
62
- class PhraseWeight < Weight
63
- attr_reader :query, :value
64
-
65
- def initialize(query, searcher)
66
- @query = query
67
- @similarity = query.similarity(searcher)
68
- @idf = @similarity.idf_phrase(@query.terms, searcher)
69
- end
70
-
71
- def to_s() return "phrase_weight(#{@value})" end
72
-
73
- def sum_of_squared_weights()
74
- @query_weight = @idf * @query.boost() # compute query weight
75
- return @query_weight * @query_weight # square it
76
- end
77
-
78
- def normalize(query_norm)
79
- @query_norm = query_norm
80
- @query_weight *= query_norm # normalize query weight
81
- @value = @query_weight * @idf # idf for document
82
- end
83
-
84
- def scorer(reader)
85
- return nil if @query.terms.size == 0 # optimize zero-term case
86
-
87
- tps = []
88
- @query.terms.each do |term|
89
- tp = reader.term_positions_for(term)
90
- return nil if tp.nil?
91
- tps << tp
92
- end
93
-
94
- if (@query.slop == 0) # optimize exact case
95
- return ExactPhraseScorer.new(self, tps, @query.positions,
96
- @similarity,
97
- reader.get_norms(@query.field))
98
- else
99
- return SloppyPhraseScorer.new(self, tps, @query.positions,
100
- @similarity,
101
- @query.slop,
102
- reader.get_norms(@query.field))
103
- end
104
- end
105
-
106
- def explain(reader, doc)
107
- result = Explanation.new()
108
- result.description = "weight(#{@query} in #{doc}), product of:"
109
-
110
- doc_freqs = @query.terms.map do |term|
111
- "#{term.text}=#{reader.doc_freq(term)}"
112
- end.join(", ")
113
-
114
- idf_expl = Explanation.new(@idf, "idf(#{@query.field}:<#{doc_freqs}>)")
115
-
116
- # explain query weight
117
- query_expl = Explanation.new()
118
- query_expl.description = "query_weight(#{@query}), product of:"
119
-
120
- boost = @query.boost()
121
- if boost != 1.0
122
- boost_expl = Explanation.new(boost, "boost")
123
- query_expl << boost_expl
124
- end
125
- query_expl << idf_expl
126
-
127
- query_norm_expl = Explanation.new(@query_norm, "query_norm")
128
- query_expl << query_norm_expl
129
-
130
- query_expl.value = boost * @idf * @query_norm
131
-
132
- result << query_expl
133
-
134
- # explain field weight
135
- field_expl = Explanation.new()
136
- field_expl.description =
137
- "field_weight(#{query} in #{doc}), product of:"
138
-
139
- tf_expl = scorer(reader).explain(doc)
140
- field_expl << tf_expl
141
- field_expl << idf_expl
142
-
143
- field_norm_expl = Explanation.new()
144
- field_norms = reader.get_norms(@query.field)
145
- field_norm =
146
- field_norms ? Similarity.decode_norm(field_norms[doc]) : 0.0
147
- field_norm_expl.value = field_norm
148
- field_norm_expl.description =
149
- "field_norm(field=#{@query.field}, doc=#{doc})"
150
- field_expl << field_norm_expl
151
-
152
- field_expl.value = tf_expl.value * @idf * field_norm
153
- result << field_expl
154
-
155
- if (query_expl.value == 1.0)
156
- return field_expl
157
- else
158
- result.value = query_expl.value * field_expl.value
159
- return result
160
- end
161
- end
162
- end
163
-
164
- def create_weight(searcher)
165
- if @terms.size == 1 # optimize one-term case
166
- term = @terms[0]
167
- tq = TermQuery.new(term)
168
- tq.boost = boost()
169
- return tq.create_weight(searcher)
170
- end
171
- return PhraseWeight.new(self, searcher)
172
- end
173
-
174
- # See Query#extract_terms()
175
- def extract_terms(query_terms)
176
- query_terms.merge(@terms)
177
- end
178
-
179
- # Prints a user-readable version of this query.
180
- def to_s(f=nil)
181
- buffer = ""
182
- buffer << "#{@field}:" if @field != f
183
- buffer << '"'
184
- last_pos = -1
185
- @terms.each_index do |i|
186
- term = @terms[i]
187
- pos = @positions[i]
188
- last_pos.upto(pos-2) {buffer << "<> "}
189
- last_pos = pos
190
- buffer << "#{term.text} "
191
- end
192
- buffer.rstrip!
193
- buffer << '"'
194
- buffer << "~#{slop}" if (slop != 0)
195
- buffer << "^#{boost()}" if boost() != 1.0
196
- return buffer
197
- end
198
-
199
- # Returns true iff +o+ is equal to this.
200
- def eql?(o)
201
- if not o.instance_of? PhraseQuery
202
- return false
203
- end
204
- return (boost() == o.boost() and @slop == o.slop and
205
- @terms == o.terms and @positions == o.positions)
206
- end
207
- alias :== :eql?
208
-
209
- # Returns a hash code value for this object.
210
- def hash()
211
- return boost().hash ^ slop.hash ^ @terms.hash ^ @positions.hash
212
- end
213
- end
214
- end