ferret 0.9.6 → 0.10.0

Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/search/hit_collector.rb
@@ -1,34 +0,0 @@
- module Ferret::Search
-   # Lower-level search API.
-   #
-   # HitCollectors are primarily meant to be used to implement queries, sorting
-   # and filtering.
-   #
-   # See Searcher#search(Query, HitCollector)
-   class HitCollector
-     # Called once for every non-zero scoring document, with the document number
-     # and its score.
-     #
-     # If, for example, an application wished to collect all of the hits for a
-     # query in a BitSet, then it might:
-     #
-     #   searcher = IndexSearcher.new(index_reader)
-     #   bits = BitSet.new(index_reader.max_doc())
-     #   searcher.search(query, HitCollector.new()
-     #     def collect(doc, score)
-     #       bits.set(doc)
-     #     end
-     #   end
-     #
-     # NOTE: This is called in an inner search loop. For good search
-     # performance, implementations of this method should not call
-     # Searcher#doc(int) or IndexReader#document(int) on every document number
-     # encountered. Doing so can slow searches by an order of magnitude or more.
-     #
-     # NOTE: The +score+ passed to this method is a raw score. In other words,
-     # the score will not necessarily be a float whose value is between 0 and 1.
-     def collect(doc, score)
-       raise NotImplementedError
-     end
-   end
- end
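The example in the comment above is a leftover from the Java (Lucene) docs; in Ruby a collector is simply a subclass that overrides #collect. A minimal sketch, collecting matching doc ids into a plain Array (the BitCollector name is made up for illustration):

    class BitCollector < Ferret::Search::HitCollector
      attr_reader :docs

      def initialize
        @docs = []
      end

      # called once per non-zero scoring document; the raw score is ignored here
      def collect(doc, score)
        @docs << doc
      end
    end

In practice the pure-Ruby tree usually reaches the same result through IndexSearcher#search_each with a block, shown further below.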
data/lib/ferret/search/hit_queue.rb
@@ -1,11 +0,0 @@
- module Ferret::Search
-   class HitQueue < Ferret::Utils::PriorityQueue
-     def less_than(hit1, hit2)
-       if (hit1.score == hit2.score)
-         return hit1.doc > hit2.doc
-       else
-         return hit1.score < hit2.score
-       end
-     end
-   end
- end
data/lib/ferret/search/index_searcher.rb
@@ -1,200 +0,0 @@
- module Ferret::Search
-
-   # Implements search over a single IndexReader.
-   #
-   # Applications usually need only call the inherited @link #search(Query)end
-   # or @link #search(Query,Filter)endmethods. For performance reasons it is
-   # recommended to open only one IndexSearcher and use it for all of your searches.
-   class IndexSearcher
-     include Ferret::Index
-
-     attr_accessor :similarity, :reader
-
-     # Creates a searcher searching the index in the provided directory.
-     #
-     # You need to pass one argument which should be one of the following:
-     #
-     # * An index reader which the searcher will search
-     # * A directory where the searcher will open an index reader to search
-     # * A string which represents a path to the directory to be searched
-     #
-     def initialize(arg)
-       if arg.is_a?(IndexReader)
-         @reader = arg
-       elsif arg.is_a?(Ferret::Store::Directory)
-         @reader = IndexReader.open(arg, false)
-       elsif arg.is_a?(String)
-         @dir = Ferret::Store::FSDirectory.new(arg, false)
-         @reader = IndexReader.open(@dir, true)
-       else
-         raise ArgumentError, "Unknown argument passed to initialize IndexReader"
-       end
-
-       @similarity = Similarity.default
-     end
-
-     # IndexSearcher was constructed with IndexSearcher(r).
-     # If the IndexReader was supplied implicitly by specifying a directory, then
-     # the IndexReader gets closed.
-     def close()
-       @reader.close()
-     end
-
-     # Expert: Returns the number of documents containing +term+.
-     # Called by search code to compute term weights.
-     # See IndexReader#doc_freq
-     def doc_freq(term)
-       return @reader.doc_freq(term)
-     end
-
-     # Expert: For each term in the terms array, calculates the number of
-     # documents containing +term+. Returns an array with these
-     # document frequencies. Used to minimize number of remote calls.
-     def doc_freqs(terms)
-       result = Array.new(terms.length)
-       terms.each_with_index {|term, i| result[i] = doc_freq(term)}
-       return result
-     end
-
-     # Expert: Returns the stored fields of document +i+.
-     #
-     # See IndexReader#get_document
-     def doc(i)
-       return @reader.get_document(i)
-     end
-
-     # Expert: Returns one greater than the largest possible document number.
-     # Called by search code to compute term weights.
-     # See IndexReader#max_doc
-     def max_doc()
-       return @reader.max_doc()
-     end
-
-     # Creates a weight for +query+
-     # returns:: new weight
-     def create_weight(query)
-       return query.weight(self)
-     end
-
-     # The main search method for the index. You need to create a query to
-     # pass to this method. You can also pass a hash with one or more of the
-     # following; {filter, num_docs, first_doc, sort}
-     #
-     # query::     The query to run on the index
-     # filter::    filters docs from the search result
-     # first_doc:: The index in the results of the first doc retrieved.
-     #             Default is 0
-     # num_docs::  The number of results returned. Default is 10
-     # sort::      An array of SortFields describing how to sort the results.
-     def search(query, options = {})
-       filter = options[:filter]
-       first_doc = options[:first_doc]||0
-       num_docs = options[:num_docs]||10
-       max_size = first_doc + num_docs
-       sort = options[:sort]
-       if sort and not sort.kind_of?(Sort)
-         sort = Sort.new(sort)
-       end
-
-       if (num_docs <= 0)
-         raise ArgumentError, "num_docs must be > 0 to run a search"
-       end
-
-       if (first_doc < 0)
-         raise ArgumentError, "first_doc must be >= 0 to run a search"
-       end
-
-       # for MultiSearcher: the weight is computed across all searchers
-       if query.is_a? Weight
-         scorer = query.scorer(@reader)
-       else
-         scorer = query.weight(self).scorer(@reader)
-       end
-
-       if (scorer == nil)
-         return TopDocs.new(0, [])
-       end
-
-       bits = (filter.nil? ? nil : filter.bits(@reader))
-       if (sort)
-         fields = sort.is_a?(Array) ? sort : sort.fields
-         hq = FieldSortedHitQueue.new(@reader, fields, max_size)
-       else
-         hq = HitQueue.new(max_size)
-       end
-       total_hits = 0
-       scorer.each_hit() do |doc, score|
-         if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
-           total_hits += 1
-           hq.insert(ScoreDoc.new(doc, score))
-         end
-       end
-
-       score_docs = []
-       if (hq.size > first_doc)
-         if (hq.size - first_doc) < num_docs
-           num_docs = hq.size - first_doc
-         end
-         num_docs.times do
-           score_docs.unshift(hq.pop)
-         end
-       end
-       hq.clear
-
-       return TopDocs.new(total_hits, score_docs)
-     end
-
-     # Accepts a block and iterates through all of results yielding the doc
-     # number and the score for that hit. The hits are unsorted. This is the
-     # fastest way to get all of the hits from a search. However, you will
-     # usually want your hits sorted at least by score so you should use the
-     # #search method.
-     def search_each(query, filter = nil)
-       # for MultiSearcher: the weight is computed across all searchers
-       if query.is_a? Weight
-         scorer = query.scorer(@reader)
-       else
-         scorer = query.weight(self).scorer(@reader)
-       end
-       return if scorer == nil
-       bits = (filter.nil? ? nil : filter.bits(@reader))
-       scorer.each_hit() do |doc, score|
-         if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
-           yield(doc, score)
-         end
-       end
-     end
-
-     # rewrites the query into a query that can be processed by the search
-     # methods. For example, a Fuzzy query is turned into a massive boolean
-     # query.
-     #
-     # original:: The original query to be rewritten.
-     def rewrite(original)
-       query = original
-       rewritten_query = query.rewrite(@reader)
-       while query != rewritten_query
-         query = rewritten_query
-         rewritten_query = query.rewrite(@reader)
-       end
-       return query
-     end
-
-     # Returns an Explanation that describes how +doc+ scored against
-     # +query+.
-     # A weight may be given as first parameter instead of the query, too.
-     #
-     # This is intended to be used in developing Similarity implementations,
-     # and, for good performance, should not be displayed with every hit.
-     # Computing an explanation is as expensive as executing the query over the
-     # entire index.
-     def explain(query, doc)
-       if query.is_a? Weight
-         weight = query
-       else
-         weight = query.weight(self)
-       end
-       return weight.explain(@reader, doc)
-     end
-   end
- end
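A sketch of how the API removed above was typically driven; reader, query, filter and sort_fields stand in for objects built elsewhere:

    searcher = Ferret::Search::IndexSearcher.new(reader)

    # paged, optionally filtered and sorted search; returns a TopDocs
    top_docs = searcher.search(query,
                               :filter    => filter,       # optional Filter
                               :first_doc => 10,           # offset, default 0
                               :num_docs  => 10,           # page size, default 10
                               :sort      => sort_fields)  # array of SortFields

    # unsorted iteration over every hit; the fastest way to see all results
    searcher.search_each(query) do |doc, score|
      puts "doc #{doc} scored #{score}"
    end

    searcher.close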
data/lib/ferret/search/match_all_query.rb
@@ -1,104 +0,0 @@
- module Ferret::Search
-   # A query that matches all documents.
-   class MatchAllQuery < Query
-
-     def initialize()
-       super
-     end
-
-     class MatchAllScorer < Scorer
-
-       def initialize(reader, similarity)
-         super(similarity)
-         @reader = reader
-         @count = -1
-         @max_doc = reader.max_doc
-       end
-
-       def doc()
-         return @count
-       end
-
-       def explain(doc)
-         return Explanation.new(1.0, "MatchAllQuery")
-       end
-
-       def next?
-         while (@count < (@max_doc - 1))
-           @count += 1
-           if (!@reader.deleted?(@count))
-             return true
-           end
-         end
-         return false
-       end
-
-       def score()
-         return 1.0
-       end
-
-       def skip_to(target)
-         @count = target - 1
-         return next?
-       end
-     end
-
-     class MatchAllWeight < Weight
-       attr_reader :query
-       def initialize(query, searcher)
-         @query = query
-         @searcher = searcher
-       end
-
-       def to_s()
-         return "weight(#{@query})"
-       end
-
-       def value()
-         return 1.0
-       end
-
-       def sum_of_squared_weights()
-         return 1.0
-       end
-
-       def normalize(query_norm)
-       end
-
-       def scorer(reader)
-         return MatchAllScorer.new(reader, @query.similarity(@searcher))
-       end
-
-       def explain(reader, doc)
-         # explain query weight
-         query_expl = Explanation.new(1.0, "MatchAllQuery")
-         boost_expl = Explanation.new(@query.boost, "boost")
-         if (boost_expl.value != 1.0)
-           query_expl << boost_expl
-           query_expl.value = boost_expl.value
-         end
-
-         return query_expl
-       end
-     end
-
-     def create_weight(searcher)
-       return MatchAllWeight.new(self, searcher)
-     end
-
-     def to_s(field)
-       buffer = "MatchAllQuery"
-       buffer << "^#{boost}" if (boost() != 1.0)
-       return buffer
-     end
-
-     def eql?(o)
-       return (o.instance_of?(MatchAllQuery) and boost == o.boost)
-     end
-     alias :== :eql?
-
-     def hash
-       return boost.hash
-     end
-   end
- end
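A small sketch of the behaviour encoded above: every undeleted document matches with a raw score of 1.0, and only the boost distinguishes two MatchAllQuery instances ("contents" is an arbitrary field name):

    q = Ferret::Search::MatchAllQuery.new
    q.boost = 2.0
    q.to_s("contents")                        # => "MatchAllQuery^2.0"
    q.eql?(Ferret::Search::MatchAllQuery.new) # => false, boosts differ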
data/lib/ferret/search/multi_phrase_query.rb
@@ -1,216 +0,0 @@
- module Ferret::Search
-   # MultiPhraseQuery is a generalized version of PhraseQuery, with an added
-   # method #add(Term[]).
-   #
-   # To use this class, to search for the phrase "Microsoft app*" first use
-   # add(Term) on the term "Microsoft", then find all terms that have "app" as
-   # prefix using IndexReader.terms(Term), and use MultiPhraseQuery.add(Term[]
-   # terms) to add them to the query.
-   #
-   # Author Anders Nielsen
-   class MultiPhraseQuery < Query
-     include Ferret::Index
-
-     attr_accessor :slop
-     attr_reader :positions, :term_arrays, :field
-
-     def initialize()
-       super()
-       @slop = 0
-       @term_arrays = []
-       @positions = []
-       @field = nil
-     end
-
-     # Allows to specify the relative position of terms within the phrase.
-     #
-     # See PhraseQuery#add(Term, int)
-     # terms::    the array of terms to search for or a single term
-     # position:: the position to search for these terms
-     def add(terms, position = nil, pos_inc = 1)
-       if position.nil?
-         position = (@positions.size > 0) ? (@positions[-1] + pos_inc) : 0
-       end
-
-       if terms.instance_of?(Term)
-         terms = [terms]
-       end
-
-       if (@term_arrays.size == 0)
-         @field = terms[0].field
-       end
-
-       terms.each do |term|
-         if (term.field != @field)
-           raise ArgumentError,
-             "All phrase terms must be in the same field (#{@field}): #{term}"
-         end
-       end
-
-       if i = @positions.index(position)
-         term_arrays[i] += terms
-       else
-         @term_arrays << terms
-         @positions << position
-       end
-     end
-     alias :<< :add
-
-     class MultiPhraseWeight < Weight
-       include Ferret::Index
-
-       attr_reader :query, :value
-
-       def initialize(query, searcher)
-         @query = query
-         @term_arrays = query.term_arrays
-         @positions = query.positions
-         @similarity = query.similarity(searcher)
-         @idf = 0.0
-
-         # compute idf
-         query.term_arrays.each do |terms|
-           terms.each do |term|
-             @idf += @similarity.idf_term(term, searcher)
-           end
-         end
-       end
-
-       def sum_of_squared_weights()
-         @query_weight = @idf * @query.boost() # compute query weight
-         return @query_weight * @query_weight  # square it
-       end
-
-       def normalize(query_norm)
-         @query_norm = query_norm
-         @query_weight *= query_norm # normalize query weight
-         @value = @query_weight * @idf # idf for document
-       end
-
-       def scorer(reader)
-         return nil if (@term_arrays.size == 0) # optimize zero-term case
-         tps = []
-         @term_arrays.each do |terms|
-           p = []
-           if (terms.length > 1)
-             p = MultipleTermDocPosEnum.new(reader, terms)
-           else
-             p = reader.term_positions_for(terms[0])
-           end
-
-           return nil if (p == nil)
-
-           tps << p
-         end
-
-         if (@query.slop == 0)
-           return ExactPhraseScorer.new(self, tps, @positions, @similarity,
-                                        reader.get_norms(@query.field))
-         else
-           return SloppyPhraseScorer.new(self, tps, @positions, @similarity,
-                                         @query.slop, reader.get_norms(@query.field))
-         end
-       end
-
-       def explain(reader, doc)
-
-         result = Explanation.new()
-         result.description = "weight(#{@query} in #{doc}), product of:"
-
-         idf_expl = Explanation.new(@idf, "idf(#{@query})")
-
-         # explain query weight
-         query_expl = Explanation.new()
-         query_expl.description = "query_weight(#{@query}), product of:"
-
-         boost = @query.boost()
-         if boost != 1.0
-           boost_expl = Explanation.new(boost, "boost")
-           query_expl << boost_expl
-         end
-         query_expl << idf_expl
-
-         query_norm_expl = Explanation.new(@query_norm,"query_norm")
-         query_expl << query_norm_expl
-
-         query_expl.value = boost * @idf * @query_norm
-
-         result << query_expl
-
-         # explain field weight
-         field_expl = Explanation.new()
-         field_expl.description =
-           "field_weight(#{@query} in #{doc}), product of:"
-
-         tf_expl = scorer(reader).explain(doc)
-         field_expl << tf_expl
-         field_expl << idf_expl
-
-         field_norm_expl = Explanation.new()
-         field_norms = reader.get_norms(@query.field)
-         field_norm =
-           field_norms ? Similarity.decode_norm(field_norms[doc]) : 0.0
-         field_norm_expl.value = field_norm
-         field_norm_expl.description =
-           "field_norm(field=#{@query.field}, doc=#{doc})"
-         field_expl << field_norm_expl
-
-         field_expl.value = tf_expl.value * idf_expl.value * field_norm_expl.value
-         result << field_expl
-
-         if (query_expl.value == 1.0)
-           return field_expl
-         else
-           result.value = query_expl.value * field_expl.value
-           return result
-         end
-       end
-     end
-
-     def rewrite(reader)
-       if (@term_arrays.size() == 1) # optimize one-term case
-         terms = @term_arrays[0]
-         bq = BooleanQuery.new(true)
-         terms.each do |term|
-           bq.add_query(TermQuery.new(term), BooleanClause::Occur::SHOULD)
-         end
-         bq.boost = boost()
-         return bq
-       else
-         return self
-       end
-     end
-
-     # See Query#extract_terms()
-     def extract_terms(query_terms)
-       @term_arrays.each { |terms|
-         query_terms.merge(terms)
-       }
-     end
-
-     def create_weight(searcher)
-       return MultiPhraseWeight.new(self, searcher)
-     end
-
-     # Prints a user-readable version of this query.
-     def to_s(f = nil)
-       buffer = ""
-       buffer << "#{@field}:" if @field != f
-       buffer << '"'
-       last_pos = -1
-       @term_arrays.each_index do |i|
-         terms = @term_arrays[i]
-         pos = @positions[i]
-         last_pos.upto(pos-2) {buffer << "<> "}
-         last_pos = pos
-         buffer << "#{terms.map {|term| term.text}.join("|")} "
-       end
-       buffer.rstrip!
-       buffer << '"'
-
-       buffer << "~#{@slop}" if (@slop != 0)
-       buffer << "^#{boost()}" if boost() != 1.0
-       return buffer
-     end
-   end
- end
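A sketch of the "Microsoft app*" recipe from the class comment above, assuming the 0.9.x Ferret::Index::Term.new(field, text) constructor and a hand-picked expansion of the "app" prefix instead of a real IndexReader#terms walk:

    include Ferret::Index

    query = Ferret::Search::MultiPhraseQuery.new
    query.add(Term.new("contents", "microsoft"))    # position 0
    query.add([Term.new("contents", "apple"),
               Term.new("contents", "approach")])   # position 1: any of these terms
    query.slop = 1                                  # allow one position of slop

    query.to_s("contents")  # => "microsoft apple|approach"~1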