ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,34 +0,0 @@
1
- module Ferret::Search
2
- # Lower-level search API.
3
- #
4
- # HitCollectors are primarily meant to be used to implement queries, sorting
5
- # and filtering.
6
- #
7
- # See Searcher#search(Query, HitCollector)
8
- class HitCollector
9
- # Called once for every non-zero scoring document, with the document number
10
- # and its score.
11
- #
12
- # If, for example, an application wished to collect all of the hits for a
13
- # query in a BitSet, then it might:
14
- #
15
- # searcher = IndexSearcher.new(index_reader)
16
- # bits = BitSet.new(index_reader.max_doc())
17
- # searcher.search(query, HitCollector.new()
18
- # def collect(doc, score)
19
- # bits.set(doc)
20
- # end
21
- # end
22
- #
23
- # NOTE: This is called in an inner search loop. For good search
24
- # performance, implementations of this method should not call
25
- # Searcher#doc(int) or IndexReader#document(int) on every document number
26
- # encountered. Doing so can slow searches by an order of magnitude or more.
27
- #
28
- # NOTE: The +score+ passed to this method is a raw score. In other words,
29
- # the score will not necessarily be a float whose value is between 0 and 1.
30
- def collect(doc, score)
31
- raise NotImplementedError
32
- end
33
- end
34
- end
@@ -1,11 +0,0 @@
1
- module Ferret::Search
2
- class HitQueue < Ferret::Utils::PriorityQueue
3
- def less_than(hit1, hit2)
4
- if (hit1.score == hit2.score)
5
- return hit1.doc > hit2.doc
6
- else
7
- return hit1.score < hit2.score
8
- end
9
- end
10
- end
11
- end
@@ -1,200 +0,0 @@
1
- module Ferret::Search
2
-
3
- # Implements search over a single IndexReader.
4
- #
5
- # Applications usually need only call the inherited @link #search(Query)end
6
- # or @link #search(Query,Filter)endmethods. For performance reasons it is
7
- # recommended to open only one IndexSearcher and use it for all of your searches.
8
- class IndexSearcher
9
- include Ferret::Index
10
-
11
- attr_accessor :similarity, :reader
12
-
13
- # Creates a searcher searching the index in the provided directory.
14
- #
15
- # You need to pass one argument which should be one of the following:
16
- #
17
- # * An index reader which the searcher will search
18
- # * A directory where the searcher will open an index reader to search
19
- # * A string which represents a path to the directory to be searched
20
- #
21
- def initialize(arg)
22
- if arg.is_a?(IndexReader)
23
- @reader = arg
24
- elsif arg.is_a?(Ferret::Store::Directory)
25
- @reader = IndexReader.open(arg, false)
26
- elsif arg.is_a?(String)
27
- @dir = Ferret::Store::FSDirectory.new(arg, false)
28
- @reader = IndexReader.open(@dir, true)
29
- else
30
- raise ArgumentError, "Unknown argument passed to initialize IndexReader"
31
- end
32
-
33
- @similarity = Similarity.default
34
- end
35
-
36
- # IndexSearcher was constructed with IndexSearcher(r).
37
- # If the IndexReader was supplied implicitly by specifying a directory, then
38
- # the IndexReader gets closed.
39
- def close()
40
- @reader.close()
41
- end
42
-
43
- # Expert: Returns the number of documents containing +term+.
44
- # Called by search code to compute term weights.
45
- # See IndexReader#doc_freq
46
- def doc_freq(term)
47
- return @reader.doc_freq(term)
48
- end
49
-
50
- # Expert: For each term in the terms array, calculates the number of
51
- # documents containing +term+. Returns an array with these
52
- # document frequencies. Used to minimize number of remote calls.
53
- def doc_freqs(terms)
54
- result = Array.new(terms.length)
55
- terms.each_with_index {|term, i| result[i] = doc_freq(term)}
56
- return result
57
- end
58
-
59
- # Expert: Returns the stored fields of document +i+.
60
- #
61
- # See IndexReader#get_document
62
- def doc(i)
63
- return @reader.get_document(i)
64
- end
65
-
66
- # Expert: Returns one greater than the largest possible document number.
67
- # Called by search code to compute term weights.
68
- # See IndexReader#max_doc
69
- def max_doc()
70
- return @reader.max_doc()
71
- end
72
-
73
- # Creates a weight for +query+
74
- # returns:: new weight
75
- def create_weight(query)
76
- return query.weight(self)
77
- end
78
-
79
- # The main search method for the index. You need to create a query to
80
- # pass to this method. You can also pass a hash with one or more of the
81
- # following; {filter, num_docs, first_doc, sort}
82
- #
83
- # query:: The query to run on the index
84
- # filter:: filters docs from the search result
85
- # first_doc:: The index in the results of the first doc retrieved.
86
- # Default is 0
87
- # num_docs:: The number of results returned. Default is 10
88
- # sort:: An array of SortFields describing how to sort the results.
89
- def search(query, options = {})
90
- filter = options[:filter]
91
- first_doc = options[:first_doc]||0
92
- num_docs = options[:num_docs]||10
93
- max_size = first_doc + num_docs
94
- sort = options[:sort]
95
- if sort and not sort.kind_of?(Sort)
96
- sort = Sort.new(sort)
97
- end
98
-
99
- if (num_docs <= 0)
100
- raise ArgumentError, "num_docs must be > 0 to run a search"
101
- end
102
-
103
- if (first_doc < 0)
104
- raise ArgumentError, "first_doc must be >= 0 to run a search"
105
- end
106
-
107
- # for MultiSearcher: the weight is computed across all searchers
108
- if query.is_a? Weight
109
- scorer = query.scorer(@reader)
110
- else
111
- scorer = query.weight(self).scorer(@reader)
112
- end
113
-
114
- if (scorer == nil)
115
- return TopDocs.new(0, [])
116
- end
117
-
118
- bits = (filter.nil? ? nil : filter.bits(@reader))
119
- if (sort)
120
- fields = sort.is_a?(Array) ? sort : sort.fields
121
- hq = FieldSortedHitQueue.new(@reader, fields, max_size)
122
- else
123
- hq = HitQueue.new(max_size)
124
- end
125
- total_hits = 0
126
- scorer.each_hit() do |doc, score|
127
- if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
128
- total_hits += 1
129
- hq.insert(ScoreDoc.new(doc, score))
130
- end
131
- end
132
-
133
- score_docs = []
134
- if (hq.size > first_doc)
135
- if (hq.size - first_doc) < num_docs
136
- num_docs = hq.size - first_doc
137
- end
138
- num_docs.times do
139
- score_docs.unshift(hq.pop)
140
- end
141
- end
142
- hq.clear
143
-
144
- return TopDocs.new(total_hits, score_docs)
145
- end
146
-
147
- # Accepts a block and iterates through all of results yielding the doc
148
- # number and the score for that hit. The hits are unsorted. This is the
149
- # fastest way to get all of the hits from a search. However, you will
150
- # usually want your hits sorted at least by score so you should use the
151
- # #search method.
152
- def search_each(query, filter = nil)
153
- # for MultiSearcher: the weight is computed across all searchers
154
- if query.is_a? Weight
155
- scorer = query.scorer(@reader)
156
- else
157
- scorer = query.weight(self).scorer(@reader)
158
- end
159
- return if scorer == nil
160
- bits = (filter.nil? ? nil : filter.bits(@reader))
161
- scorer.each_hit() do |doc, score|
162
- if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
163
- yield(doc, score)
164
- end
165
- end
166
- end
167
-
168
- # rewrites the query into a query that can be processed by the search
169
- # methods. For example, a Fuzzy query is turned into a massive boolean
170
- # query.
171
- #
172
- # original:: The original query to be rewritten.
173
- def rewrite(original)
174
- query = original
175
- rewritten_query = query.rewrite(@reader)
176
- while query != rewritten_query
177
- query = rewritten_query
178
- rewritten_query = query.rewrite(@reader)
179
- end
180
- return query
181
- end
182
-
183
- # Returns an Explanation that describes how +doc+ scored against
184
- # +query+.
185
- # A weight may be given as first parameter instead of the query, too.
186
- #
187
- # This is intended to be used in developing Similarity implementations,
188
- # and, for good performance, should not be displayed with every hit.
189
- # Computing an explanation is as expensive as executing the query over the
190
- # entire index.
191
- def explain(query, doc)
192
- if query.is_a? Weight
193
- weight = query
194
- else
195
- weight = query.weight(self)
196
- end
197
- return weight.explain(@reader, doc)
198
- end
199
- end
200
- end
@@ -1,104 +0,0 @@
1
- module Ferret::Search
2
- # A query that matches all documents.
3
- class MatchAllQuery < Query
4
-
5
- def initialize()
6
- super
7
- end
8
-
9
- class MatchAllScorer < Scorer
10
-
11
- def initialize(reader, similarity)
12
- super(similarity)
13
- @reader = reader
14
- @count = -1
15
- @max_doc = reader.max_doc
16
- end
17
-
18
- def doc()
19
- return @count
20
- end
21
-
22
- def explain(doc)
23
- return Explanation.new(1.0, "MatchAllQuery")
24
- end
25
-
26
- def next?
27
- while (@count < (@max_doc - 1))
28
- @count += 1
29
- if (!@reader.deleted?(@count))
30
- return true
31
- end
32
- end
33
- return false
34
- end
35
-
36
- def score()
37
- return 1.0
38
- end
39
-
40
- def skip_to(target)
41
- @count = target - 1
42
- return next?
43
- end
44
- end
45
-
46
- class MatchAllWeight < Weight
47
- attr_reader :query
48
- def initialize(query, searcher)
49
- @query = query
50
- @searcher = searcher
51
- end
52
-
53
- def to_s()
54
- return "weight(#{@query})"
55
- end
56
-
57
- def value()
58
- return 1.0
59
- end
60
-
61
- def sum_of_squared_weights()
62
- return 1.0
63
- end
64
-
65
- def normalize(query_norm)
66
- end
67
-
68
- def scorer(reader)
69
- return MatchAllScorer.new(reader, @query.similarity(@searcher))
70
- end
71
-
72
- def explain(reader, doc)
73
- # explain query weight
74
- query_expl = Explanation.new(1.0, "MatchAllQuery")
75
- boost_expl = Explanation.new(@query.boost, "boost")
76
- if (boost_expl.value != 1.0)
77
- query_expl << boost_expl
78
- query_expl.value = boost_expl.value
79
- end
80
-
81
- return query_expl
82
- end
83
- end
84
-
85
- def create_weight(searcher)
86
- return MatchAllWeight.new(self, searcher)
87
- end
88
-
89
- def to_s(field)
90
- buffer = "MatchAllQuery"
91
- buffer << "^#{boost}" if (boost() != 1.0)
92
- return buffer
93
- end
94
-
95
- def eql?(o)
96
- return (o.instance_of?(MatchAllQuery) and boost == o.boost)
97
- end
98
- alias :== :eql?
99
-
100
- def hash
101
- return boost.hash
102
- end
103
- end
104
- end
@@ -1,216 +0,0 @@
1
- module Ferret::Search
2
- # MultiPhraseQuery is a generalized version of PhraseQuery, with an added
3
- # method #add(Term[]).
4
- #
5
- # To use this class, to search for the phrase "Microsoft app*" first use
6
- # add(Term) on the term "Microsoft", then find all terms that have "app" as
7
- # prefix using IndexReader.terms(Term), and use MultiPhraseQuery.add(Term[]
8
- # terms) to add them to the query.
9
- #
10
- # Author Anders Nielsen
11
- class MultiPhraseQuery < Query
12
- include Ferret::Index
13
-
14
- attr_accessor :slop
15
- attr_reader :positions, :term_arrays, :field
16
-
17
- def initialize()
18
- super()
19
- @slop = 0
20
- @term_arrays = []
21
- @positions = []
22
- @field = nil
23
- end
24
-
25
- # Allows to specify the relative position of terms within the phrase.
26
- #
27
- # See PhraseQuery#add(Term, int)
28
- # terms:: the array of terms to search for or a single term
29
- # position:: the position to search for these terms
30
- def add(terms, position = nil, pos_inc = 1)
31
- if position.nil?
32
- position = (@positions.size > 0) ? (@positions[-1] + pos_inc) : 0
33
- end
34
-
35
- if terms.instance_of?(Term)
36
- terms = [terms]
37
- end
38
-
39
- if (@term_arrays.size == 0)
40
- @field = terms[0].field
41
- end
42
-
43
- terms.each do |term|
44
- if (term.field != @field)
45
- raise ArgumentError,
46
- "All phrase terms must be in the same field (#{@field}): #{term}"
47
- end
48
- end
49
-
50
- if i = @positions.index(position)
51
- term_arrays[i] += terms
52
- else
53
- @term_arrays << terms
54
- @positions << position
55
- end
56
- end
57
- alias :<< :add
58
-
59
- class MultiPhraseWeight < Weight
60
- include Ferret::Index
61
-
62
- attr_reader :query, :value
63
-
64
- def initialize(query, searcher)
65
- @query = query
66
- @term_arrays = query.term_arrays
67
- @positions = query.positions
68
- @similarity = query.similarity(searcher)
69
- @idf = 0.0
70
-
71
- # compute idf
72
- query.term_arrays.each do |terms|
73
- terms.each do |term|
74
- @idf += @similarity.idf_term(term, searcher)
75
- end
76
- end
77
- end
78
-
79
- def sum_of_squared_weights()
80
- @query_weight = @idf * @query.boost() # compute query weight
81
- return @query_weight * @query_weight # square it
82
- end
83
-
84
- def normalize(query_norm)
85
- @query_norm = query_norm
86
- @query_weight *= query_norm # normalize query weight
87
- @value = @query_weight * @idf # idf for document
88
- end
89
-
90
- def scorer(reader)
91
- return nil if (@term_arrays.size == 0) # optimize zero-term case
92
- tps = []
93
- @term_arrays.each do |terms|
94
- p = []
95
- if (terms.length > 1)
96
- p = MultipleTermDocPosEnum.new(reader, terms)
97
- else
98
- p = reader.term_positions_for(terms[0])
99
- end
100
-
101
- return nil if (p == nil)
102
-
103
- tps << p
104
- end
105
-
106
- if (@query.slop == 0)
107
- return ExactPhraseScorer.new(self, tps, @positions, @similarity,
108
- reader.get_norms(@query.field))
109
- else
110
- return SloppyPhraseScorer.new(self, tps, @positions, @similarity,
111
- @query.slop, reader.get_norms(@query.field))
112
- end
113
- end
114
-
115
- def explain(reader, doc)
116
-
117
- result = Explanation.new()
118
- result.description = "weight(#{@query} in #{doc}), product of:"
119
-
120
- idf_expl = Explanation.new(@idf, "idf(#{@query})")
121
-
122
- # explain query weight
123
- query_expl = Explanation.new()
124
- query_expl.description = "query_weight(#{@query}), product of:"
125
-
126
- boost = @query.boost()
127
- if boost != 1.0
128
- boost_expl = Explanation.new(boost, "boost")
129
- query_expl << boost_expl
130
- end
131
- query_expl << idf_expl
132
-
133
- query_norm_expl = Explanation.new(@query_norm,"query_norm")
134
- query_expl << query_norm_expl
135
-
136
- query_expl.value = boost * @idf * @query_norm
137
-
138
- result << query_expl
139
-
140
- # explain field weight
141
- field_expl = Explanation.new()
142
- field_expl.description =
143
- "field_weight(#{@query} in #{doc}), product of:"
144
-
145
- tf_expl = scorer(reader).explain(doc)
146
- field_expl << tf_expl
147
- field_expl << idf_expl
148
-
149
- field_norm_expl = Explanation.new()
150
- field_norms = reader.get_norms(@query.field)
151
- field_norm =
152
- field_norms ? Similarity.decode_norm(field_norms[doc]) : 0.0
153
- field_norm_expl.value = field_norm
154
- field_norm_expl.description =
155
- "field_norm(field=#{@query.field}, doc=#{doc})"
156
- field_expl << field_norm_expl
157
-
158
- field_expl.value = tf_expl.value * idf_expl.value * field_norm_expl.value
159
- result << field_expl
160
-
161
- if (query_expl.value == 1.0)
162
- return field_expl
163
- else
164
- result.value = query_expl.value * field_expl.value
165
- return result
166
- end
167
- end
168
- end
169
-
170
- def rewrite(reader)
171
- if (@term_arrays.size() == 1) # optimize one-term case
172
- terms = @term_arrays[0]
173
- bq = BooleanQuery.new(true)
174
- terms.each do |term|
175
- bq.add_query(TermQuery.new(term), BooleanClause::Occur::SHOULD)
176
- end
177
- bq.boost = boost()
178
- return bq
179
- else
180
- return self
181
- end
182
- end
183
-
184
- # See Query#extract_terms()
185
- def extract_terms(query_terms)
186
- @term_arrays.each { |terms|
187
- query_terms.merge(terms)
188
- }
189
- end
190
-
191
- def create_weight(searcher)
192
- return MultiPhraseWeight.new(self, searcher)
193
- end
194
-
195
- # Prints a user-readable version of this query.
196
- def to_s(f = nil)
197
- buffer = ""
198
- buffer << "#{@field}:" if @field != f
199
- buffer << '"'
200
- last_pos = -1
201
- @term_arrays.each_index do |i|
202
- terms = @term_arrays[i]
203
- pos = @positions[i]
204
- last_pos.upto(pos-2) {buffer << "<> "}
205
- last_pos = pos
206
- buffer << "#{terms.map {|term| term.text}.join("|")} "
207
- end
208
- buffer.rstrip!
209
- buffer << '"'
210
-
211
- buffer << "~#{@slop}" if (@slop != 0)
212
- buffer << "^#{boost()}" if boost() != 1.0
213
- return buffer
214
- end
215
- end
216
- end