ferret 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/w32_io.c ADDED
@@ -0,0 +1,150 @@
1
+ #ifdef WIN32
2
+
3
+ #include "global.h"
4
+ #include "store.h"
5
+ #include <stdio.h>
6
+ #include <io.h>
7
+ #include <errno.h>
8
+ #include <string.h>
9
+
10
+ /**
11
+ * Create a filepath for a file in the store using the operating system's
12
+ * default file separator.
13
+ */
14
+ char *join_path(char *buf, const char *base, const char *filename)
15
+ {
16
+ sprintf(buf, "%s\\%s", base, filename);
17
+ return buf;
18
+ }
19
+
20
+ bool exists(char *path)
21
+ {
22
+ int fd = _open(path, 0);
23
+ if (fd < 0) {
24
+ if (errno != ENOENT) {
25
+ RAISE(IO_ERROR, strerror(errno));
26
+ }
27
+ return false;
28
+ }
29
+ _close(fd);
30
+ return true;
31
+ }
32
+
33
+ int fcount(char *path)
34
+ {
35
+ char buf[MAX_FILE_PATH];
36
+ struct _finddata_t fd;
37
+ intptr_t d;
38
+ int cnt = 0;
39
+
40
+ join_path(buf, path, "*");
41
+
42
+ if ((d = _findfirst(buf, &fd)) < 0) {
43
+ RAISE(IO_ERROR, strerror(errno));
44
+ }
45
+
46
+ do {
47
+ if (fd.name[0] != '.') {
48
+ cnt++;
49
+ }
50
+ } while (_findnext(d, &fd) == 0);
51
+ _findclose(d);
52
+
53
+ return cnt;
54
+ }
55
+
56
+ void dir_each(char *path, void (*func)(char *fname, void *arg), void *arg)
57
+ {
58
+ char buf[MAX_FILE_PATH];
59
+ struct _finddata_t fd;
60
+ intptr_t d;
61
+ join_path(buf, path, "*");
62
+
63
+ if ((d = _findfirst(buf, &fd)) < 0) {
64
+ RAISE(IO_ERROR, strerror(errno));
65
+ }
66
+
67
+ while (_findnext(d, &fd) == 0) {
68
+ if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
69
+ func(fd.name, arg);
70
+ }
71
+ }
72
+ _findclose(d);
73
+ }
74
+
75
+ /**
76
+ * Clear all the locks in the store.
77
+ *
78
+ * @param store the store to clear the locks from
79
+ * @throws IO_ERROR if there is an error opening the directory
80
+ */
81
+ void fs_clear_locks(Store *store)
82
+ {
83
+ char buf[MAX_FILE_PATH];
84
+ struct _finddata_t fd;
85
+ intptr_t d;
86
+ join_path(buf, store->dir.path, "*");
87
+
88
+ if ((d = _findfirst(buf, &fd)) < 0) {
89
+ RAISE(IO_ERROR, strerror(errno));
90
+ }
91
+
92
+ while (_findnext(d, &fd) == 0) {
93
+ if (file_is_lock(fd.name)) {
94
+ remove(join_path(buf, store->dir.path, fd.name));
95
+ }
96
+ }
97
+ _findclose(d);
98
+ }
99
+
100
+ /**
101
+ * Clear all files from the store except the lock files.
102
+ *
103
+ * @param store the store to clear all the files from
104
+ * @throws IO_ERROR if there is an error deleting the files
105
+ */
106
+ void fs_clear(Store *store)
107
+ {
108
+ char buf[MAX_FILE_PATH];
109
+ struct _finddata_t fd;
110
+ intptr_t d;
111
+ join_path(buf, store->dir.path, "*");
112
+
113
+ if ((d = _findfirst(buf, &fd)) < 0) {
114
+ RAISE(IO_ERROR, strerror(errno));
115
+ }
116
+
117
+ while (_findnext(d, &fd) == 0) {
118
+ if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
119
+ remove(join_path(buf, store->dir.path, fd.name));
120
+ }
121
+ }
122
+ _findclose(d);
123
+ }
124
+
125
+ /**
126
+ * Clear all files from the store including the lock files.
127
+ *
128
+ * @param store the store to clear all the files from
129
+ * @throws IO_ERROR if there is an error deleting the files
130
+ */
131
+ void fs_clear_all(Store *store)
132
+ {
133
+ char buf[MAX_FILE_PATH];
134
+ struct _finddata_t fd;
135
+ intptr_t d;
136
+ join_path(buf, store->dir.path, "*");
137
+
138
+ if ((d = _findfirst(buf, &fd)) < 0) {
139
+ RAISE(IO_ERROR, strerror(errno));
140
+ }
141
+
142
+ while (_findnext(d, &fd) == 0) {
143
+ if (fd.name[0] != '.') {
144
+ remove(join_path(buf, store->dir.path, fd.name));
145
+ }
146
+ }
147
+ _findclose(d);
148
+ }
149
+
150
+ #endif
data/lib/ferret.rb CHANGED
@@ -22,7 +22,7 @@
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
24
  module Ferret
25
- VERSION = '0.9.0'
25
+ VERSION = '0.9.2'
26
26
  end
27
27
 
28
28
  # try and load the C extension but it isn't necessary.
@@ -13,14 +13,15 @@ module Ferret::Analysis
13
13
  # addresses, phone numbers, etc.
14
14
 
15
15
  class StandardTokenizer < RegExpTokenizer
16
- ALPHA = /[[:alpha:]]+/
16
+ ALPHA = /[[:alpha:]_-]+/
17
17
  APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
18
18
  ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
19
19
  P = /[_\/.,-]/
20
20
  HASDIGIT = /\w*\d\w*/
21
- TOKEN_RE = /[[:alpha:]]+(('[[:alpha:]]+)+
22
- |\.([[:alpha:]]\.)+
21
+ TOKEN_RE = /#{ALPHA}+(('#{ALPHA}+)+
22
+ |\.(#{ALPHA}\.)+
23
23
  |(@|\&)\w+([-.]\w+)*
24
+ |:\/\/\w+([-.\/]\w+)*
24
25
  )
25
26
  |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
26
27
  |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
@@ -307,6 +307,6 @@ module Ferret::Document
307
307
  str << "omit_norms," if (@omit_norms)
308
308
  str << "binary," if (@binary)
309
309
  str << "<#{@name}:#{@binary ? '=bin_data=' : data}>"
310
- end
310
+ end
311
311
  end
312
312
  end
@@ -104,7 +104,7 @@ module Ferret
104
104
  # Retrieve the field_info object by either field number or field name.
105
105
  def [](index)
106
106
  if index.is_a? Integer
107
- if index == NOT_A_FIELD || index < 0 # < 0 is for C extensions
107
+ if index >= NOT_A_FIELD || index < 0 # < 0 is for C extensions
108
108
  return FieldInfo.new("", false, NOT_A_FIELD, false)
109
109
  end
110
110
  return @fi_array[index]
@@ -18,7 +18,7 @@ module Ferret::Index
18
18
 
19
19
  # Constructs a Term with the given field and text
20
20
  def initialize(fld_name, txt)
21
- @field = fld_name
21
+ @field = fld_name.to_s
22
22
  @text = txt.to_s
23
23
  end
24
24
 
@@ -11,7 +11,7 @@ module Ferret
11
11
 
12
12
  class QueryParser < Racc::Parser
13
13
 
14
- module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a9944', 'lib/ferret/query_parser/query_parser.y', 126
14
+ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id155b60f3fb', 'lib/ferret/query_parser/query_parser.y', 126
15
15
  attr_accessor :default_field, :fields, :handle_parse_errors
16
16
 
17
17
  def initialize(default_field = "*", options = {})
@@ -20,7 +20,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
20
20
  default_field = default_field.split("|")
21
21
  end
22
22
  @field = @default_field = default_field
23
- @analyzer = options[:analyzer] || Analysis::Analyzer.new
23
+ @analyzer = options[:analyzer] || Analysis::StandardAnalyzer.new
24
24
  @wild_lower = options[:wild_lower].nil? ? true : options[:wild_lower]
25
25
  @occur_default = options[:occur_default] || BooleanClause::Occur::SHOULD
26
26
  @default_slop = options[:default_slop] || 0
@@ -170,23 +170,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
170
170
  end
171
171
 
172
172
  def get_bad_query(field, str)
173
- get_term_query(field, str)
174
- #tokens = []
175
- #stream = @analyzer.token_stream(field, str)
176
- #while token = stream.next
177
- # tokens << token
178
- #end
179
- #if tokens.length == 0
180
- # return TermQuery.new(Term.new(field, ""))
181
- #elsif tokens.length == 1
182
- # return TermQuery.new(Term.new(field, tokens[0].text))
183
- #else
184
- # bq = BooleanQuery.new()
185
- # tokens.each do |token|
186
- # bq << BooleanClause.new(TermQuery.new(Term.new(field, token.text)))
187
- # end
188
- # return bq
189
- #end
173
+ get_term_query(field, str) || BooleanQuery.new()
190
174
  end
191
175
 
192
176
  def get_range_query(field, start_word, end_word, inc_upper, inc_lower)
@@ -200,7 +184,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
200
184
  tokens << token
201
185
  end
202
186
  if tokens.length == 0
203
- return TermQuery.new(Term.new(field, ""))
187
+ return nil
204
188
  elsif tokens.length == 1
205
189
  return TermQuery.new(Term.new(field, tokens[0].text))
206
190
  else
@@ -365,14 +349,14 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
365
349
 
366
350
  def get_boolean_query(clauses)
367
351
  # possible that we got all nil clauses so check
368
- return nil if clauses.nil?
352
+ bq = BooleanQuery.new()
353
+ return bq if clauses.nil?
369
354
  clauses.compact!
370
- return nil if clauses.size == 0
355
+ return bq if clauses.size == 0
371
356
 
372
357
  if clauses.size == 1 and not clauses[0].prohibited?
373
358
  return clauses[0].query
374
359
  end
375
- bq = BooleanQuery.new()
376
360
  clauses.each {|clause| bq << clause }
377
361
  return bq
378
362
  end
@@ -414,7 +398,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
414
398
  return qp.parse(query)
415
399
  end
416
400
 
417
- ..end lib/ferret/query_parser/query_parser.y modeval..id94697a9944
401
+ ..end lib/ferret/query_parser/query_parser.y modeval..id155b60f3fb
418
402
 
419
403
  ##### racc 1.4.4 generates ###
420
404
 
data/lib/ferret/search.rb CHANGED
@@ -47,3 +47,4 @@ require 'ferret/search/filtered_query.rb'
47
47
  require 'ferret/search/match_all_query.rb'
48
48
  require 'ferret/search/spans.rb'
49
49
  require 'ferret/search/index_searcher.rb'
50
+ require 'ferret/search/multi_searcher.rb'
@@ -248,10 +248,6 @@ module Ferret::Search
248
248
  end
249
249
  end
250
250
 
251
- def combine(queries)
252
- return Query.merge_boolean_queries(queries)
253
- end
254
-
255
251
  def initialize_copy(o)
256
252
  super
257
253
  @clauses = o.clauses.clone
@@ -104,7 +104,13 @@ module Ferret::Search
104
104
  raise ArgumentError, "first_doc must be >= 0 to run a search"
105
105
  end
106
106
 
107
- scorer = query.weight(self).scorer(@reader)
107
+ # for MultiSearcher: the weight is computed across all searchers
108
+ if query.is_a? Weight
109
+ scorer = query.scorer(@reader)
110
+ else
111
+ scorer = query.weight(self).scorer(@reader)
112
+ end
113
+
108
114
  if (scorer == nil)
109
115
  return TopDocs.new(0, [])
110
116
  end
@@ -117,14 +123,10 @@ module Ferret::Search
117
123
  hq = HitQueue.new(max_size)
118
124
  end
119
125
  total_hits = 0
120
- min_score = 0.0
121
126
  scorer.each_hit() do |doc, score|
122
127
  if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
123
128
  total_hits += 1
124
- if hq.size < max_size or score >= min_score
125
- hq.insert(ScoreDoc.new(doc, score))
126
- min_score = hq.top.score # maintain min_score
127
- end
129
+ hq.insert(ScoreDoc.new(doc, score))
128
130
  end
129
131
  end
130
132
 
@@ -148,7 +150,12 @@ module Ferret::Search
148
150
  # usually want your hits sorted at least by score so you should use the
149
151
  # #search method.
150
152
  def search_each(query, filter = nil)
151
- scorer = query.weight(self).scorer(@reader)
153
+ # for MultiSearcher: the weight is computed across all searchers
154
+ if query.is_a? Weight
155
+ scorer = query.scorer(@reader)
156
+ else
157
+ scorer = query.weight(self).scorer(@reader)
158
+ end
152
159
  return if scorer == nil
153
160
  bits = (filter.nil? ? nil : filter.bits(@reader))
154
161
  scorer.each_hit() do |doc, score|
@@ -175,13 +182,19 @@ module Ferret::Search
175
182
 
176
183
  # Returns an Explanation that describes how +doc+ scored against
177
184
  # +query+.
185
+ # A weight may be given as first parameter instead of the query, too.
178
186
  #
179
187
  # This is intended to be used in developing Similarity implementations,
180
188
  # and, for good performance, should not be displayed with every hit.
181
189
  # Computing an explanation is as expensive as executing the query over the
182
190
  # entire index.
183
191
  def explain(query, doc)
184
- return query.weight(self).explain(@reader, doc)
192
+ if query.is_a? Weight
193
+ weight = query
194
+ else
195
+ weight = query.weight(self)
196
+ end
197
+ return weight.explain(@reader, doc)
185
198
  end
186
199
  end
187
200
  end
@@ -181,6 +181,13 @@ module Ferret::Search
181
181
  end
182
182
  end
183
183
 
184
+ # See Query#extract_terms()
185
+ def extract_terms(query_terms)
186
+ @term_arrays.each { |terms|
187
+ query_terms.merge(terms)
188
+ }
189
+ end
190
+
184
191
  def create_weight(searcher)
185
192
  return MultiPhraseWeight.new(self, searcher)
186
193
  end
@@ -0,0 +1,261 @@
1
+ module Ferret::Search
2
+
3
+ # Implements searching multiple IndexSearchers at once
4
+ #
5
+ # Applications usually need only call the @link #search(Query)
6
+ # or @link #search(Query,Filter) methods. For performance reasons it is
7
+ # recommended to open only one Searcher and use it for all of your searches.
8
+ class MultiSearcher
9
+ include Ferret::Index
10
+
11
+ attr_accessor :similarity, :searchers
12
+
13
+ # Creates a MultiSearcher searching across all the searchers
14
+ # in the provided array.
15
+ #
16
+ def initialize(args)
17
+ @searchers = Array.new(args)
18
+ @similarity = Similarity.default
19
+
20
+ # initialize reader lookup array
21
+ @max_doc = 0
22
+ @starts = Array.new(@searchers.size + 1)
23
+ @searchers.each_with_index { |searcher, i|
24
+ @starts[i] = @max_doc
25
+ @max_doc += searcher.max_doc
26
+ }
27
+ @starts[@searchers.size] = @max_doc
28
+ end
29
+
30
+ # closes all underlying Searchers
31
+ def close()
32
+ @searchers.each { |searcher| searcher.close() }
33
+ end
34
+
35
+ # Expert: Returns the number of documents containing +term+.
36
+ # Called by search code to compute term weights.
37
+ # See IndexReader#doc_freq
38
+ def doc_freq(term)
39
+ return @searchers.inject(0) { |df, searcher|
40
+ df + searcher.doc_freq(term)
41
+ }
42
+ end
43
+
44
+ # Expert: For each term in the terms array, calculates the number of
45
+ # documents containing +term+. Returns an array with these
46
+ # document frequencies. Used to minimize number of remote calls.
47
+ def doc_freqs(terms)
48
+ result = Array.new
49
+ terms.each {|term, i| result << doc_freq(term)}
50
+ return result
51
+ end
52
+
53
+ # Expert: Returns the stored fields of document +n+.
54
+ #
55
+ # See IndexReader#get_document
56
+ def doc(n)
57
+ i = sub_searcher(n)
58
+ return @searchers[i].doc(n - @starts[i])
59
+ end
60
+
61
+ # Returns index of the searcher for document <code>n</code> in the
62
+ # array used to construct this searcher.
63
+ def sub_searcher(n)
64
+ lo = 0 # search starts array
65
+ hi = @searchers.size - 1 # for first element less
66
+ # than n, return its index
67
+ while hi >= lo do
68
+ mid = (lo + hi) >> 1
69
+ midValue = @starts[mid]
70
+ if n < midValue
71
+ hi = mid - 1;
72
+ elsif n > midValue
73
+ lo = mid + 1;
74
+ else # found a match
75
+ while mid+1 < @searchers.size && @starts[mid+1] == midValue do
76
+ mid += 1 # scan to last match
77
+ end
78
+ return mid
79
+ end
80
+ end
81
+ return hi
82
+ end
83
+
84
+ # Returns the document number of document <code>n</code> within its
85
+ # sub-index.
86
+ def sub_doc(n)
87
+ return n - @starts[sub_searcher(n)]
88
+ end
89
+
90
+ # Expert: Returns one greater than the largest possible document number.
91
+ # Called by search code to compute term weights.
92
+ # See IndexReader#max_doc
93
+ def max_doc
94
+ return @max_doc
95
+ end
96
+
97
+ # Create weight in multiple index scenario.
98
+ #
99
+ # Distributed query processing is done in the following steps:
100
+ # 1. rewrite query
101
+ # 2. extract necessary terms
102
+ # 3. collect dfs for these terms from the Searchables
103
+ # 4. create query weight using aggregate dfs.
104
+ # 5. distribute that weight to Searchables
105
+ # 6. merge results
106
+ #
107
+ # Steps 1-4 are done here, 5+6 in the search() methods
108
+ def create_weight(query)
109
+ # step 1
110
+ rewritten_query = self.rewrite(query)
111
+
112
+ # step 2
113
+ terms = Set.new
114
+ rewritten_query.extract_terms(terms)
115
+
116
+ # step 3
117
+ aggregated_dfs = Array.new(terms.size, 0)
118
+ @searchers.each { |searcher|
119
+ dfs = searcher.doc_freqs(terms)
120
+ dfs.each_with_index { |df,i|
121
+ aggregated_dfs[i] += df
122
+ }
123
+ }
124
+
125
+ df_map = Hash.new
126
+ terms.each_with_index { |term,i|
127
+ df_map[term] = aggregated_dfs[i]
128
+ }
129
+
130
+ # step 4
131
+ cache_sim = CachedDfSource.new(df_map, self.max_doc, self.similarity)
132
+
133
+ return rewritten_query.weight(cache_sim)
134
+ end
135
+
136
+
137
+ def search(query, options = {})
138
+ filter = options[:filter]
139
+ first_doc = options[:first_doc]||0
140
+ num_docs = options[:num_docs]||10
141
+ max_size = first_doc + num_docs
142
+ sort = options[:sort]
143
+
144
+ if (num_docs <= 0)
145
+ raise ArgumentError, "num_docs must be > 0 to run a search"
146
+ end
147
+
148
+ if (first_doc < 0)
149
+ raise ArgumentError, "first_doc must be >= 0 to run a search"
150
+ end
151
+
152
+
153
+ if (sort)
154
+ raise NotImplementedError
155
+ #fields = sort.is_a?(Array) ? sort : sort.fields
156
+ #hq = FieldDocSortedHitQueue.new(fields, max_size)
157
+ else
158
+ hq = HitQueue.new(max_size)
159
+ end
160
+
161
+ total_hits = 0
162
+ weight = create_weight(query)
163
+ @searchers.each_with_index { |searcher,i| # search each searcher
164
+ docs = searcher.search(weight,
165
+ :filter => filter,
166
+ #:sort => sort,
167
+ :num_docs => max_size,
168
+ :first_doc => 0)
169
+ total_hits += docs.total_hits # update total_hits
170
+ docs.score_docs.each { |score_doc|
171
+ score_doc.doc += @starts[i] # convert doc
172
+ break unless hq.insert(score_doc) # no more scores > min_score
173
+ }
174
+ }
175
+
176
+ score_docs = []
177
+ if (hq.size > first_doc)
178
+ if (hq.size - first_doc) < num_docs
179
+ num_docs = hq.size - first_doc
180
+ end
181
+ num_docs.times do
182
+ score_docs.unshift(hq.pop)
183
+ end
184
+ end
185
+ hq.clear
186
+
187
+ return TopDocs.new(total_hits, score_docs)
188
+ end
189
+
190
+ def search_each(query, filter = nil, &block)
191
+ weight = create_weight(query)
192
+ @searchers.each { |searcher| # search each searcher
193
+ searcher.search_each(weight, filter, &block)
194
+ }
195
+ end
196
+
197
+ # rewrites the query into a query that can be processed by the search
198
+ # methods. For example, a Fuzzy query is turned into a massive boolean
199
+ # query.
200
+ #
201
+ # original:: The original query to be rewritten.
202
+ def rewrite(original)
203
+ #print "multi_searcher#rewrite: #{original}\n"
204
+ queries = []
205
+ @searchers.each { |searcher|
206
+ queries << searcher.rewrite(original)
207
+ }
208
+ return queries.first.combine(queries)
209
+ end
210
+
211
+ # Returns an Explanation that describes how +doc+ scored against
212
+ # +query+.
213
+ #
214
+ # This is intended to be used in developing Similarity implementations,
215
+ # and, for good performance, should not be displayed with every hit.
216
+ # Computing an explanation is as expensive as executing the query over the
217
+ # entire index.
218
+ def explain(query, doc)
219
+ i = sub_searcher(doc)
220
+ return @searchers[i].explain(create_weight(query), doc-@starts[i])
221
+ end
222
+
223
+ end
224
+
225
+
226
+ # Document Frequency cache acting as a Dummy-Searcher.
227
+ # This class is no full-fledged Searcher, but only supports
228
+ # the methods necessary to initialize Weights.
229
+ class CachedDfSource
230
+
231
+ attr_reader :max_doc, :similarity
232
+
233
+ def initialize(df_map, max_doc, similarity)
234
+ @df_map = df_map
235
+ @max_doc = max_doc
236
+ @similarity = similarity
237
+ end
238
+
239
+ def doc_freq(term)
240
+ return @df_map[term]
241
+ end
242
+
243
+ def doc_freqs(terms)
244
+ result = Array.new
245
+ terms.each { |term|
246
+ result << doc_freq(term)
247
+ }
248
+ return result
249
+ end
250
+
251
+ def rewrite(query)
252
+ # this is a bit of a hack. We know that a query which
253
+ # creates a Weight based on this Dummy-Searcher is
254
+ # always already rewritten (see preparedWeight()).
255
+ # Therefore we just return the unmodified query here
256
+ return query
257
+ end
258
+
259
+ end
260
+
261
+ end