ferret 0.9.1 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105) hide show
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/w32_io.c ADDED
@@ -0,0 +1,150 @@
1
+ #ifdef WIN32
2
+
3
+ #include "global.h"
4
+ #include "store.h"
5
+ #include <stdio.h>
6
+ #include <io.h>
7
+ #include <errno.h>
8
+ #include <string.h>
9
+
10
+ /**
11
+ * Create a filepath for a file in the store using the operating systems
12
+ * default file seperator.
13
+ */
14
+ char *join_path(char *buf, const char *base, const char *filename)
15
+ {
16
+ sprintf(buf, "%s\\%s", base, filename);
17
+ return buf;
18
+ }
19
+
20
+ bool exists(char *path)
21
+ {
22
+ int fd = _open(path, 0);
23
+ if (fd < 0) {
24
+ if (errno != ENOENT) {
25
+ RAISE(IO_ERROR, strerror(errno));
26
+ }
27
+ return false;
28
+ }
29
+ _close(fd);
30
+ return true;
31
+ }
32
+
33
/**
 * Count the entries in directory +path+, skipping anything whose name
 * starts with '.' (hidden files plus the "." and ".." entries).
 *
 * @throws IO_ERROR if the directory cannot be scanned
 */
int fcount(char *path)
{
    char buf[MAX_FILE_PATH];
    struct _finddata_t fd;
    intptr_t d;
    int cnt = 0;

    /* search pattern: <path>\* matches every entry in the directory */
    join_path(buf, path, "*");

    if ((d = _findfirst(buf, &fd)) < 0) {
        RAISE(IO_ERROR, strerror(errno));
    }

    /* do/while: the entry filled in by _findfirst must be counted too */
    do {
        if (fd.name[0] != '.') {
            cnt++;
        }
    } while (_findnext(d, &fd) == 0);
    _findclose(d);

    return cnt;
}
55
+
56
+ void dir_each(char *path, void (*func)(char *fname, void *arg), void *arg)
57
+ {
58
+ char buf[MAX_FILE_PATH];
59
+ struct _finddata_t fd;
60
+ intptr_t d;
61
+ join_path(buf, path, "*");
62
+
63
+ if ((d = _findfirst(buf, &fd)) < 0) {
64
+ RAISE(IO_ERROR, strerror(errno));
65
+ }
66
+
67
+ while (_findnext(d, &fd) == 0) {
68
+ if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
69
+ func(fd.name, arg);
70
+ }
71
+ }
72
+ _findclose(d);
73
+ }
74
+
75
+ /**
76
+ * Clear all the locks in the store.
77
+ *
78
+ * @param store the store to clear the locks from
79
+ * @throws IO_ERROR if there is an error opening the directory
80
+ */
81
+ void fs_clear_locks(Store *store)
82
+ {
83
+ char buf[MAX_FILE_PATH];
84
+ struct _finddata_t fd;
85
+ intptr_t d;
86
+ join_path(buf, store->dir.path, "*");
87
+
88
+ if ((d = _findfirst(buf, &fd)) < 0) {
89
+ RAISE(IO_ERROR, strerror(errno));
90
+ }
91
+
92
+ while (_findnext(d, &fd) == 0) {
93
+ if (file_is_lock(fd.name)) {
94
+ remove(join_path(buf, store->dir.path, fd.name));
95
+ }
96
+ }
97
+ _findclose(d);
98
+ }
99
+
100
+ /**
101
+ * Clear all files from the store except the lock files.
102
+ *
103
+ * @param store the store to clear all the files from
104
+ * @throws IO_ERROR if there is an error deleting the files
105
+ */
106
+ void fs_clear(Store *store)
107
+ {
108
+ char buf[MAX_FILE_PATH];
109
+ struct _finddata_t fd;
110
+ intptr_t d;
111
+ join_path(buf, store->dir.path, "*");
112
+
113
+ if ((d = _findfirst(buf, &fd)) < 0) {
114
+ RAISE(IO_ERROR, strerror(errno));
115
+ }
116
+
117
+ while (_findnext(d, &fd) == 0) {
118
+ if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
119
+ remove(join_path(buf, store->dir.path, fd.name));
120
+ }
121
+ }
122
+ _findclose(d);
123
+ }
124
+
125
+ /**
126
+ * Clear all files from the store including the lock files.
127
+ *
128
+ * @param store the store to clear all the files from
129
+ * @throws IO_ERROR if there is an error deleting the files
130
+ */
131
+ void fs_clear_all(Store *store)
132
+ {
133
+ char buf[MAX_FILE_PATH];
134
+ struct _finddata_t fd;
135
+ intptr_t d;
136
+ join_path(buf, store->dir.path, "*");
137
+
138
+ if ((d = _findfirst(buf, &fd)) < 0) {
139
+ RAISE(IO_ERROR, strerror(errno));
140
+ }
141
+
142
+ while (_findnext(d, &fd) == 0) {
143
+ if (fd.name[0] != '.') {
144
+ remove(join_path(buf, store->dir.path, fd.name));
145
+ }
146
+ }
147
+ _findclose(d);
148
+ }
149
+
150
+ #endif
data/lib/ferret.rb CHANGED
@@ -22,7 +22,7 @@
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
24
  module Ferret
25
- VERSION = '0.9.0'
25
+ VERSION = '0.9.2'
26
26
  end
27
27
 
28
28
  # try and load the C extension but it isn't necessary.
@@ -13,14 +13,15 @@ module Ferret::Analysis
13
13
  # addresses, phone numbers, etc.
14
14
 
15
15
  class StandardTokenizer < RegExpTokenizer
16
- ALPHA = /[[:alpha:]]+/
16
+ ALPHA = /[[:alpha:]_-]+/
17
17
  APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
18
18
  ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
19
19
  P = /[_\/.,-]/
20
20
  HASDIGIT = /\w*\d\w*/
21
- TOKEN_RE = /[[:alpha:]]+(('[[:alpha:]]+)+
22
- |\.([[:alpha:]]\.)+
21
+ TOKEN_RE = /#{ALPHA}+(('#{ALPHA}+)+
22
+ |\.(#{ALPHA}\.)+
23
23
  |(@|\&)\w+([-.]\w+)*
24
+ |:\/\/\w+([-.\/]\w+)*
24
25
  )
25
26
  |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
26
27
  |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
@@ -307,6 +307,6 @@ module Ferret::Document
307
307
  str << "omit_norms," if (@omit_norms)
308
308
  str << "binary," if (@binary)
309
309
  str << "<#{@name}:#{@binary ? '=bin_data=' : data}>"
310
- end
310
+ end
311
311
  end
312
312
  end
@@ -104,7 +104,7 @@ module Ferret
104
104
  # Retrieve the field_info object by either field number or field name.
105
105
  def [](index)
106
106
  if index.is_a? Integer
107
- if index == NOT_A_FIELD || index < 0 # < 0 is for C extensions
107
+ if index >= NOT_A_FIELD || index < 0 # < 0 is for C extensions
108
108
  return FieldInfo.new("", false, NOT_A_FIELD, false)
109
109
  end
110
110
  return @fi_array[index]
@@ -18,7 +18,7 @@ module Ferret::Index
18
18
 
19
19
  # Constructs a Term with the given field and text
20
20
  def initialize(fld_name, txt)
21
- @field = fld_name
21
+ @field = fld_name.to_s
22
22
  @text = txt.to_s
23
23
  end
24
24
 
@@ -11,7 +11,7 @@ module Ferret
11
11
 
12
12
  class QueryParser < Racc::Parser
13
13
 
14
- module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a9944', 'lib/ferret/query_parser/query_parser.y', 126
14
+ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id155b60f3fb', 'lib/ferret/query_parser/query_parser.y', 126
15
15
  attr_accessor :default_field, :fields, :handle_parse_errors
16
16
 
17
17
  def initialize(default_field = "*", options = {})
@@ -20,7 +20,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
20
20
  default_field = default_field.split("|")
21
21
  end
22
22
  @field = @default_field = default_field
23
- @analyzer = options[:analyzer] || Analysis::Analyzer.new
23
+ @analyzer = options[:analyzer] || Analysis::StandardAnalyzer.new
24
24
  @wild_lower = options[:wild_lower].nil? ? true : options[:wild_lower]
25
25
  @occur_default = options[:occur_default] || BooleanClause::Occur::SHOULD
26
26
  @default_slop = options[:default_slop] || 0
@@ -170,23 +170,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
170
170
  end
171
171
 
172
172
  def get_bad_query(field, str)
173
- get_term_query(field, str)
174
- #tokens = []
175
- #stream = @analyzer.token_stream(field, str)
176
- #while token = stream.next
177
- # tokens << token
178
- #end
179
- #if tokens.length == 0
180
- # return TermQuery.new(Term.new(field, ""))
181
- #elsif tokens.length == 1
182
- # return TermQuery.new(Term.new(field, tokens[0].text))
183
- #else
184
- # bq = BooleanQuery.new()
185
- # tokens.each do |token|
186
- # bq << BooleanClause.new(TermQuery.new(Term.new(field, token.text)))
187
- # end
188
- # return bq
189
- #end
173
+ get_term_query(field, str) || BooleanQuery.new()
190
174
  end
191
175
 
192
176
  def get_range_query(field, start_word, end_word, inc_upper, inc_lower)
@@ -200,7 +184,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
200
184
  tokens << token
201
185
  end
202
186
  if tokens.length == 0
203
- return TermQuery.new(Term.new(field, ""))
187
+ return nil
204
188
  elsif tokens.length == 1
205
189
  return TermQuery.new(Term.new(field, tokens[0].text))
206
190
  else
@@ -365,14 +349,14 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
365
349
 
366
350
  def get_boolean_query(clauses)
367
351
  # possible that we got all nil clauses so check
368
- return nil if clauses.nil?
352
+ bq = BooleanQuery.new()
353
+ return bq if clauses.nil?
369
354
  clauses.compact!
370
- return nil if clauses.size == 0
355
+ return bq if clauses.size == 0
371
356
 
372
357
  if clauses.size == 1 and not clauses[0].prohibited?
373
358
  return clauses[0].query
374
359
  end
375
- bq = BooleanQuery.new()
376
360
  clauses.each {|clause| bq << clause }
377
361
  return bq
378
362
  end
@@ -414,7 +398,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
414
398
  return qp.parse(query)
415
399
  end
416
400
 
417
- ..end lib/ferret/query_parser/query_parser.y modeval..id94697a9944
401
+ ..end lib/ferret/query_parser/query_parser.y modeval..id155b60f3fb
418
402
 
419
403
  ##### racc 1.4.4 generates ###
420
404
 
data/lib/ferret/search.rb CHANGED
@@ -47,3 +47,4 @@ require 'ferret/search/filtered_query.rb'
47
47
  require 'ferret/search/match_all_query.rb'
48
48
  require 'ferret/search/spans.rb'
49
49
  require 'ferret/search/index_searcher.rb'
50
+ require 'ferret/search/multi_searcher.rb'
@@ -248,10 +248,6 @@ module Ferret::Search
248
248
  end
249
249
  end
250
250
 
251
- def combine(queries)
252
- return Query.merge_boolean_queries(queries)
253
- end
254
-
255
251
  def initialize_copy(o)
256
252
  super
257
253
  @clauses = o.clauses.clone
@@ -104,7 +104,13 @@ module Ferret::Search
104
104
  raise ArgumentError, "first_doc must be >= 0 to run a search"
105
105
  end
106
106
 
107
- scorer = query.weight(self).scorer(@reader)
107
+ # for MultiSearcher: the weight is computed across all searchers
108
+ if query.is_a? Weight
109
+ scorer = query.scorer(@reader)
110
+ else
111
+ scorer = query.weight(self).scorer(@reader)
112
+ end
113
+
108
114
  if (scorer == nil)
109
115
  return TopDocs.new(0, [])
110
116
  end
@@ -117,14 +123,10 @@ module Ferret::Search
117
123
  hq = HitQueue.new(max_size)
118
124
  end
119
125
  total_hits = 0
120
- min_score = 0.0
121
126
  scorer.each_hit() do |doc, score|
122
127
  if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
123
128
  total_hits += 1
124
- if hq.size < max_size or score >= min_score
125
- hq.insert(ScoreDoc.new(doc, score))
126
- min_score = hq.top.score # maintain min_score
127
- end
129
+ hq.insert(ScoreDoc.new(doc, score))
128
130
  end
129
131
  end
130
132
 
@@ -148,7 +150,12 @@ module Ferret::Search
148
150
  # usually want your hits sorted at least by score so you should use the
149
151
  # #search method.
150
152
  def search_each(query, filter = nil)
151
- scorer = query.weight(self).scorer(@reader)
153
+ # for MultiSearcher: the weight is computed across all searchers
154
+ if query.is_a? Weight
155
+ scorer = query.scorer(@reader)
156
+ else
157
+ scorer = query.weight(self).scorer(@reader)
158
+ end
152
159
  return if scorer == nil
153
160
  bits = (filter.nil? ? nil : filter.bits(@reader))
154
161
  scorer.each_hit() do |doc, score|
@@ -175,13 +182,19 @@ module Ferret::Search
175
182
 
176
183
  # Returns an Explanation that describes how +doc+ scored against
177
184
  # +query+.
185
+ # A weight may be given as first parameter instead of the query, too.
178
186
  #
179
187
  # This is intended to be used in developing Similarity implementations,
180
188
  # and, for good performance, should not be displayed with every hit.
181
189
  # Computing an explanation is as expensive as executing the query over the
182
190
  # entire index.
183
191
  def explain(query, doc)
184
- return query.weight(self).explain(@reader, doc)
192
+ if query.is_a? Weight
193
+ weight = query
194
+ else
195
+ weight = query.weight(self)
196
+ end
197
+ return weight.explain(@reader, doc)
185
198
  end
186
199
  end
187
200
  end
@@ -181,6 +181,13 @@ module Ferret::Search
181
181
  end
182
182
  end
183
183
 
184
+ # See Query#extract_terms()
185
+ def extract_terms(query_terms)
186
+ @term_arrays.each { |terms|
187
+ query_terms.merge(terms)
188
+ }
189
+ end
190
+
184
191
  def create_weight(searcher)
185
192
  return MultiPhraseWeight.new(self, searcher)
186
193
  end
@@ -0,0 +1,261 @@
1
+ module Ferret::Search
2
+
3
  # Implements searching multiple IndexSearchers at once.
  #
  # Applications usually need only call the @link #search(Query)
  # or @link #search(Query,Filter) methods. For performance reasons it is
  # recommended to open only one Searcher and use it for all of your searches.
  class MultiSearcher
    include Ferret::Index

    attr_accessor :similarity, :searchers

    # Creates a MultiSearcher searching across all the searchers
    # in the provided array.
    #
    # Builds @starts, a prefix-sum lookup table of each sub-searcher's
    # max_doc, so global document numbers can be mapped to
    # (searcher, local doc) pairs via binary search (see #sub_searcher).
    #
    # NOTE(review): Array.new(args) expects an Integer size argument; if
    # +args+ is meant to be an array of searchers this may need to be a
    # plain assignment or args.to_a -- confirm against callers.
    def initialize(args)
      @searchers = Array.new(args)
      @similarity = Similarity.default

      # initialize reader lookup array
      @max_doc = 0
      @starts = Array.new(@searchers.size + 1)
      @searchers.each_with_index { |searcher, i|
        @starts[i] = @max_doc
        @max_doc += searcher.max_doc
      }
      @starts[@searchers.size] = @max_doc
    end

    # closes all underlying Searchers
    def close()
      @searchers.each { |searcher| searcher.close() }
    end

    # Expert: Returns the number of documents containing +term+,
    # summed across all sub-searchers.
    # Called by search code to compute term weights.
    # See IndexReader#doc_freq
    def doc_freq(term)
      return @searchers.inject(0) { |df, searcher|
        df + searcher.doc_freq(term)
      }
    end

    # Expert: For each term in the terms array, calculates the number of
    # documents containing +term+. Returns an array with these
    # document frequencies. Used to minimize number of remote calls.
    # (The +i+ block parameter is unused; +terms+ is iterated one term
    # per block call.)
    def doc_freqs(terms)
      result = Array.new
      terms.each {|term, i| result << doc_freq(term)}
      return result
    end

    # Expert: Returns the stored fields of document +n+ (a global doc
    # number), delegating to the sub-searcher that owns it.
    #
    # See IndexReader#get_document
    def doc(n)
      i = sub_searcher(n)
      return @searchers[i].doc(n - @starts[i])
    end

    # Returns index of the searcher for document <code>n</code> in the
    # array used to construct this searcher.
    # Binary-searches @starts for the last entry <= n so that empty
    # sub-searchers (duplicate start offsets) are skipped.
    def sub_searcher(n)
      lo = 0                       # search starts array
      hi = @searchers.size - 1     # for first element less
                                   # than n, return its index
      while hi >= lo do
        mid = (lo + hi) >> 1
        midValue = @starts[mid]
        if n < midValue
          hi = mid - 1;
        elsif n > midValue
          lo = mid + 1;
        else # found a match
          while mid+1 < @searchers.size && @starts[mid+1] == midValue do
            mid += 1 # scan to last match
          end
          return mid
        end
      end
      return hi
    end

    # Returns the document number of document <code>n</code> within its
    # sub-index.
    def sub_doc(n)
      return n - @starts[sub_searcher(n)]
    end

    # Expert: Returns one greater than the largest possible document number.
    # Called by search code to compute term weights.
    # See IndexReader#max_doc
    def max_doc
      return @max_doc
    end

    # Create weight in multiple index scenario.
    #
    # Distributed query processing is done in the following steps:
    # 1. rewrite query
    # 2. extract necessary terms
    # 3. collect dfs for these terms from the Searchables
    # 4. create query weight using aggregate dfs.
    # 5. distribute that weight to Searchables
    # 6. merge results
    #
    # Steps 1-4 are done here, 5+6 in the search() methods
    def create_weight(query)
      # step 1
      rewritten_query = self.rewrite(query)

      # step 2
      terms = Set.new
      rewritten_query.extract_terms(terms)

      # step 3: sum each term's doc freq over every sub-searcher,
      # relying on each searcher returning dfs in +terms+ order
      aggregated_dfs = Array.new(terms.size, 0)
      @searchers.each { |searcher|
        dfs = searcher.doc_freqs(terms)
        dfs.each_with_index { |df,i|
          aggregated_dfs[i] += df
        }
      }

      df_map = Hash.new
      terms.each_with_index { |term,i|
        df_map[term] = aggregated_dfs[i]
      }

      # step 4: the dummy searcher serves the aggregated dfs to the
      # query's weight computation
      cache_sim = CachedDfSource.new(df_map, self.max_doc, self.similarity)

      return rewritten_query.weight(cache_sim)
    end


    # Runs the query against every sub-searcher with a shared,
    # globally-computed Weight, then merges the per-searcher TopDocs
    # into a single result (steps 5+6 of create_weight's scheme).
    #
    # options:: :filter, :first_doc (>= 0), :num_docs (> 0), :sort
    #           (:sort is not implemented yet and raises)
    def search(query, options = {})
      filter = options[:filter]
      first_doc = options[:first_doc]||0
      num_docs = options[:num_docs]||10
      max_size = first_doc + num_docs
      sort = options[:sort]

      if (num_docs <= 0)
        raise ArgumentError, "num_docs must be > 0 to run a search"
      end

      if (first_doc < 0)
        raise ArgumentError, "first_doc must be >= 0 to run a search"
      end


      if (sort)
        raise NotImplementedError
        #fields = sort.is_a?(Array) ? sort : sort.fields
        #hq = FieldDocSortedHitQueue.new(fields, max_size)
      else
        hq = HitQueue.new(max_size)
      end

      total_hits = 0
      weight = create_weight(query)
      @searchers.each_with_index { |searcher,i| # search each searcher
        # each sub-search fetches the full max_size window from doc 0;
        # paging (first_doc) is applied after the global merge below
        docs = searcher.search(weight,
                               :filter => filter,
                               #:sort => sort,
                               :num_docs => max_size,
                               :first_doc => 0)
        total_hits += docs.total_hits # update total_hits
        docs.score_docs.each { |score_doc|
          score_doc.doc += @starts[i] # convert doc to the global number
          break unless hq.insert(score_doc) # no more scores > min_score
        }
      }

      # drain the queue from the bottom: pop yields ascending scores, so
      # unshift produces score_docs in descending-score order
      score_docs = []
      if (hq.size > first_doc)
        if (hq.size - first_doc) < num_docs
          num_docs = hq.size - first_doc
        end
        num_docs.times do
          score_docs.unshift(hq.pop)
        end
      end
      hq.clear

      return TopDocs.new(total_hits, score_docs)
    end

    # Yields every hit (doc, score) from every sub-searcher to +block+.
    # NOTE(review): doc numbers passed to the block are each searcher's
    # local numbers, not global ones -- confirm callers expect this.
    def search_each(query, filter = nil, &block)
      weight = create_weight(query)
      @searchers.each { |searcher| # search each searcher
        searcher.search_each(weight, filter, &block)
      }
    end

    # rewrites the query into a query that can be processed by the search
    # methods. For example, a Fuzzy query is turned into a massive boolean
    # query.
    #
    # original:: The original query to be rewritten.
    def rewrite(original)
      #print "multi_searcher#rewrite: #{original}\n"
      queries = []
      @searchers.each { |searcher|
        queries << searcher.rewrite(original)
      }
      return queries.first.combine(queries)
    end

    # Returns an Explanation that describes how +doc+ scored against
    # +query+.  +doc+ is a global document number; it is translated to
    # the owning sub-searcher's local number before delegating.
    #
    # This is intended to be used in developing Similarity implementations,
    # and, for good performance, should not be displayed with every hit.
    # Computing an explanation is as expensive as executing the query over the
    # entire index.
    def explain(query, doc)
      i = sub_searcher(doc)
      return @searchers[i].explain(create_weight(query), doc-@starts[i])
    end

  end
222
+
223
+ end
224
+
225
+
226
+ # Document Frequency cache acting as a Dummy-Searcher.
227
+ # This class is no full-fledged Searcher, but only supports
228
+ # the methods necessary to initialize Weights.
229
+ class CachedDfSource
230
+
231
+ attr_reader :max_doc, :similarity
232
+
233
+ def initialize(df_map, max_doc, similarity)
234
+ @df_map = df_map
235
+ @max_doc = max_doc
236
+ @similarity = similarity
237
+ end
238
+
239
+ def doc_freq(term)
240
+ return @df_map[term]
241
+ end
242
+
243
+ def doc_freqs(terms)
244
+ result = Array.new
245
+ terms.each { |term|
246
+ result << doc_freq(term)
247
+ }
248
+ return result
249
+ end
250
+
251
+ def rewrite(query)
252
+ # this is a bit of a hack. We know that a query which
253
+ # creates a Weight based on this Dummy-Searcher is
254
+ # always already rewritten (see preparedWeight()).
255
+ # Therefore we just return the unmodified query here
256
+ return query
257
+ end
258
+
259
+ end
260
+
261
+ end