ferret 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/w32_io.c
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
#ifdef WIN32
|
2
|
+
|
3
|
+
#include "global.h"
|
4
|
+
#include "store.h"
|
5
|
+
#include <stdio.h>
|
6
|
+
#include <io.h>
|
7
|
+
#include <errno.h>
|
8
|
+
#include <string.h>
|
9
|
+
|
10
|
+
/**
 * Build the path of a file inside a directory, joining the two parts with
 * the Windows path separator ('\').
 *
 * @param buf      destination buffer; the write is not bounded, so it must be
 *                 large enough for base, the separator, filename and the
 *                 terminating NUL
 * @param base     directory part of the path
 * @param filename name of the file within base
 * @return buf, so the call can be nested inside another expression
 */
char *join_path(char *buf, const char *base, const char *filename)
{
    /* "\\" in the format is a single literal backslash between the parts */
    (void)sprintf(buf, "%s\\%s", base, filename);

    return buf;
}
|
19
|
+
|
20
|
+
bool exists(char *path)
|
21
|
+
{
|
22
|
+
int fd = _open(path, 0);
|
23
|
+
if (fd < 0) {
|
24
|
+
if (errno != ENOENT) {
|
25
|
+
RAISE(IO_ERROR, strerror(errno));
|
26
|
+
}
|
27
|
+
return false;
|
28
|
+
}
|
29
|
+
_close(fd);
|
30
|
+
return true;
|
31
|
+
}
|
32
|
+
|
33
|
+
int fcount(char *path)
|
34
|
+
{
|
35
|
+
char buf[MAX_FILE_PATH];
|
36
|
+
struct _finddata_t fd;
|
37
|
+
intptr_t d;
|
38
|
+
int cnt = 0;
|
39
|
+
|
40
|
+
join_path(buf, path, "*");
|
41
|
+
|
42
|
+
if ((d = _findfirst(buf, &fd)) < 0) {
|
43
|
+
RAISE(IO_ERROR, strerror(errno));
|
44
|
+
}
|
45
|
+
|
46
|
+
do {
|
47
|
+
if (fd.name[0] != '.') {
|
48
|
+
cnt++;
|
49
|
+
}
|
50
|
+
} while (_findnext(d, &fd) == 0);
|
51
|
+
_findclose(d);
|
52
|
+
|
53
|
+
return cnt;
|
54
|
+
}
|
55
|
+
|
56
|
+
void dir_each(char *path, void (*func)(char *fname, void *arg), void *arg)
|
57
|
+
{
|
58
|
+
char buf[MAX_FILE_PATH];
|
59
|
+
struct _finddata_t fd;
|
60
|
+
intptr_t d;
|
61
|
+
join_path(buf, path, "*");
|
62
|
+
|
63
|
+
if ((d = _findfirst(buf, &fd)) < 0) {
|
64
|
+
RAISE(IO_ERROR, strerror(errno));
|
65
|
+
}
|
66
|
+
|
67
|
+
while (_findnext(d, &fd) == 0) {
|
68
|
+
if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
|
69
|
+
func(fd.name, arg);
|
70
|
+
}
|
71
|
+
}
|
72
|
+
_findclose(d);
|
73
|
+
}
|
74
|
+
|
75
|
+
/**
|
76
|
+
* Clear all the locks in the store.
|
77
|
+
*
|
78
|
+
* @param store the store to clear the locks from
|
79
|
+
* @throws IO_ERROR if there is an error opening the directory
|
80
|
+
*/
|
81
|
+
void fs_clear_locks(Store *store)
|
82
|
+
{
|
83
|
+
char buf[MAX_FILE_PATH];
|
84
|
+
struct _finddata_t fd;
|
85
|
+
intptr_t d;
|
86
|
+
join_path(buf, store->dir.path, "*");
|
87
|
+
|
88
|
+
if ((d = _findfirst(buf, &fd)) < 0) {
|
89
|
+
RAISE(IO_ERROR, strerror(errno));
|
90
|
+
}
|
91
|
+
|
92
|
+
while (_findnext(d, &fd) == 0) {
|
93
|
+
if (file_is_lock(fd.name)) {
|
94
|
+
remove(join_path(buf, store->dir.path, fd.name));
|
95
|
+
}
|
96
|
+
}
|
97
|
+
_findclose(d);
|
98
|
+
}
|
99
|
+
|
100
|
+
/**
|
101
|
+
* Clear all files from the store except the lock files.
|
102
|
+
*
|
103
|
+
* @param store the store to clear all the files from
|
104
|
+
* @throws IO_ERROR if there is an error deleting the files
|
105
|
+
*/
|
106
|
+
void fs_clear(Store *store)
|
107
|
+
{
|
108
|
+
char buf[MAX_FILE_PATH];
|
109
|
+
struct _finddata_t fd;
|
110
|
+
intptr_t d;
|
111
|
+
join_path(buf, store->dir.path, "*");
|
112
|
+
|
113
|
+
if ((d = _findfirst(buf, &fd)) < 0) {
|
114
|
+
RAISE(IO_ERROR, strerror(errno));
|
115
|
+
}
|
116
|
+
|
117
|
+
while (_findnext(d, &fd) == 0) {
|
118
|
+
if (fd.name[0] != '.' && !file_is_lock(fd.name)) {
|
119
|
+
remove(join_path(buf, store->dir.path, fd.name));
|
120
|
+
}
|
121
|
+
}
|
122
|
+
_findclose(d);
|
123
|
+
}
|
124
|
+
|
125
|
+
/**
|
126
|
+
* Clear all files from the store including the lock files.
|
127
|
+
*
|
128
|
+
* @param store the store to clear all the files from
|
129
|
+
* @throws IO_ERROR if there is an error deleting the files
|
130
|
+
*/
|
131
|
+
void fs_clear_all(Store *store)
|
132
|
+
{
|
133
|
+
char buf[MAX_FILE_PATH];
|
134
|
+
struct _finddata_t fd;
|
135
|
+
intptr_t d;
|
136
|
+
join_path(buf, store->dir.path, "*");
|
137
|
+
|
138
|
+
if ((d = _findfirst(buf, &fd)) < 0) {
|
139
|
+
RAISE(IO_ERROR, strerror(errno));
|
140
|
+
}
|
141
|
+
|
142
|
+
while (_findnext(d, &fd) == 0) {
|
143
|
+
if (fd.name[0] != '.') {
|
144
|
+
remove(join_path(buf, store->dir.path, fd.name));
|
145
|
+
}
|
146
|
+
}
|
147
|
+
_findclose(d);
|
148
|
+
}
|
149
|
+
|
150
|
+
#endif
|
data/lib/ferret.rb
CHANGED
@@ -13,14 +13,15 @@ module Ferret::Analysis
|
|
13
13
|
# addresses, phone numbers, etc.
|
14
14
|
|
15
15
|
class StandardTokenizer < RegExpTokenizer
|
16
|
-
ALPHA = /[[:alpha:]]+/
|
16
|
+
ALPHA = /[[:alpha:]_-]+/
|
17
17
|
APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
|
18
18
|
ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
|
19
19
|
P = /[_\/.,-]/
|
20
20
|
HASDIGIT = /\w*\d\w*/
|
21
|
-
TOKEN_RE =
|
22
|
-
|\.(
|
21
|
+
TOKEN_RE = /#{ALPHA}+(('#{ALPHA}+)+
|
22
|
+
|\.(#{ALPHA}\.)+
|
23
23
|
|(@|\&)\w+([-.]\w+)*
|
24
|
+
|:\/\/\w+([-.\/]\w+)*
|
24
25
|
)
|
25
26
|
|\w+(([\-._]\w+)*\@\w+([-.]\w+)+
|
26
27
|
|#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
|
@@ -104,7 +104,7 @@ module Ferret
|
|
104
104
|
# Retrieve the field_info object by either field number or field name.
|
105
105
|
def [](index)
|
106
106
|
if index.is_a? Integer
|
107
|
-
if index
|
107
|
+
if index >= NOT_A_FIELD || index < 0 # < 0 is for C extensions
|
108
108
|
return FieldInfo.new("", false, NOT_A_FIELD, false)
|
109
109
|
end
|
110
110
|
return @fi_array[index]
|
data/lib/ferret/index/term.rb
CHANGED
@@ -11,7 +11,7 @@ module Ferret
|
|
11
11
|
|
12
12
|
class QueryParser < Racc::Parser
|
13
13
|
|
14
|
-
module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..
|
14
|
+
module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id155b60f3fb', 'lib/ferret/query_parser/query_parser.y', 126
|
15
15
|
attr_accessor :default_field, :fields, :handle_parse_errors
|
16
16
|
|
17
17
|
def initialize(default_field = "*", options = {})
|
@@ -20,7 +20,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
|
|
20
20
|
default_field = default_field.split("|")
|
21
21
|
end
|
22
22
|
@field = @default_field = default_field
|
23
|
-
@analyzer = options[:analyzer] || Analysis::
|
23
|
+
@analyzer = options[:analyzer] || Analysis::StandardAnalyzer.new
|
24
24
|
@wild_lower = options[:wild_lower].nil? ? true : options[:wild_lower]
|
25
25
|
@occur_default = options[:occur_default] || BooleanClause::Occur::SHOULD
|
26
26
|
@default_slop = options[:default_slop] || 0
|
@@ -170,23 +170,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
|
|
170
170
|
end
|
171
171
|
|
172
172
|
def get_bad_query(field, str)
|
173
|
-
get_term_query(field, str)
|
174
|
-
#tokens = []
|
175
|
-
#stream = @analyzer.token_stream(field, str)
|
176
|
-
#while token = stream.next
|
177
|
-
# tokens << token
|
178
|
-
#end
|
179
|
-
#if tokens.length == 0
|
180
|
-
# return TermQuery.new(Term.new(field, ""))
|
181
|
-
#elsif tokens.length == 1
|
182
|
-
# return TermQuery.new(Term.new(field, tokens[0].text))
|
183
|
-
#else
|
184
|
-
# bq = BooleanQuery.new()
|
185
|
-
# tokens.each do |token|
|
186
|
-
# bq << BooleanClause.new(TermQuery.new(Term.new(field, token.text)))
|
187
|
-
# end
|
188
|
-
# return bq
|
189
|
-
#end
|
173
|
+
get_term_query(field, str) || BooleanQuery.new()
|
190
174
|
end
|
191
175
|
|
192
176
|
def get_range_query(field, start_word, end_word, inc_upper, inc_lower)
|
@@ -200,7 +184,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
|
|
200
184
|
tokens << token
|
201
185
|
end
|
202
186
|
if tokens.length == 0
|
203
|
-
return
|
187
|
+
return nil
|
204
188
|
elsif tokens.length == 1
|
205
189
|
return TermQuery.new(Term.new(field, tokens[0].text))
|
206
190
|
else
|
@@ -365,14 +349,14 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
|
|
365
349
|
|
366
350
|
def get_boolean_query(clauses)
|
367
351
|
# possible that we got all nil clauses so check
|
368
|
-
|
352
|
+
bq = BooleanQuery.new()
|
353
|
+
return bq if clauses.nil?
|
369
354
|
clauses.compact!
|
370
|
-
return
|
355
|
+
return bq if clauses.size == 0
|
371
356
|
|
372
357
|
if clauses.size == 1 and not clauses[0].prohibited?
|
373
358
|
return clauses[0].query
|
374
359
|
end
|
375
|
-
bq = BooleanQuery.new()
|
376
360
|
clauses.each {|clause| bq << clause }
|
377
361
|
return bq
|
378
362
|
end
|
@@ -414,7 +398,7 @@ module_eval <<'..end lib/ferret/query_parser/query_parser.y modeval..id94697a994
|
|
414
398
|
return qp.parse(query)
|
415
399
|
end
|
416
400
|
|
417
|
-
..end lib/ferret/query_parser/query_parser.y modeval..
|
401
|
+
..end lib/ferret/query_parser/query_parser.y modeval..id155b60f3fb
|
418
402
|
|
419
403
|
##### racc 1.4.4 generates ###
|
420
404
|
|
data/lib/ferret/search.rb
CHANGED
@@ -104,7 +104,13 @@ module Ferret::Search
|
|
104
104
|
raise ArgumentError, "first_doc must be >= 0 to run a search"
|
105
105
|
end
|
106
106
|
|
107
|
-
|
107
|
+
# for MultiSearcher: the weight is computed across all searchers
|
108
|
+
if query.is_a? Weight
|
109
|
+
scorer = query.scorer(@reader)
|
110
|
+
else
|
111
|
+
scorer = query.weight(self).scorer(@reader)
|
112
|
+
end
|
113
|
+
|
108
114
|
if (scorer == nil)
|
109
115
|
return TopDocs.new(0, [])
|
110
116
|
end
|
@@ -117,14 +123,10 @@ module Ferret::Search
|
|
117
123
|
hq = HitQueue.new(max_size)
|
118
124
|
end
|
119
125
|
total_hits = 0
|
120
|
-
min_score = 0.0
|
121
126
|
scorer.each_hit() do |doc, score|
|
122
127
|
if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
|
123
128
|
total_hits += 1
|
124
|
-
|
125
|
-
hq.insert(ScoreDoc.new(doc, score))
|
126
|
-
min_score = hq.top.score # maintain min_score
|
127
|
-
end
|
129
|
+
hq.insert(ScoreDoc.new(doc, score))
|
128
130
|
end
|
129
131
|
end
|
130
132
|
|
@@ -148,7 +150,12 @@ module Ferret::Search
|
|
148
150
|
# usually want your hits sorted at least by score so you should use the
|
149
151
|
# #search method.
|
150
152
|
def search_each(query, filter = nil)
|
151
|
-
|
153
|
+
# for MultiSearcher: the weight is computed across all searchers
|
154
|
+
if query.is_a? Weight
|
155
|
+
scorer = query.scorer(@reader)
|
156
|
+
else
|
157
|
+
scorer = query.weight(self).scorer(@reader)
|
158
|
+
end
|
152
159
|
return if scorer == nil
|
153
160
|
bits = (filter.nil? ? nil : filter.bits(@reader))
|
154
161
|
scorer.each_hit() do |doc, score|
|
@@ -175,13 +182,19 @@ module Ferret::Search
|
|
175
182
|
|
176
183
|
# Returns an Explanation that describes how +doc+ scored against
|
177
184
|
# +query+.
|
185
|
+
# A weight may be given as first parameter instead of the query, too.
|
178
186
|
#
|
179
187
|
# This is intended to be used in developing Similarity implementations,
|
180
188
|
# and, for good performance, should not be displayed with every hit.
|
181
189
|
# Computing an explanation is as expensive as executing the query over the
|
182
190
|
# entire index.
|
183
191
|
def explain(query, doc)
|
184
|
-
|
192
|
+
if query.is_a? Weight
|
193
|
+
weight = query
|
194
|
+
else
|
195
|
+
weight = query.weight(self)
|
196
|
+
end
|
197
|
+
return weight.explain(@reader, doc)
|
185
198
|
end
|
186
199
|
end
|
187
200
|
end
|
@@ -181,6 +181,13 @@ module Ferret::Search
|
|
181
181
|
end
|
182
182
|
end
|
183
183
|
|
184
|
+
# See Query#extract_terms()
#
# Merges every array held in @term_arrays into +query_terms+.
def extract_terms(query_terms)
  @term_arrays.each do |terms|
    query_terms.merge(terms)
  end
end
|
190
|
+
|
184
191
|
def create_weight(searcher)
|
185
192
|
return MultiPhraseWeight.new(self, searcher)
|
186
193
|
end
|
@@ -0,0 +1,261 @@
|
|
1
|
+
module Ferret::Search
|
2
|
+
|
3
|
+
# Implements searching multiple IndexSearchers at once
#
# Applications usually need only call the #search or #search_each methods.
# For performance reasons it is recommended to open only one Searcher and
# use it for all of your searches.
#
# Document numbers taken and returned by this class are global: the
# documents of each sub-searcher are offset by the summed max_doc of the
# searchers that precede it in the array given to #new.
class MultiSearcher
  include Ferret::Index

  attr_accessor :similarity, :searchers

  # Creates a MultiSearcher searching across all the searchers
  # in the provided array.
  #
  # args:: array of searchers; it is copied, so the caller's array is not
  #        shared with this object
  def initialize(args)
    @searchers = Array.new(args)
    @similarity = Similarity.default

    # @starts[i] is the global number of the first document of searcher i;
    # the extra last slot holds the overall max_doc
    @max_doc = 0
    @starts = Array.new(@searchers.size + 1)
    @searchers.each_with_index { |searcher, i|
      @starts[i] = @max_doc
      @max_doc += searcher.max_doc
    }
    @starts[@searchers.size] = @max_doc
  end

  # Closes all underlying Searchers
  def close()
    @searchers.each { |searcher| searcher.close() }
  end

  # Expert: Returns the number of documents containing +term+, summed over
  # all sub-searchers.
  # Called by search code to compute term weights.
  # See IndexReader#doc_freq
  def doc_freq(term)
    return @searchers.inject(0) { |df, searcher|
      df + searcher.doc_freq(term)
    }
  end

  # Expert: For each term in the terms array, calculates the number of
  # documents containing +term+. Returns an array with these
  # document frequencies, in the iteration order of +terms+.
  def doc_freqs(terms)
    result = Array.new
    terms.each { |term| result << doc_freq(term) }
    return result
  end

  # Expert: Returns the stored fields of document +n+.
  #
  # n:: global document number
  #
  # See IndexReader#get_document
  def doc(n)
    i = sub_searcher(n)
    return @searchers[i].doc(n - @starts[i])
  end

  # Returns index of the searcher for document +n+ in the
  # array used to construct this searcher.
  #
  # Binary search over @starts for the last searcher whose start is <= n.
  def sub_searcher(n)
    lo = 0
    hi = @searchers.size - 1

    while hi >= lo do
      mid = (lo + hi) >> 1
      mid_value = @starts[mid]
      if n < mid_value
        hi = mid - 1
      elsif n > mid_value
        lo = mid + 1
      else
        # Found a match. Searchers without documents share the same start,
        # so scan forward to the last searcher starting at this value.
        while mid + 1 < @searchers.size && @starts[mid + 1] == mid_value do
          mid += 1
        end
        return mid
      end
    end
    return hi
  end

  # Returns the document number of document +n+ within its
  # sub-index.
  def sub_doc(n)
    return n - @starts[sub_searcher(n)]
  end

  # Expert: Returns one greater than the largest possible document number.
  # Called by search code to compute term weights.
  # See IndexReader#max_doc
  def max_doc
    return @max_doc
  end

  # Create weight in multiple index scenario.
  #
  # Distributed query processing is done in the following steps:
  # 1. rewrite query
  # 2. extract necessary terms
  # 3. collect dfs for these terms from the Searchables
  # 4. create query weight using aggregate dfs.
  # 5. distribute that weight to Searchables
  # 6. merge results
  #
  # Steps 1-4 are done here, 5+6 in the search() methods
  def create_weight(query)
    # step 1
    rewritten_query = self.rewrite(query)

    # step 2
    terms = Set.new
    rewritten_query.extract_terms(terms)

    # step 3
    aggregated_dfs = Array.new(terms.size, 0)
    @searchers.each { |searcher|
      dfs = searcher.doc_freqs(terms)
      dfs.each_with_index { |df, i|
        aggregated_dfs[i] += df
      }
    }

    # pair each term with its summed frequency; relies on +terms+ being
    # iterated in the same order as in step 3
    df_map = Hash.new
    terms.each_with_index { |term, i|
      df_map[term] = aggregated_dfs[i]
    }

    # step 4
    cache_sim = CachedDfSource.new(df_map, self.max_doc, self.similarity)

    return rewritten_query.weight(cache_sim)
  end

  # Runs +query+ against every sub-searcher and merges the hits into a
  # single TopDocs holding global document numbers.
  #
  # options:: :filter    - filter handed to every sub-searcher
  #           :first_doc - number of leading hits to skip (default 0)
  #           :num_docs  - maximum number of hits to return (default 10)
  #           :sort      - not supported; raises NotImplementedError if set
  #
  # Raises ArgumentError if :num_docs <= 0 or :first_doc < 0.
  def search(query, options = {})
    filter = options[:filter]
    first_doc = options[:first_doc] || 0
    num_docs = options[:num_docs] || 10
    max_size = first_doc + num_docs
    sort = options[:sort]

    if (num_docs <= 0)
      raise ArgumentError, "num_docs must be > 0 to run a search"
    end

    if (first_doc < 0)
      raise ArgumentError, "first_doc must be >= 0 to run a search"
    end

    # sorted searches across several searchers are not implemented
    raise NotImplementedError if sort

    hq = HitQueue.new(max_size)

    total_hits = 0
    weight = create_weight(query)
    @searchers.each_with_index { |searcher, i|   # search each searcher
      # every sub-searcher must deliver its best max_size hits, since all
      # of the requested window could come from a single searcher
      docs = searcher.search(weight,
                             :filter => filter,
                             :num_docs => max_size,
                             :first_doc => 0)
      total_hits += docs.total_hits              # update total_hits
      docs.score_docs.each { |score_doc|
        score_doc.doc += @starts[i]              # convert to global doc number
        break unless hq.insert(score_doc)        # no more scores > min_score
      }
    }

    score_docs = []
    if (hq.size > first_doc)
      if (hq.size - first_doc) < num_docs
        num_docs = hq.size - first_doc
      end
      # unshift reverses the order in which the queue pops its entries
      num_docs.times do
        score_docs.unshift(hq.pop)
      end
    end
    hq.clear

    return TopDocs.new(total_hits, score_docs)
  end

  # Runs +query+ against every sub-searcher and passes each hit to the
  # given block as (doc, score), where doc is the global document number.
  #
  # NOTE(review): assumes the sub-searchers' #search_each yields
  # (doc, score) pairs -- confirm against IndexSearcher#search_each.
  def search_each(query, filter = nil, &block)
    weight = create_weight(query)
    @searchers.each_with_index { |searcher, i|   # search each searcher
      offset = @starts[i]
      searcher.search_each(weight, filter) { |doc, score|
        # sub-searchers report local document numbers; convert them so the
        # result can be fed straight back into #doc or #explain
        block.call(doc + offset, score)
      }
    }
  end

  # rewrites the query into a query that can be processed by the search
  # methods. For example, a Fuzzy query is turned into a massive boolean
  # query.
  #
  # Each sub-searcher rewrites the query itself; the results are merged
  # through the first rewritten query's #combine.
  #
  # original:: The original query to be rewritten.
  def rewrite(original)
    queries = []
    @searchers.each { |searcher|
      queries << searcher.rewrite(original)
    }
    return queries.first.combine(queries)
  end

  # Returns an Explanation that describes how +doc+ scored against
  # +query+.
  #
  # doc:: global document number
  #
  # This is intended to be used in developing Similarity implementations,
  # and, for good performance, should not be displayed with every hit.
  # Computing an explanation is as expensive as executing the query over the
  # entire index.
  def explain(query, doc)
    i = sub_searcher(doc)
    return @searchers[i].explain(create_weight(query), doc - @starts[i])
  end

end
|
224
|
+
|
225
|
+
|
226
|
+
# Document Frequency cache acting as a Dummy-Searcher.
# This class is no full-fledged Searcher, but only supports
# the methods necessary to initialize Weights.
class CachedDfSource

  attr_reader :max_doc, :similarity

  # df_map::     maps a term to its document frequency
  # max_doc::    value reported by #max_doc
  # similarity:: value reported by #similarity
  def initialize(df_map, max_doc, similarity)
    @df_map, @max_doc, @similarity = df_map, max_doc, similarity
  end

  # Returns whatever the map passed to #new holds for +term+.
  def doc_freq(term)
    @df_map[term]
  end

  # Returns an array with the #doc_freq of every element of +terms+, in
  # iteration order.
  def doc_freqs(terms)
    terms.map { |term| doc_freq(term) }
  end

  # Returns +query+ unmodified. This is a bit of a hack: a query that
  # creates a Weight based on this Dummy-Searcher is expected to have been
  # rewritten already, so there is nothing left to do here.
  def rewrite(query)
    query
  end

end
|
260
|
+
|
261
|
+
end
|