ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,34 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# Lower-level search API.
|
3
|
-
#
|
4
|
-
# HitCollectors are primarily meant to be used to implement queries, sorting
|
5
|
-
# and filtering.
|
6
|
-
#
|
7
|
-
# See Searcher#search(Query, HitCollector)
|
8
|
-
class HitCollector
|
9
|
-
# Called once for every non-zero scoring document, with the document number
|
10
|
-
# and its score.
|
11
|
-
#
|
12
|
-
# If, for example, an application wished to collect all of the hits for a
|
13
|
-
# query in a BitSet, then it might:
|
14
|
-
#
|
15
|
-
# searcher = IndexSearcher.new(index_reader)
|
16
|
-
# bits = BitSet.new(index_reader.max_doc())
|
17
|
-
# searcher.search(query, HitCollector.new()
|
18
|
-
# def collect(doc, score)
|
19
|
-
# bits.set(doc)
|
20
|
-
# end
|
21
|
-
# end
|
22
|
-
#
|
23
|
-
# NOTE: This is called in an inner search loop. For good search
|
24
|
-
# performance, implementations of this method should not call
|
25
|
-
# Searcher#doc(int) or IndexReader#document(int) on every document number
|
26
|
-
# encountered. Doing so can slow searches by an order of magnitude or more.
|
27
|
-
#
|
28
|
-
# NOTE: The +score+ passed to this method is a raw score. In other words,
|
29
|
-
# the score will not necessarily be a float whose value is between 0 and 1.
|
30
|
-
def collect(doc, score)
|
31
|
-
raise NotImplementedError
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
@@ -1,200 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
|
3
|
-
# Implements search over a single IndexReader.
|
4
|
-
#
|
5
|
-
# Applications usually need only call the inherited @link #search(Query)end
|
6
|
-
# or @link #search(Query,Filter)endmethods. For performance reasons it is
|
7
|
-
# recommended to open only one IndexSearcher and use it for all of your searches.
|
8
|
-
class IndexSearcher
|
9
|
-
include Ferret::Index
|
10
|
-
|
11
|
-
attr_accessor :similarity, :reader
|
12
|
-
|
13
|
-
# Creates a searcher searching the index in the provided directory.
|
14
|
-
#
|
15
|
-
# You need to pass one argument which should be one of the following:
|
16
|
-
#
|
17
|
-
# * An index reader which the searcher will search
|
18
|
-
# * A directory where the searcher will open an index reader to search
|
19
|
-
# * A string which represents a path to the directory to be searched
|
20
|
-
#
|
21
|
-
def initialize(arg)
|
22
|
-
if arg.is_a?(IndexReader)
|
23
|
-
@reader = arg
|
24
|
-
elsif arg.is_a?(Ferret::Store::Directory)
|
25
|
-
@reader = IndexReader.open(arg, false)
|
26
|
-
elsif arg.is_a?(String)
|
27
|
-
@dir = Ferret::Store::FSDirectory.new(arg, false)
|
28
|
-
@reader = IndexReader.open(@dir, true)
|
29
|
-
else
|
30
|
-
raise ArgumentError, "Unknown argument passed to initialize IndexReader"
|
31
|
-
end
|
32
|
-
|
33
|
-
@similarity = Similarity.default
|
34
|
-
end
|
35
|
-
|
36
|
-
# IndexSearcher was constructed with IndexSearcher(r).
|
37
|
-
# If the IndexReader was supplied implicitly by specifying a directory, then
|
38
|
-
# the IndexReader gets closed.
|
39
|
-
def close()
|
40
|
-
@reader.close()
|
41
|
-
end
|
42
|
-
|
43
|
-
# Expert: Returns the number of documents containing +term+.
|
44
|
-
# Called by search code to compute term weights.
|
45
|
-
# See IndexReader#doc_freq
|
46
|
-
def doc_freq(term)
|
47
|
-
return @reader.doc_freq(term)
|
48
|
-
end
|
49
|
-
|
50
|
-
# Expert: For each term in the terms array, calculates the number of
|
51
|
-
# documents containing +term+. Returns an array with these
|
52
|
-
# document frequencies. Used to minimize number of remote calls.
|
53
|
-
def doc_freqs(terms)
|
54
|
-
result = Array.new(terms.length)
|
55
|
-
terms.each_with_index {|term, i| result[i] = doc_freq(term)}
|
56
|
-
return result
|
57
|
-
end
|
58
|
-
|
59
|
-
# Expert: Returns the stored fields of document +i+.
|
60
|
-
#
|
61
|
-
# See IndexReader#get_document
|
62
|
-
def doc(i)
|
63
|
-
return @reader.get_document(i)
|
64
|
-
end
|
65
|
-
|
66
|
-
# Expert: Returns one greater than the largest possible document number.
|
67
|
-
# Called by search code to compute term weights.
|
68
|
-
# See IndexReader#max_doc
|
69
|
-
def max_doc()
|
70
|
-
return @reader.max_doc()
|
71
|
-
end
|
72
|
-
|
73
|
-
# Creates a weight for +query+
|
74
|
-
# returns:: new weight
|
75
|
-
def create_weight(query)
|
76
|
-
return query.weight(self)
|
77
|
-
end
|
78
|
-
|
79
|
-
# The main search method for the index. You need to create a query to
|
80
|
-
# pass to this method. You can also pass a hash with one or more of the
|
81
|
-
# following; {filter, num_docs, first_doc, sort}
|
82
|
-
#
|
83
|
-
# query:: The query to run on the index
|
84
|
-
# filter:: filters docs from the search result
|
85
|
-
# first_doc:: The index in the results of the first doc retrieved.
|
86
|
-
# Default is 0
|
87
|
-
# num_docs:: The number of results returned. Default is 10
|
88
|
-
# sort:: An array of SortFields describing how to sort the results.
|
89
|
-
def search(query, options = {})
|
90
|
-
filter = options[:filter]
|
91
|
-
first_doc = options[:first_doc]||0
|
92
|
-
num_docs = options[:num_docs]||10
|
93
|
-
max_size = first_doc + num_docs
|
94
|
-
sort = options[:sort]
|
95
|
-
if sort and not sort.kind_of?(Sort)
|
96
|
-
sort = Sort.new(sort)
|
97
|
-
end
|
98
|
-
|
99
|
-
if (num_docs <= 0)
|
100
|
-
raise ArgumentError, "num_docs must be > 0 to run a search"
|
101
|
-
end
|
102
|
-
|
103
|
-
if (first_doc < 0)
|
104
|
-
raise ArgumentError, "first_doc must be >= 0 to run a search"
|
105
|
-
end
|
106
|
-
|
107
|
-
# for MultiSearcher: the weight is computed across all searchers
|
108
|
-
if query.is_a? Weight
|
109
|
-
scorer = query.scorer(@reader)
|
110
|
-
else
|
111
|
-
scorer = query.weight(self).scorer(@reader)
|
112
|
-
end
|
113
|
-
|
114
|
-
if (scorer == nil)
|
115
|
-
return TopDocs.new(0, [])
|
116
|
-
end
|
117
|
-
|
118
|
-
bits = (filter.nil? ? nil : filter.bits(@reader))
|
119
|
-
if (sort)
|
120
|
-
fields = sort.is_a?(Array) ? sort : sort.fields
|
121
|
-
hq = FieldSortedHitQueue.new(@reader, fields, max_size)
|
122
|
-
else
|
123
|
-
hq = HitQueue.new(max_size)
|
124
|
-
end
|
125
|
-
total_hits = 0
|
126
|
-
scorer.each_hit() do |doc, score|
|
127
|
-
if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
|
128
|
-
total_hits += 1
|
129
|
-
hq.insert(ScoreDoc.new(doc, score))
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
score_docs = []
|
134
|
-
if (hq.size > first_doc)
|
135
|
-
if (hq.size - first_doc) < num_docs
|
136
|
-
num_docs = hq.size - first_doc
|
137
|
-
end
|
138
|
-
num_docs.times do
|
139
|
-
score_docs.unshift(hq.pop)
|
140
|
-
end
|
141
|
-
end
|
142
|
-
hq.clear
|
143
|
-
|
144
|
-
return TopDocs.new(total_hits, score_docs)
|
145
|
-
end
|
146
|
-
|
147
|
-
# Accepts a block and iterates through all of results yielding the doc
|
148
|
-
# number and the score for that hit. The hits are unsorted. This is the
|
149
|
-
# fastest way to get all of the hits from a search. However, you will
|
150
|
-
# usually want your hits sorted at least by score so you should use the
|
151
|
-
# #search method.
|
152
|
-
def search_each(query, filter = nil)
|
153
|
-
# for MultiSearcher: the weight is computed across all searchers
|
154
|
-
if query.is_a? Weight
|
155
|
-
scorer = query.scorer(@reader)
|
156
|
-
else
|
157
|
-
scorer = query.weight(self).scorer(@reader)
|
158
|
-
end
|
159
|
-
return if scorer == nil
|
160
|
-
bits = (filter.nil? ? nil : filter.bits(@reader))
|
161
|
-
scorer.each_hit() do |doc, score|
|
162
|
-
if score > 0.0 and (bits.nil? or bits.get(doc)) # skip docs not in bits
|
163
|
-
yield(doc, score)
|
164
|
-
end
|
165
|
-
end
|
166
|
-
end
|
167
|
-
|
168
|
-
# rewrites the query into a query that can be processed by the search
|
169
|
-
# methods. For example, a Fuzzy query is turned into a massive boolean
|
170
|
-
# query.
|
171
|
-
#
|
172
|
-
# original:: The original query to be rewritten.
|
173
|
-
def rewrite(original)
|
174
|
-
query = original
|
175
|
-
rewritten_query = query.rewrite(@reader)
|
176
|
-
while query != rewritten_query
|
177
|
-
query = rewritten_query
|
178
|
-
rewritten_query = query.rewrite(@reader)
|
179
|
-
end
|
180
|
-
return query
|
181
|
-
end
|
182
|
-
|
183
|
-
# Returns an Explanation that describes how +doc+ scored against
|
184
|
-
# +query+.
|
185
|
-
# A weight may be given as first parameter instead of the query, too.
|
186
|
-
#
|
187
|
-
# This is intended to be used in developing Similarity implementations,
|
188
|
-
# and, for good performance, should not be displayed with every hit.
|
189
|
-
# Computing an explanation is as expensive as executing the query over the
|
190
|
-
# entire index.
|
191
|
-
def explain(query, doc)
|
192
|
-
if query.is_a? Weight
|
193
|
-
weight = query
|
194
|
-
else
|
195
|
-
weight = query.weight(self)
|
196
|
-
end
|
197
|
-
return weight.explain(@reader, doc)
|
198
|
-
end
|
199
|
-
end
|
200
|
-
end
|
@@ -1,104 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# A query that matches all documents.
|
3
|
-
class MatchAllQuery < Query
|
4
|
-
|
5
|
-
def initialize()
|
6
|
-
super
|
7
|
-
end
|
8
|
-
|
9
|
-
class MatchAllScorer < Scorer
|
10
|
-
|
11
|
-
def initialize(reader, similarity)
|
12
|
-
super(similarity)
|
13
|
-
@reader = reader
|
14
|
-
@count = -1
|
15
|
-
@max_doc = reader.max_doc
|
16
|
-
end
|
17
|
-
|
18
|
-
def doc()
|
19
|
-
return @count
|
20
|
-
end
|
21
|
-
|
22
|
-
def explain(doc)
|
23
|
-
return Explanation.new(1.0, "MatchAllQuery")
|
24
|
-
end
|
25
|
-
|
26
|
-
def next?
|
27
|
-
while (@count < (@max_doc - 1))
|
28
|
-
@count += 1
|
29
|
-
if (!@reader.deleted?(@count))
|
30
|
-
return true
|
31
|
-
end
|
32
|
-
end
|
33
|
-
return false
|
34
|
-
end
|
35
|
-
|
36
|
-
def score()
|
37
|
-
return 1.0
|
38
|
-
end
|
39
|
-
|
40
|
-
def skip_to(target)
|
41
|
-
@count = target - 1
|
42
|
-
return next?
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
class MatchAllWeight < Weight
|
47
|
-
attr_reader :query
|
48
|
-
def initialize(query, searcher)
|
49
|
-
@query = query
|
50
|
-
@searcher = searcher
|
51
|
-
end
|
52
|
-
|
53
|
-
def to_s()
|
54
|
-
return "weight(#{@query})"
|
55
|
-
end
|
56
|
-
|
57
|
-
def value()
|
58
|
-
return 1.0
|
59
|
-
end
|
60
|
-
|
61
|
-
def sum_of_squared_weights()
|
62
|
-
return 1.0
|
63
|
-
end
|
64
|
-
|
65
|
-
def normalize(query_norm)
|
66
|
-
end
|
67
|
-
|
68
|
-
def scorer(reader)
|
69
|
-
return MatchAllScorer.new(reader, @query.similarity(@searcher))
|
70
|
-
end
|
71
|
-
|
72
|
-
def explain(reader, doc)
|
73
|
-
# explain query weight
|
74
|
-
query_expl = Explanation.new(1.0, "MatchAllQuery")
|
75
|
-
boost_expl = Explanation.new(@query.boost, "boost")
|
76
|
-
if (boost_expl.value != 1.0)
|
77
|
-
query_expl << boost_expl
|
78
|
-
query_expl.value = boost_expl.value
|
79
|
-
end
|
80
|
-
|
81
|
-
return query_expl
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
def create_weight(searcher)
|
86
|
-
return MatchAllWeight.new(self, searcher)
|
87
|
-
end
|
88
|
-
|
89
|
-
def to_s(field)
|
90
|
-
buffer = "MatchAllQuery"
|
91
|
-
buffer << "^#{boost}" if (boost() != 1.0)
|
92
|
-
return buffer
|
93
|
-
end
|
94
|
-
|
95
|
-
def eql?(o)
|
96
|
-
return (o.instance_of?(MatchAllQuery) and boost == o.boost)
|
97
|
-
end
|
98
|
-
alias :== :eql?
|
99
|
-
|
100
|
-
def hash
|
101
|
-
return boost.hash
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
@@ -1,216 +0,0 @@
|
|
1
|
-
module Ferret::Search
|
2
|
-
# MultiPhraseQuery is a generalized version of PhraseQuery, with an added
|
3
|
-
# method #add(Term[]).
|
4
|
-
#
|
5
|
-
# To use this class, to search for the phrase "Microsoft app*" first use
|
6
|
-
# add(Term) on the term "Microsoft", then find all terms that have "app" as
|
7
|
-
# prefix using IndexReader.terms(Term), and use MultiPhraseQuery.add(Term[]
|
8
|
-
# terms) to add them to the query.
|
9
|
-
#
|
10
|
-
# Author Anders Nielsen
|
11
|
-
class MultiPhraseQuery < Query
|
12
|
-
include Ferret::Index
|
13
|
-
|
14
|
-
attr_accessor :slop
|
15
|
-
attr_reader :positions, :term_arrays, :field
|
16
|
-
|
17
|
-
def initialize()
|
18
|
-
super()
|
19
|
-
@slop = 0
|
20
|
-
@term_arrays = []
|
21
|
-
@positions = []
|
22
|
-
@field = nil
|
23
|
-
end
|
24
|
-
|
25
|
-
# Allows to specify the relative position of terms within the phrase.
|
26
|
-
#
|
27
|
-
# See PhraseQuery#add(Term, int)
|
28
|
-
# terms:: the array of terms to search for or a single term
|
29
|
-
# position:: the position to search for these terms
|
30
|
-
def add(terms, position = nil, pos_inc = 1)
|
31
|
-
if position.nil?
|
32
|
-
position = (@positions.size > 0) ? (@positions[-1] + pos_inc) : 0
|
33
|
-
end
|
34
|
-
|
35
|
-
if terms.instance_of?(Term)
|
36
|
-
terms = [terms]
|
37
|
-
end
|
38
|
-
|
39
|
-
if (@term_arrays.size == 0)
|
40
|
-
@field = terms[0].field
|
41
|
-
end
|
42
|
-
|
43
|
-
terms.each do |term|
|
44
|
-
if (term.field != @field)
|
45
|
-
raise ArgumentError,
|
46
|
-
"All phrase terms must be in the same field (#{@field}): #{term}"
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
if i = @positions.index(position)
|
51
|
-
term_arrays[i] += terms
|
52
|
-
else
|
53
|
-
@term_arrays << terms
|
54
|
-
@positions << position
|
55
|
-
end
|
56
|
-
end
|
57
|
-
alias :<< :add
|
58
|
-
|
59
|
-
class MultiPhraseWeight < Weight
|
60
|
-
include Ferret::Index
|
61
|
-
|
62
|
-
attr_reader :query, :value
|
63
|
-
|
64
|
-
def initialize(query, searcher)
|
65
|
-
@query = query
|
66
|
-
@term_arrays = query.term_arrays
|
67
|
-
@positions = query.positions
|
68
|
-
@similarity = query.similarity(searcher)
|
69
|
-
@idf = 0.0
|
70
|
-
|
71
|
-
# compute idf
|
72
|
-
query.term_arrays.each do |terms|
|
73
|
-
terms.each do |term|
|
74
|
-
@idf += @similarity.idf_term(term, searcher)
|
75
|
-
end
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def sum_of_squared_weights()
|
80
|
-
@query_weight = @idf * @query.boost() # compute query weight
|
81
|
-
return @query_weight * @query_weight # square it
|
82
|
-
end
|
83
|
-
|
84
|
-
def normalize(query_norm)
|
85
|
-
@query_norm = query_norm
|
86
|
-
@query_weight *= query_norm # normalize query weight
|
87
|
-
@value = @query_weight * @idf # idf for document
|
88
|
-
end
|
89
|
-
|
90
|
-
def scorer(reader)
|
91
|
-
return nil if (@term_arrays.size == 0) # optimize zero-term case
|
92
|
-
tps = []
|
93
|
-
@term_arrays.each do |terms|
|
94
|
-
p = []
|
95
|
-
if (terms.length > 1)
|
96
|
-
p = MultipleTermDocPosEnum.new(reader, terms)
|
97
|
-
else
|
98
|
-
p = reader.term_positions_for(terms[0])
|
99
|
-
end
|
100
|
-
|
101
|
-
return nil if (p == nil)
|
102
|
-
|
103
|
-
tps << p
|
104
|
-
end
|
105
|
-
|
106
|
-
if (@query.slop == 0)
|
107
|
-
return ExactPhraseScorer.new(self, tps, @positions, @similarity,
|
108
|
-
reader.get_norms(@query.field))
|
109
|
-
else
|
110
|
-
return SloppyPhraseScorer.new(self, tps, @positions, @similarity,
|
111
|
-
@query.slop, reader.get_norms(@query.field))
|
112
|
-
end
|
113
|
-
end
|
114
|
-
|
115
|
-
def explain(reader, doc)
|
116
|
-
|
117
|
-
result = Explanation.new()
|
118
|
-
result.description = "weight(#{@query} in #{doc}), product of:"
|
119
|
-
|
120
|
-
idf_expl = Explanation.new(@idf, "idf(#{@query})")
|
121
|
-
|
122
|
-
# explain query weight
|
123
|
-
query_expl = Explanation.new()
|
124
|
-
query_expl.description = "query_weight(#{@query}), product of:"
|
125
|
-
|
126
|
-
boost = @query.boost()
|
127
|
-
if boost != 1.0
|
128
|
-
boost_expl = Explanation.new(boost, "boost")
|
129
|
-
query_expl << boost_expl
|
130
|
-
end
|
131
|
-
query_expl << idf_expl
|
132
|
-
|
133
|
-
query_norm_expl = Explanation.new(@query_norm,"query_norm")
|
134
|
-
query_expl << query_norm_expl
|
135
|
-
|
136
|
-
query_expl.value = boost * @idf * @query_norm
|
137
|
-
|
138
|
-
result << query_expl
|
139
|
-
|
140
|
-
# explain field weight
|
141
|
-
field_expl = Explanation.new()
|
142
|
-
field_expl.description =
|
143
|
-
"field_weight(#{@query} in #{doc}), product of:"
|
144
|
-
|
145
|
-
tf_expl = scorer(reader).explain(doc)
|
146
|
-
field_expl << tf_expl
|
147
|
-
field_expl << idf_expl
|
148
|
-
|
149
|
-
field_norm_expl = Explanation.new()
|
150
|
-
field_norms = reader.get_norms(@query.field)
|
151
|
-
field_norm =
|
152
|
-
field_norms ? Similarity.decode_norm(field_norms[doc]) : 0.0
|
153
|
-
field_norm_expl.value = field_norm
|
154
|
-
field_norm_expl.description =
|
155
|
-
"field_norm(field=#{@query.field}, doc=#{doc})"
|
156
|
-
field_expl << field_norm_expl
|
157
|
-
|
158
|
-
field_expl.value = tf_expl.value * idf_expl.value * field_norm_expl.value
|
159
|
-
result << field_expl
|
160
|
-
|
161
|
-
if (query_expl.value == 1.0)
|
162
|
-
return field_expl
|
163
|
-
else
|
164
|
-
result.value = query_expl.value * field_expl.value
|
165
|
-
return result
|
166
|
-
end
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
170
|
-
def rewrite(reader)
|
171
|
-
if (@term_arrays.size() == 1) # optimize one-term case
|
172
|
-
terms = @term_arrays[0]
|
173
|
-
bq = BooleanQuery.new(true)
|
174
|
-
terms.each do |term|
|
175
|
-
bq.add_query(TermQuery.new(term), BooleanClause::Occur::SHOULD)
|
176
|
-
end
|
177
|
-
bq.boost = boost()
|
178
|
-
return bq
|
179
|
-
else
|
180
|
-
return self
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
# See Query#extract_terms()
|
185
|
-
def extract_terms(query_terms)
|
186
|
-
@term_arrays.each { |terms|
|
187
|
-
query_terms.merge(terms)
|
188
|
-
}
|
189
|
-
end
|
190
|
-
|
191
|
-
def create_weight(searcher)
|
192
|
-
return MultiPhraseWeight.new(self, searcher)
|
193
|
-
end
|
194
|
-
|
195
|
-
# Prints a user-readable version of this query.
|
196
|
-
def to_s(f = nil)
|
197
|
-
buffer = ""
|
198
|
-
buffer << "#{@field}:" if @field != f
|
199
|
-
buffer << '"'
|
200
|
-
last_pos = -1
|
201
|
-
@term_arrays.each_index do |i|
|
202
|
-
terms = @term_arrays[i]
|
203
|
-
pos = @positions[i]
|
204
|
-
last_pos.upto(pos-2) {buffer << "<> "}
|
205
|
-
last_pos = pos
|
206
|
-
buffer << "#{terms.map {|term| term.text}.join("|")} "
|
207
|
-
end
|
208
|
-
buffer.rstrip!
|
209
|
-
buffer << '"'
|
210
|
-
|
211
|
-
buffer << "~#{@slop}" if (@slop != 0)
|
212
|
-
buffer << "^#{boost()}" if boost() != 1.0
|
213
|
-
return buffer
|
214
|
-
end
|
215
|
-
end
|
216
|
-
end
|