ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,32 +0,0 @@
|
|
1
|
-
module Ferret::Search
  # PhraseScorer subclass whose +phrase_freq+ counts the points at which the
  # first and last phrase-position entries line up.
  class ExactPhraseScorer < PhraseScorer

    # Forwards every argument, unchanged, to PhraseScorer#initialize.
    def initialize(weight, tps, positions, similarity, norms)
      super
    end

    # Counts matches for the current document.
    #
    # Every entry yielded by +each+ is reset with +first_position+ and pushed
    # onto +@pq+; +pq_to_list+ then rebuilds the list from the queue. After
    # that, +@first+ is advanced until it is no longer behind +@last+; each
    # time the scan settles one match is counted.
    #
    # returns:: the number of matches counted before either +@first+ or
    #           +@last+ ran out of positions.
    def phrase_freq
      # sort the list by running it through the priority queue
      each do |phrase_pos|
        phrase_pos.first_position
        @pq.push(phrase_pos)
      end
      pq_to_list

      matches = 0
      loop do
        # scan forward in @first until it has caught up with @last
        while @first.position < @last.position
          loop do
            # @first is exhausted: nothing more can match
            return matches unless @first.next_position
            break unless @first.position < @last.position
          end
          first_to_last
        end

        matches += 1 # all equal: a match
        break unless @last.next_position
      end

      matches
    end
  end
end
|
@@ -1,41 +0,0 @@
|
|
1
|
-
module Ferret::Search
  # Expert: Describes the score computation for document and query.
  #
  # An explanation holds a value, a description and a list of nested detail
  # explanations, and can be rendered as indented text or as HTML lists.
  class Explanation
    attr_accessor :value, :description, :details

    # value::       the value being explained (defaults to +nil+)
    # description:: text describing the value (defaults to +nil+)
    #
    # The list of details starts out empty.
    def initialize(value = nil, description = nil)
      @details = []
      @value = value
      @description = description
    end

    # Appends +detail+ to the list of nested details.
    def <<(detail)
      @details << detail
    end

    # Render an explanation as text.
    #
    # depth:: number of leading indent characters for this line; each nested
    #         detail is rendered with +depth + 1+.
    def to_s(depth = 0)
      text = "#{" " * depth}#{@value} = #{@description}\n"
      @details.inject(text) { |out, detail| out << detail.to_s(depth + 1) }
    end

    # Render an explanation as HTML.
    #
    # Produces one +ul+ holding an +li+ for this explanation followed by the
    # HTML of every nested detail.
    def to_html
      html = "<ul>\n<li>#{@value} = #{@description}</li>\n"
      @details.each { |detail| html << detail.to_html }
      html << "</ul>\n"
    end
  end
end
|
@@ -1,215 +0,0 @@
|
|
1
|
-
module Ferret::Search

  # Expert: The default cache implementation, storing all values in memory.
  # A WeakKeyHash is used for storage.
  class FieldCache
    include Ferret::Index

    # Result of get_string_index: +str_index+ holds, per document number, the
    # slot in +str_map+ of that document's term; +str_map+ holds the term
    # texts, with slot 0 reserved (nil) for documents without a term.
    StringIndex = Struct.new(:str_index, :str_map)

    # Expert: Every key in the internal cache is of this type.
    class Entry
      attr_reader :field, :sort_type, :comparator

      # Creates one of these objects.
      def initialize(field, sort_type, comparator = nil)
        @field = field
        @sort_type = sort_type
        @comparator = comparator
      end

      # Two of these are equal iff the other object is exactly an Entry and
      # they reference the same field, sort_type and comparator.
      def eql?(o)
        o.instance_of?(Entry) && o.field == @field &&
          o.sort_type == @sort_type && o.comparator == comparator
      end
      alias :== :eql?

      # Composes a hashcode based on the field, sort_type and comparator, so
      # that it stays consistent with eql?.
      def hash
        @field.hash ^ @sort_type.hash ^ @comparator.hash
      end
    end

    INT_PARSER = lambda {|i| i.to_i}

    FLOAT_PARSER = lambda {|i| i.to_f}

    # The internal cache. Maps a reader to a Hash that maps Entry to the
    # cached value for that field and sort type.
    @@cache = Ferret::Utils::WeakKeyHash.new

    # See if an object is in the cache.
    #
    # returns:: the cached value, or +nil+ when nothing is cached for
    #           +reader+.
    def self.lookup(reader, field, sort_type)
      entry = Entry.new(field, sort_type)
      @@cache.synchronize do
        reader_cache = @@cache[reader]
        return nil if reader_cache.nil?
        return reader_cache[entry]
      end
    end

    # Put an object into the cache, creating the per-reader Hash on first use.
    #
    # returns:: +value+
    def self.store(reader, field, sort_type, value)
      entry = Entry.new(field, sort_type)
      @@cache.synchronize do
        reader_cache = @@cache[reader]
        if reader_cache.nil?
          reader_cache = {}
          @@cache[reader] = reader_cache
        end
        return reader_cache[entry] = value
      end
    end

    # Checks the internal cache for an appropriate entry, and if none is found,
    # reads the terms in +field+ and parses them with the provided parser and
    # returns an array of size +reader.max_doc+ of the value each document has
    # in the given field.
    #
    # reader::    Used to get field values.
    # field::     Which field contains the values.
    # sort_type:: The type of sort to run on the field. Holds the parser
    # return::    The values in the given field for each document.
    # raises::    RuntimeError if the term enumeration has no current term.
    def self.get_index(reader, field, sort_type)
      index = lookup(reader, field, sort_type)
      return index unless index.nil?

      parser = sort_type.parser
      index = Array.new(reader.max_doc)
      if index.length > 0
        term_docs = reader.term_docs
        term_enum = reader.terms_from(Term.new(field, ""))
        begin
          if term_enum.term.nil?
            raise "no terms in field '#{field}' to sort by"
          end

          loop do
            term = term_enum.term
            # the enumeration has moved on to another field
            break if term.field != field

            termval = parser.call(term.text)
            term_docs.seek(term)
            index[term_docs.doc] = termval while term_docs.next?

            break unless term_enum.next?
          end
        ensure
          term_docs.close
          term_enum.close
        end
      end

      store(reader, field, sort_type, index)
      index
    end

    # Checks the internal cache for an appropriate entry, and if none is found
    # reads the term values in +field+ and returns an array of them in natural
    # order, along with an array telling which element in the term array each
    # document uses.
    #
    # reader::  Used to get field values.
    # field::   Which field contains the strings.
    # returns:: Array of terms and index into the array for each document.
    # raises::  RuntimeError if the term enumeration has no current term, or
    #           if the field has more terms than the reader has documents.
    def self.get_string_index(reader, field)
      index = lookup(reader, field, SortField::SortType::STRING)
      return index unless index.nil?

      str_index = Array.new(reader.max_doc)
      str_map = Array.new(reader.max_doc + 1)
      if str_index.length > 0
        term_docs = reader.term_docs
        term_enum = reader.terms_from(Term.new(field, ""))

        # Slot 0 is an entry for documents that have no terms in this field.
        # Should a document with no terms be at top or bottom?
        #
        # This puts them at the top - if it is changed, FieldDocSortedHitQueue
        # needs to change as well.
        str_map[0] = nil
        t = 1 # current term number

        begin
          if term_enum.term.nil?
            raise "no terms in field #{field} to sort by"
          end

          loop do
            term = term_enum.term
            break if term.field != field

            # store term text
            # we expect that there is at most one term per document
            if t >= str_map.length
              raise "there are more terms than documents in field \"#{field}\", but it's impossible to sort on tokenized fields"
            end
            str_map[t] = term.text

            term_docs.seek(term)
            str_index[term_docs.doc] = t while term_docs.next?

            t += 1
            break unless term_enum.next?
          end
        ensure
          term_docs.close
          term_enum.close
        end

        # If there are fewer terms than documents, trim off the dead array
        # space. Only the unused tail is dropped: compacting the array would
        # also delete the nil in slot 0 and shift every term down by one, so
        # the numbers stored in str_index would point at the wrong term.
        str_map = str_map[0, t] if t < str_map.length
      end

      index = StringIndex.new(str_index, str_map)
      store(reader, field, SortField::SortType::STRING, index)
      index
    end

    # Checks the internal cache for an appropriate entry, and if none is found
    # reads +field+ to see if it contains integers, floats or strings, and then
    # calls one of the other methods in this class to get the values. For
    # string values, a StringIndex is returned. After calling this method,
    # there is an entry in the cache for both type +AUTO+ and the actual found
    # type.
    #
    # The type is decided from the text of the first term only.
    #
    # reader:: Used to get field values.
    # field::  Which field contains the values.
    # return:: Integer Array, Float Array or StringIndex.
    # raises:: RuntimeError if the term enumeration has no current term, or
    #          if its first term belongs to a different field.
    def self.get_auto_index(reader, field)
      index = lookup(reader, field, SortField::SortType::AUTO)
      return index unless index.nil?

      term_enum = reader.terms_from(Term.new(field, ""))
      begin
        term = term_enum.term
        raise "no terms in field #{field} to sort by" if term.nil?
        if term.field != field
          raise "field \"#{field}\" does not appear to be indexed"
        end

        termtext = term.text.strip
        if termtext == termtext.to_i.to_s
          index = get_index(reader, field, SortField::SortType::INTEGER)
        elsif termtext == termtext.to_f.to_s || termtext == "%f" % termtext.to_f
          index = get_index(reader, field, SortField::SortType::FLOAT)
        else
          index = get_string_index(reader, field)
        end

        unless index.nil?
          store(reader, field, SortField::SortType::AUTO, index)
        end
      ensure
        term_enum.close
      end

      index
    end
  end
end
|
@@ -1,31 +0,0 @@
|
|
1
|
-
module Ferret::Search
  # Expert: A ScoreDoc that additionally carries the values used to sort the
  # document it refers to.
  #
  # Besides the document number and score, it holds one value per sort
  # field. For example, when sorting by fields "a", "b" then "c", +fields+
  # has three elements holding the document's term values in "a", "b" and
  # "c" respectively. Each element is an Integer, Float or String, depending
  # on the type of the values in the terms of the corresponding field.
  class FieldDoc < ScoreDoc

    # Expert: The values which are used to sort the referenced document, in
    # the same order as the sort criteria given by a Sort object. Each value
    # is an Integer, Float or String, depending on the type of values in the
    # terms of the original field.
    # See Sort
    # See Searcher#search(Query,Filter,int,Sort)
    attr_accessor :fields

    # Expert: Creates one of these objects with the given sort information.
    #
    # doc::    passed on to ScoreDoc
    # score::  passed on to ScoreDoc
    # fields:: the sort values for the document (defaults to +nil+)
    def initialize(doc, score, fields = nil)
      super(doc, score)
      @fields = fields
    end
  end
end
|
@@ -1,184 +0,0 @@
|
|
1
|
-
require 'monitor'
|
2
|
-
|
3
|
-
module Ferret::Search
  # Expert: A hit queue for sorting hits by terms in more than one field.
  # Uses +FieldCache+ for maintaining internal term lookup tables.
  class FieldSortedHitQueue < Ferret::Utils::PriorityQueue
    # Stores a comparator corresponding to each field being sorted by
    attr_accessor :comparators

    # Stores the sort criteria being used.
    attr_accessor :fields

    # Creates a hit queue sorted by the given list of fields.
    #
    # reader:: Index to use.
    # fields:: Sort fields, in priority order (highest priority first).
    #          Cannot be +nil+ or empty.
    # size::   The number of hits to retain. Must be greater than zero.
    # raises:: IOError
    def initialize(reader, fields, size)
      super(size)
      n = fields.length
      @comparators = Array.new(n)
      @fields = Array.new(n)
      fields.each_with_index do |field, i|
        @comparators[i] = get_cached_comparator(reader, field)
        # Rebuild the sort field with the sort type the comparator reports.
        @fields[i] = SortField.new(field.name,
                                   {:sort_type => @comparators[i].sort_type,
                                    :reverse => field.reverse?})
      end

      # Stores the maximum score value encountered, for normalizing.
      # we only care about scores greater than 1.0 - if all the scores
      # are less than 1.0, we don't have to normalize.
      @max_score = 1.0
    end

    # Returns whether +sd1+ is less relevant than +sd2+.
    #
    # sd1::     ScoreDoc
    # sd2::     ScoreDoc
    # returns:: +true+ if document +sd1+ should be sorted after document
    #           +sd2+.
    def less_than(sd1, sd2)
      # keep track of maximum score
      @max_score = sd1.score if sd1.score > @max_score
      @max_score = sd2.score if sd2.score > @max_score

      # run comparators until one of them tells the documents apart
      c = 0
      @comparators.each_with_index do |comparator, i|
        if @fields[i].reverse?
          c = comparator.compare(sd2, sd1)
        else
          c = comparator.compare(sd1, sd2)
        end
        break unless c == 0
      end

      # avoid random sort order that could lead to duplicates
      return sd1.doc > sd2.doc if c == 0

      c > 0
    end

    # Given a FieldDoc object, stores the values used
    # to sort the given document. These values are not the raw
    # values out of the index, but the internal representation
    # of them. This is so the given search hit can be collated
    # by a MultiSearcher with other search hits.
    #
    # doc::     The FieldDoc to store sort values into.
    # returns:: The same FieldDoc passed in.
    # See Searchable#search(Weight,Filter,int,Sort)
    def fill_fields(doc)
      sort_values = Array.new(@comparators.length)
      # The index must come from the iteration itself; a plain +each+ leaves
      # no index variable in scope.
      @comparators.each_with_index do |comparator, i|
        sort_values[i] = comparator.sort_value(doc)
      end
      doc.fields = sort_values
      doc
    end

    # Internal cache of comparators. Similar to FieldCache, only
    # caches comparators instead of term values.
    @@comparators = Ferret::Utils::WeakKeyHash.new

    # Returns a comparator if it is in the cache, +nil+ otherwise.
    def lookup(reader, field, sort_type, comproc)
      entry = FieldCache::Entry.new(field, sort_type, comproc)
      @@comparators.synchronize do
        reader_cache = @@comparators[reader]
        return nil if reader_cache.nil?
        return reader_cache[entry]
      end
    end

    # Stores a comparator into the cache, creating the per-reader Hash on
    # first use. Returns +value+.
    def store(reader, field, sort_type, comproc, value)
      entry = FieldCache::Entry.new(field, sort_type, comproc)
      @@comparators.synchronize do
        reader_cache = @@comparators[reader]
        if reader_cache.nil?
          reader_cache = Hash.new
          @@comparators[reader] = reader_cache
        end
        return reader_cache[entry] = value
      end
    end

    # Returns the comparator to use for +field+. DOC and SCORE sort types map
    # to the shared ScoreDocComparator constants; every other type is looked
    # up in the comparator cache and built (then cached) on a miss.
    #
    # reader::  Index to use.
    # field::   The sort field to get a comparator for.
    # returns:: Comparator for sorting hits.
    def get_cached_comparator(reader, field)
      if field.sort_type == SortField::SortType::DOC
        return ScoreDocComparator::INDEX_ORDER
      end
      if field.sort_type == SortField::SortType::SCORE
        return ScoreDocComparator::RELEVANCE
      end

      comparator = lookup(reader, field.name, field.sort_type, field.comparator)
      if comparator.nil?
        case field.sort_type
        when SortField::SortType::AUTO
          comparator = comparator_auto(reader, field.name)
        when SortField::SortType::STRING
          comparator = comparator_string(reader, field.name)
        else
          comparator = comparator_simple(reader, field)
        end

        store(reader, field.name, field.sort_type, field.comparator, comparator)
      end
      comparator
    end

    # Returns a comparator for sorting hits according to the sort type and the
    # comparator function passed.
    #
    # reader::  Index to use.
    # field::   Lets us know which field to search and how to parse it.
    # returns:: Comparator for sorting hits.
    def comparator_simple(reader, field)
      index = FieldCache.get_index(reader, field.name, field.sort_type)
      comproc = field.comparator
      if comproc
        SpecialFieldComparator.new(index, field.sort_type, comproc)
      else
        SimpleFieldComparator.new(index, field.sort_type)
      end
    end

    # Returns a comparator for sorting hits according to a field containing
    # strings.
    #
    # reader::  Index to use.
    # field::   Field containing string values.
    # returns:: Comparator for sorting hits.
    def comparator_string(reader, field)
      index = FieldCache.get_string_index(reader, field)
      StringFieldComparator.new(index)
    end

    # Returns a comparator for sorting hits according to values in the given field.
    # The terms in the field are looked at to determine whether they contain integers,
    # floats or strings. Once the type is determined, the matching comparator
    # is built.
    #
    # reader::  Index to use.
    # field::   Field containing values.
    # returns:: Comparator for sorting hits.
    # raises::  IOException If an error occurs reading the index.
    # raises::  RuntimeError if the values are neither Integer nor Float.
    def comparator_auto(reader, field)
      index = FieldCache.get_auto_index(reader, field)
      if index.is_a?(FieldCache::StringIndex)
        return StringFieldComparator.new(index)
      end

      # Documents without a term in this field leave nil in the index, so the
      # type has to be read from the first value that is actually present
      # rather than from document 0.
      sample = index.detect { |value| !value.nil? }
      if sample.is_a?(Integer)
        SimpleFieldComparator.new(index, SortField::SortType::INTEGER)
      elsif sample.is_a?(Float)
        SimpleFieldComparator.new(index, SortField::SortType::FLOAT)
      else
        raise "unknown data type in field '#{field}'. Data = #{sample}"
      end
    end
  end
end
|