ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,60 +0,0 @@
|
|
1
|
-
module Ferret::Search
  # Abstract base class for sorting hits returned by a Query.
  #
  # This class should only be used if the other SortField types (SCORE, DOC,
  # STRING, INTEGER, FLOAT) do not provide an adequate sorting. It maintains
  # an internal cache of values which could be quite large. The cache is an
  # array of Comparable, one for each document in the index. There is a
  # distinct Comparable for each unique term in the field - if some documents
  # have the same term in the field, the cache array will have entries which
  # reference the same Comparable.
  #
  # Subclasses must override #get_comparable; the implementation here only
  # raises NotImplementedError.
  #
  # Author:: Tim Jones
  class SortComparator

    # Creates a comparator for the field in the given index.
    #
    # The per-document values are obtained from
    # FieldCache::DEFAULT.custom(reader, field_name, self) and handed to a
    # new ScoreDocComparator whose singleton class is given +compare+,
    # +sort_value+ and +sort_type+ implementations backed by those values.
    #
    # reader::     Index to create comparator for.
    # field_name:: Field to create comparator for.
    # returns::    Comparator of ScoreDoc objects.
    def new_comparator(reader, field_name)
      # The cache is keyed on the +field_name+ parameter; there is no local
      # called +field+ in this method.
      cached_values = FieldCache::DEFAULT.custom(reader, field_name, self)

      score_doc_comparator = ScoreDocComparator.new

      class << score_doc_comparator
        # The writer name has to match both the assignment made below and the
        # @cached_values instance variable read by the methods that follow.
        attr_writer :cached_values

        # Orders two hits by their cached values, looked up by document
        # number (+i.doc+ / +j.doc+).
        def compare(i, j)
          @cached_values[i.doc] <=> @cached_values[j.doc]
        end

        # Returns the cached value for the hit's document.
        def sort_value(i)
          @cached_values[i.doc]
        end

        # NOTE(review): SortField::SortType as declared in sort_field.rb
        # lists SCORE, DOC, AUTO, STRING, INTEGER and FLOAT only; confirm
        # that CUSTOM is defined somewhere before relying on this method.
        def sort_type
          SortField::SortType::CUSTOM
        end
      end

      score_doc_comparator.cached_values = cached_values
      score_doc_comparator
    end

    # Returns an object which, when sorted according to natural order, will
    # order the Term values in the correct order. For example, if the Terms
    # contained integer values, this method would return +term_text.to_i+.
    # Note that this might not always be the most efficient implementation -
    # for this particular example, a better implementation might be to make a
    # ScoreDocLookupComparator that uses an internal lookup table of int.
    #
    # term_text:: The textual value of the term.
    #
    # returns:: An object representing +term_text+ that sorts according to the
    #           natural order of +term_text+.
    # raises::  NotImplementedError unless overridden by a subclass.
    #
    # See ScoreDocComparator
    def get_comparable(term_text)
      raise NotImplementedError
    end
  end
end
|
@@ -1,91 +0,0 @@
|
|
1
|
-
module Ferret::Search

  # Describes how documents are sorted by the terms of one field: which
  # field, which SortType, whether the order is reversed, and an optional
  # comparator. Fields must be indexed in order to sort by them.
  class SortField
    class SortType < Ferret::Utils::Parameter
      attr_reader :parser, :comparator

      # Creates a new SortType. A SortType is used to specify how a field is
      # sorted in a document. Each SortType *MUST* have a unique name. This is
      # because the SortType object is used to cache a field's values for a
      # particular reader, so each SortType should be created once only and
      # stored in a constant. See the standard SortTypes stored here for
      # examples.
      #
      # name::       Unique name, passed on to the Parameter superclass.
      # parser::     Callable applied to a term string; defaults to identity.
      # comparator:: Optional comparator; defaults to +nil+.
      def initialize(name, parser = lambda { |str| str }, comparator = nil)
        super(name)
        @parser, @comparator = parser, comparator
      end

      # Sort by document score (relevancy). Sort values are Float and higher
      # values are at the front.
      SCORE = SortType.new("SCORE")

      # Sort by document number (order). Sort values are Integer and lower
      # values are at the front.
      DOC = SortType.new("DOC")

      # Guess the sort type from the field contents. We try parsing the
      # field as an integer and then as a floating point number. If we are
      # unsuccessful, the field is parsed as a plain string.
      AUTO = SortType.new("auto")

      # Sort using term values as Strings. Sort values are String and lower
      # values are at the front.
      STRING = SortType.new("string")

      # Sort using term values as encoded Integers. Sort values are Integer
      # and lower values are at the front.
      INTEGER = SortType.new("integer", lambda { |str| str.to_i })

      # Sort using term values as encoded Floats. Sort values are Float and
      # lower values are at the front.
      FLOAT = SortType.new("float", lambda { |str| str.to_f })
    end

    attr_reader :name, :sort_type, :comparator

    # True when the natural order should be reversed.
    def reverse?
      @reverse
    end

    # Creates a SortField which specifies which field the data is sorted on
    # and how that field is sorted. See SortType.
    #
    # name:: Name of field to sort by. Can be +nil+ if +sort_type+ is SCORE or
    #        DOC.
    #
    # An options hash with the following values can also be supplied;
    # sort_type::  Type of values in the terms. Defaults to SortType::AUTO.
    # reverse::    True if natural order should be reversed. Defaults to false.
    # comparator:: A proc used to compare two values from the index. You can
    #              also give this value to the SortType object that you pass;
    #              the SortType's comparator is the fallback.
    #
    # Raises ArgumentError when no name is given and the sort type is neither
    # DOC nor SCORE.
    def initialize(name = nil, options = {})
      @name = name.to_s if name
      @sort_type = options[:sort_type] || SortType::AUTO
      @reverse = options[:reverse] || false
      @comparator = options[:comparator] || @sort_type.comparator

      # Only DOC and SCORE sorting can work without a field name.
      return if @name || @sort_type == SortType::DOC ||
                @sort_type == SortType::SCORE
      raise ArgumentError, "You must supply a field name for your sort field"
    end

    # Represents sorting by document score (relevancy).
    FIELD_SCORE = SortField.new(nil, {:sort_type => SortType::SCORE})

    # Represents sorting by document number (order).
    FIELD_DOC = SortField.new(nil, {:sort_type => SortType::DOC})

    # Renders as "name:<sort_type>" (or just "<sort_type>" when there is no
    # field name), with a trailing "!" when the order is reversed.
    def to_s
      buffer = @name ? "#{@name}:<#{@sort_type}>" : "<#{@sort_type}>"
      buffer << '!' if @reverse
      buffer
    end
  end
end
|
data/lib/ferret/search/spans.rb
DELETED
@@ -1,12 +0,0 @@
|
|
1
|
-
$:.unshift File.dirname(__FILE__)
|
2
|
-
|
3
|
-
require 'spans/spans_enum.rb'
|
4
|
-
require 'spans/near_spans_enum.rb'
|
5
|
-
require 'spans/span_query.rb'
|
6
|
-
require 'spans/span_first_query.rb'
|
7
|
-
require 'spans/span_near_query.rb'
|
8
|
-
require 'spans/span_not_query.rb'
|
9
|
-
require 'spans/span_or_query.rb'
|
10
|
-
require 'spans/span_scorer.rb'
|
11
|
-
require 'spans/span_term_query.rb'
|
12
|
-
require 'spans/span_weight.rb'
|
@@ -1,304 +0,0 @@
|
|
1
|
-
module Ferret::Search::Spans
  # Enumerates the matches of a "near" span query. It holds one SpansCell
  # per query clause and reports a match when all cells are on the same
  # document, the combined span is within the query's slop (see
  # #check_slop?) and, if the query asks for it, the clauses start in
  # query order (see #match_is_ordered?).
  #
  # The cells are kept in three structures at once:
  # * @ordered - the cells in query (clause) order,
  # * a linked list (@first .. @last) used while skipping between documents,
  # * @queue   - a CellQueue whose top is the current minimum cell.
  class NearSpansEnum < SpansEnum

    # Queue of SpansCells compared by doc, then start, then finish. When all
    # three are equal the cell with the larger clause index is treated as
    # the lesser one.
    class CellQueue < Ferret::Utils::PriorityQueue
      def less_than(o1, o2)
        return o1.doc < o2.doc if o1.doc != o2.doc
        return o1.start < o2.start if o1.start != o2.start
        return o1.finish < o2.finish if o1.finish != o2.finish
        o1.index > o2.index
      end
    end


    # Wraps a SpansEnum, and can be used to form a linked list. Every move
    # keeps the parent's +total_length+ and +max+ up to date.
    class SpansCell < SpansEnum
      attr_accessor :next, :index

      # parent:: The NearSpansEnum whose total_length/max this cell updates.
      # spans::  The wrapped SpansEnum of a single clause.
      # index::  Position of the clause within the query.
      def initialize(parent, spans, index)
        @parent = parent
        @spans = spans
        @index = index
        @length = -1           # -1 means "no length added to the parent yet"
      end

      # Moves the wrapped spans to their next position. Returns whatever the
      # wrapped +next?+ returned.
      def next?
        move_and_track { @spans.next? }
      end

      # Skips the wrapped spans to +target+. Returns whatever the wrapped
      # +skip_to+ returned.
      def skip_to(target)
        move_and_track { @spans.skip_to(target) }
      end

      def doc() @spans.doc end
      def start() @spans.start end
      def finish() @spans.finish end

      def to_s
        "#{@spans}##{@index}"
      end

      private

      # Shared bookkeeping for #next? and #skip_to. The block performs the
      # actual move on the wrapped spans and returns its "more" flag.
      def move_and_track
        # subtract old length
        @parent.total_length -= @length if @length != -1

        more = yield

        if more
          @length = finish - start             # compute new length
          @parent.total_length += @length      # add new length to total

          # maintain max: latest doc wins, then the furthest finish
          max = @parent.max
          if max.nil? || doc > max.doc ||
             (doc == max.doc && finish > max.finish)
            @parent.max = self
          end
        end

        more
      end
    end

    attr_accessor :total_length, :max

    # query::  The near query; must respond to +slop+, +in_order?+ and
    #          +clauses+, and each clause must respond to +spans(reader)+.
    # reader:: Passed through to every clause's +spans+ call.
    def initialize(query, reader)
      @ordered = []            # spans in query order

      @first = nil             # linked list of spans
      @last = nil              # sorted by doc only

      @total_length = 0        # sum of current lengths

      @queue = nil             # sorted queue of spans
      @max = nil               # max element in queue

      @more = true             # true iff not done
      @first_time = true       # true before first next?

      @query = query
      @slop = query.slop
      @in_order = query.in_order?

      clauses = query.clauses  # initialize spans & list
      @queue = CellQueue.new(clauses.length)
      clauses.length.times do |i|
        # construct clause spans and add them in query order
        @ordered << SpansCell.new(self, clauses[i].spans(reader), i)
      end
    end

    # Advances to the next match. Returns true when positioned on a match,
    # false when there are no more matches.
    def next?
      if @first_time
        init_list(true)
        list_to_queue                      # initialize queue
        @first_time = false
      elsif @more
        @more = min.next?                  # trigger further scanning
        @queue.adjust_top if @more         # maintain queue
      end

      while @more
        queue_stale = false

        if min.doc != @max.doc             # maintain list
          queue_to_list
          queue_stale = true
        end

        # skip to doc w/ all clauses
        while @more && @first.doc < @last.doc
          @more = @first.skip_to(@last.doc)  # skip first upto last
          first_to_last                      # and move it to the end
          queue_stale = true
        end

        return false unless @more

        # found doc w/ all clauses
        list_to_queue if queue_stale       # maintain the queue

        return true if at_match?

        # trigger further scanning
        if @in_order && check_slop?
          # There is a non ordered match within slop and an ordered match is
          # needed.
          @more = first_non_ordered_next_to_partial_list
          partial_list_to_queue if @more
        else
          @more = min.next?
          @queue.adjust_top if @more       # maintain queue
        end
      end

      false                                # no more matches
    end

    # Yields every cell of the linked list, from @first onwards.
    def each
      cell = @first
      while cell
        yield cell
        cell = cell.next
      end
    end

    # Skips to the first match whose document is not before +target+.
    # Returns true when positioned on a match, false otherwise.
    def skip_to(target)
      if @first_time                       # initialize
        init_list(false)
        each do |cell|
          @more = cell.skip_to(target)     # skip all
          break unless @more
        end

        list_to_queue if @more
        @first_time = false
      else                                 # normal case
        while @more && min.doc < target    # skip as needed
          @more = min.skip_to(target)
          @queue.adjust_top if @more
        end
      end

      if @more
        return true if at_match?           # at a match?
        return next?                       # no, scan
      end

      false
    end

    # The cell currently at the top of the queue.
    def min() @queue.top end

    def doc() min.doc end
    def start() min.start end
    def finish() @max.finish end


    def to_s
      buffer = "spans(#{@query})@"
      if @first_time
        buffer << "START"
      else
        buffer << (@queue.size > 0 ? "#{doc}:#{start}-#{finish}" : "END")
      end
      buffer
    end

    # Builds the linked list from the cells in query order. When +nxt+ is
    # true each cell is advanced first; building stops at the first cell
    # for which @more is false.
    def init_list(nxt)
      @ordered.each do |cell|
        @more = cell.next? if nxt
        break unless @more
        add_to_list(cell)                  # add to list
      end
    end

    # Appends +cell+ to the end of the linked list.
    def add_to_list(cell)
      if @last                             # add next to end of list
        @last.next = cell
      else
        @first = cell
      end
      @last = cell
      cell.next = nil
    end

    # Moves the first cell of the linked list to its end.
    def first_to_last
      @last.next = @first                  # move first to end of list
      @last = @first
      @first = @first.next
      @last.next = nil
    end

    # Empties the queue into a freshly started linked list, in pop order.
    def queue_to_list
      @last = @first = nil
      add_to_list(@queue.pop) while @queue.top
    end

    # Creates a partial list consisting of first non ordered and earlier.
    # Returns first non ordered .next?.
    #
    # Raises RuntimeError if every cell popped from the queue was in clause
    # order, which the caller does not expect at this point.
    #
    # FIXME: continue here, rename to eg. checkOrderedMatch():
    # when check_slop?() and not ordered, repeat cell.next?().
    # when check_slop?() and ordered, add to list and repeat queue.pop()
    # without check_slop?(): no match, rebuild the queue from the partial list.
    # When queue is empty and check_slop?() and ordered there is a match.
    def first_non_ordered_next_to_partial_list
      @last = @first = nil
      ordered_index = 0
      while @queue.top
        cell = @queue.pop
        add_to_list(cell)
        return cell.next? if cell.index != ordered_index
        ordered_index += 1
      end
      # RuntimeError is the Ruby class; a "RuntimeException" constant does
      # not exist and would surface as a NameError instead of this message.
      raise RuntimeError, "Unexpected: ordered"
    end

    # Clears the queue and refills it from the linked list.
    def list_to_queue
      @queue.clear                         # rebuild queue
      partial_list_to_queue
    end

    # Pushes every cell of the linked list onto the queue without clearing
    # the queue first.
    def partial_list_to_queue
      each { |cell| @queue.push(cell) }    # add to queue from list
    end

    # True when all cells are on one document, the slop check passes and,
    # for in-order queries, the clauses start in query order.
    def at_match?
      min.doc == @max.doc && check_slop? &&
        (!@in_order || match_is_ordered?)
    end

    # True when the distance from the minimum cell's start to the maximum
    # cell's finish, minus the summed lengths of all cells, is within slop.
    def check_slop?
      match_length = @max.finish - min.start
      (match_length - @total_length) <= @slop
    end

    # True when the cells, taken in query order, have strictly increasing
    # start positions.
    def match_is_ordered?
      last_start = -1
      @ordered.each do |cell|
        cell_start = cell.start
        return false if cell_start <= last_start
        last_start = cell_start
      end
      true
    end
  end
end
|