ferret 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
module Ferret::Search
|
2
|
+
class SloppyPhraseScorer < PhraseScorer
|
3
|
+
|
4
|
+
# Creates a sloppy phrase scorer.
#
# weight, tps, positions, similarity, norms:: forwarded unchanged to the
#                                             superclass constructor
# slop::  kept in @slop; phrase_freq only scores windows whose match
#         length does not exceed this value
def initialize(weight, tps, positions, similarity, slop, norms)
  super(weight, tps, positions, similarity, norms)
  @slop = slop
end
|
8
|
+
|
9
|
+
# Computes the sloppy phrase frequency from the phrase positions yielded
# by +each+.
#
# The queue @pq is rebuilt from those positions. Each round pops the top
# entry, advances it while its position does not exceed the position of
# the new top, and scores the window between the last such position and
# the furthest position seen so far. The rounds stop once an entry has no
# further positions.
#
# returns:: the sum of @similarity.sloppy_freq(match_length) over every
#           window whose match_length is <= @slop (0.0 when there is none)
def phrase_freq
  @pq.clear
  furthest = 0
  each do |positions|
    positions.first_position
    furthest = positions.position if positions.position > furthest
    @pq.push(positions) # build pq from list
  end

  total = 0.0
  exhausted = false
  until exhausted
    current = @pq.pop
    cursor = window_start = current.position
    limit = @pq.top.position
    while cursor <= limit
      window_start = cursor # advance current to min window
      unless current.next_position
        exhausted = true # ran out of a term -- done
        break
      end
      cursor = current.position
    end

    match_length = furthest - window_start
    total += @similarity.sloppy_freq(match_length) if match_length <= @slop # score match

    furthest = current.position if current.position > furthest
    @pq.push(current) # restore pq
  end

  total
end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
module Ferret::Search
|
2
|
+
# Encapsulates sort criteria for returned hits.
|
3
|
+
#
|
4
|
+
# The fields used to determine sort order must be carefully chosen.
|
5
|
+
# Documents must contain a single term in such a field, and the value of the
|
6
|
+
# term should indicate the document's relative position in a given sort
|
7
|
+
# order. The field must be indexed, but should not be tokenized, and does
|
8
|
+
# not need to be stored (unless you happen to want it back with the rest of
|
9
|
+
# your document data). In other words:
|
10
|
+
#
|
11
|
+
# document << Field.new("by_number",
|
12
|
+
# x.to_s,
|
13
|
+
# Field::Store::NO,
|
14
|
+
# Field::Index::UN_TOKENIZED))
|
15
|
+
#
|
16
|
+
#
|
17
|
+
# === Valid Types of Values
|
18
|
+
#
|
19
|
+
# There are three possible kinds of term values which may be put into
|
20
|
+
# sorting fields: Integers, Floats, or Strings. Unless SortField objects
|
21
|
+
# are specified, the type of value in the field is determined by parsing the
|
22
|
+
# first term in the field.
|
23
|
+
#
|
24
|
+
# Integer term values should contain only digits and an optional preceding
|
25
|
+
# negative sign. Values must be base 10. Documents which should appear
|
26
|
+
# first in the sort should have low value integers, later documents high
|
27
|
+
# values (i.e. the documents should be numbered +1..n+ where +1+ is the
|
28
|
+
# first and +n+ the last).
|
29
|
+
#
|
30
|
+
# Float term values should conform to values accepted by String#to_f.
|
31
|
+
# Documents which should appear first in the sort should have low values,
|
32
|
+
# later documents high values.
|
33
|
+
#
|
34
|
+
# String term values can contain any valid String, but should not be
|
35
|
+
# tokenized. The values are sorted according to their Comparable natural
|
36
|
+
# order. Note that using this type of term value has higher memory
|
37
|
+
# requirements than the other two types.
|
38
|
+
#
|
39
|
+
# === Object Reuse
|
40
|
+
#
|
41
|
+
# One of these objects can be used multiple times and the sort order changed
|
42
|
+
# between usages.
|
43
|
+
#
|
44
|
+
# This class is thread safe.
|
45
|
+
#
|
46
|
+
# === Memory Usage
|
47
|
+
#
|
48
|
+
# Sorting uses caches of term values maintained by the internal HitQueue(s).
|
49
|
+
# The cache is static and contains an integer or float array of length
|
50
|
+
# +IndexReader#max_doc+ for each field name for which a sort is performed.
|
51
|
+
# In other words, the size of the cache in bytes is:
|
52
|
+
#
|
53
|
+
# 4 * IndexReader#max_doc * (# of different fields actually used to sort)
|
54
|
+
#
|
55
|
+
# For String fields, the cache is larger: in addition to the above array,
|
56
|
+
# the value of every term in the field is kept in memory. If there are many
|
57
|
+
# unique terms in the field, this could be quite large.
|
58
|
+
#
|
59
|
+
# Note that the size of the cache is not affected by how many fields are in
|
60
|
+
# the index and _might_ be used to sort - only by the ones actually used to
|
61
|
+
# sort a result set.
|
62
|
+
#
|
63
|
+
# The cache is cleared each time a new +IndexReader+ is passed in, or if the
|
64
|
+
# value returned by +max_doc()+ changes for the current IndexReader. This
|
65
|
+
# class is not set up to be able to efficiently sort hits from more than one
|
66
|
+
# index simultaneously.
|
67
|
+
class Sort
|
68
|
+
|
69
|
+
attr_accessor :fields
|
70
|
+
|
71
|
+
# Sorts by computed relevance. You can pass a string representing the name
|
72
|
+
# of the field you want to sort on, a SortField, or an array of either
|
73
|
+
# (but not a mixed array). If you pass a string or an array of strings
|
74
|
+
# you can also pass a reverse flag. If you pass a SortField the reverse is
|
75
|
+
# handled by it.
|
76
|
+
#
|
77
|
+
# fields:: The fields you want to sort on. See also SortField
|
78
|
+
# reverse:: pass true if you want the sort order to be reversed. Only
|
79
|
+
# works if you pass the field names.
|
80
|
+
# Builds the sort criteria.
#
# fields::  a single value or an Array; a single value is wrapped in an
#           Array. When the first element is a String, every element is
#           turned into an AUTO-typed SortField, and SortField::FIELD_DOC
#           is appended if only one field was given. Otherwise the list is
#           stored as passed.
# reverse:: handed to each SortField created from a String; not used for
#           any other kind of element
def initialize(fields = [SortField::FIELD_SCORE, SortField::FIELD_DOC],
               reverse = false)
  field_list = fields.is_a?(Array) ? fields : [fields]
  @fields = field_list
  return unless field_list[0].is_a?(String)

  @fields = field_list.map do |field_name|
    SortField.new(field_name, {:sort_type => SortField::SortType::AUTO,
                               :reverse => reverse})
  end
  @fields << SortField::FIELD_DOC if @fields.size == 1
end
|
92
|
+
|
93
|
+
# Represents sorting by computed relevance. Using this sort criteria returns
|
94
|
+
# the same results as calling Searcher#search(Query)
|
95
|
+
# without a sort criteria, only with slightly more overhead.
|
96
|
+
RELEVANCE = Sort.new()
|
97
|
+
|
98
|
+
# Represents sorting by index order.
|
99
|
+
INDEX_ORDER = Sort.new(SortField::FIELD_DOC)
|
100
|
+
|
101
|
+
# Returns the string forms of all entries in @fields, joined with ", ".
def to_s
  @fields.map { |field| field.to_s }.join(", ")
end
|
104
|
+
end
|
105
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Ferret::Search
|
2
|
+
# Abstract base class for sorting hits returned by a Query.
|
3
|
+
#
|
4
|
+
# This class should only be used if the other SortField types (SCORE, DOC,
|
5
|
+
# STRING, INT, FLOAT) do not provide an adequate sorting. It maintains an
|
6
|
+
# internal cache of values which could be quite large. The cache is an
|
7
|
+
# array of Comparable, one for each document in the index. There is a
|
8
|
+
# distinct Comparable for each unique term in the field - if some documents
|
9
|
+
# have the same term in the field, the cache array will have entries which
|
10
|
+
# reference the same Comparable.
|
11
|
+
#
|
12
|
+
# Author:: Tim Jones
|
13
|
+
class SortComparator
|
14
|
+
|
15
|
+
# Creates a comparator for the field in the given index.
|
16
|
+
#
|
17
|
+
# reader:: Index to create comparator for.
|
18
|
+
# field_name:: Field to create comparator for.
|
19
|
+
# returns:: Comparator of ScoreDoc objects.
|
20
|
+
# Creates a comparator for the field in the given index.
#
# The values come from FieldCache::DEFAULT.custom(reader, field_name, self).
# A fresh ScoreDocComparator is given singleton methods that look up the
# cached value by the +doc+ of the objects they are handed.
#
# reader::     Index to create comparator for.
# field_name:: Field to create comparator for.
# returns::    Comparator of ScoreDoc objects.
def new_comparator(reader, field_name)
  # Fixed: the lookup previously referenced an undefined local +field+.
  cached_values = FieldCache::DEFAULT.custom(reader, field_name, self)

  score_doc_comparator = ScoreDocComparator.new

  class << score_doc_comparator
    # Fixed: the writer was declared as +cache_values+, which matched
    # neither the assignment below nor the @cached_values reads.
    attr_writer :cached_values

    # Orders two hits by the cached values stored under their doc numbers.
    def compare(i, j)
      @cached_values[i.doc] <=> @cached_values[j.doc]
    end

    # The cached value stored under the hit's doc number.
    def sort_value(i)
      @cached_values[i.doc]
    end

    # NOTE(review): CUSTOM is not among the SortType constants declared in
    # sort_field.rb as shipped in this version -- confirm it is defined.
    def sort_type
      SortField::SortType::CUSTOM
    end
  end

  score_doc_comparator.cached_values = cached_values
  score_doc_comparator
end
|
42
|
+
|
43
|
+
# Returns an object which, when sorted according to natural order, will
|
44
|
+
# order the Term values in the correct order. For example, if the Terms
|
45
|
+
# contained integer values, this method would return +term_text.to_i+.
|
46
|
+
# Note that this might not always be the most efficient implementation -
|
47
|
+
# for this particular example, a better implementation might be to make a
|
48
|
+
# ScoreDocLookupComparator that uses an internal lookup table of int.
|
49
|
+
#
|
50
|
+
# term_text:: The textual value of the term.
|
51
|
+
#
|
52
|
+
# returns:: An object representing +term_text+ that sorts according to the
|
53
|
+
# natural order of +term_text+.
|
54
|
+
#
|
55
|
+
# See ScoreDocComparator
|
56
|
+
# Abstract hook: this base implementation always raises.
#
# term_text:: The textual value of the term.
# raises::    NotImplementedError, unconditionally
def get_comparable(term_text)
  raise(NotImplementedError)
end
|
59
|
+
end
|
60
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module Ferret::Search
|
2
|
+
|
3
|
+
# Stores information about how to sort documents by terms in an individual
|
4
|
+
# field. Fields must be indexed in order to sort by them.
|
5
|
+
class SortField
|
6
|
+
class SortType < Ferret::Utils::Parameter
|
7
|
+
attr_reader :parser, :comparator
|
8
|
+
|
9
|
+
# Creates a new SortType. A SortType is used to specify how a field is
|
10
|
+
# sorted in a document. Each SortType *MUST* have a unique name. This is
|
11
|
+
# because the SortType object is used to cache a fields values for a
|
12
|
+
# particular reader, so each SortType should be created once only and
|
13
|
+
# stored in a constant. See the standard SortTypes stored here for
|
14
|
+
# example.
|
15
|
+
# name::       passed to the superclass constructor
# parser::     callable stored in @parser; defaults to a lambda that
#              returns its argument unchanged
# comparator:: stored in @comparator; defaults to nil
def initialize(name, parser = lambda { |text| text }, comparator = nil)
  super(name)
  @parser, @comparator = parser, comparator
end
|
20
|
+
|
21
|
+
# Sort by document score (relevancy). Sort values are Float and higher
|
22
|
+
# values are at the front.
|
23
|
+
SCORE = SortType.new("score")
|
24
|
+
|
25
|
+
# Sort by document number (order). Sort values are Integer and lower
|
26
|
+
# values are at the front.
|
27
|
+
DOC = SortType.new("doc")
|
28
|
+
|
29
|
+
# Guess sort type of sort based on field contents. We try parsing the
|
30
|
+
# field as an integer and then as a floating point number. If we are
|
31
|
+
# unsuccessful, the field is parsed as a plain string.
|
32
|
+
AUTO = SortType.new("auto")
|
33
|
+
|
34
|
+
# Sort using term values as Strings. Sort values are String and lower
|
35
|
+
# values are at the front.
|
36
|
+
STRING = SortType.new("string")
|
37
|
+
|
38
|
+
# Sort using term values as encoded Integers. Sort values are Integer
|
39
|
+
# and lower values are at the front.
|
40
|
+
INT = SortType.new("int", lambda{|str| str.to_i})
|
41
|
+
|
42
|
+
# Sort using term values as encoded Floats. Sort values are Float and
|
43
|
+
# lower values are at the front.
|
44
|
+
FLOAT = SortType.new("float", lambda{|str| str.to_f})
|
45
|
+
end
|
46
|
+
|
47
|
+
attr_reader :name, :sort_type, :comparator
|
48
|
+
|
49
|
+
# Returns the value of @reverse.
def reverse?
  @reverse
end
|
52
|
+
|
53
|
+
# Creates a SortField which specifies which field the data is sorted on
|
54
|
+
# and how that field is sorted. See SortType.
|
55
|
+
#
|
56
|
+
# name:: Name of field to sort by. Can be +nil+ if +sort_type+ is SCORE or
|
57
|
+
# DOC.
|
58
|
+
#
|
59
|
+
# A hash with the following values can also be supplied:
|
60
|
+
# sort_type:: Type of values in the terms.
|
61
|
+
# reverse:: True if natural order should be reversed.
|
62
|
+
# comparator:: a proc used to compare two values from the index. You can
|
63
|
+
# also give this value to the SortType object that you pass.
|
64
|
+
# name:: field name, stored in @name
# args:: option hash; :sort_type falls back to SortType::AUTO, :reverse to
#        false, and :comparator to the comparator of the chosen sort type
#
# raises:: ArgumentError when no name is given and the sort type is
#          neither SortType::DOC nor SortType::SCORE
def initialize(name = nil, args= {})
  @name = name
  @sort_type = args[:sort_type] || SortType::AUTO
  @reverse = args[:reverse] || false
  @comparator = args[:comparator] || @sort_type.comparator

  if @name == nil
    unless @sort_type == SortType::DOC || @sort_type == SortType::SCORE
      raise ArgumentError, "You must supply a field name for your sort field"
    end
  end
end
|
74
|
+
|
75
|
+
# Represents sorting by document score (relevancy).
|
76
|
+
FIELD_SCORE = SortField.new(nil, {:sort_type => SortType::SCORE})
|
77
|
+
|
78
|
+
# Represents sorting by document number (order).
|
79
|
+
FIELD_DOC = SortField.new(nil, {:sort_type => SortType::DOC})
|
80
|
+
|
81
|
+
# Returns the field name in double quotes, or "<sort_type>" in double
# quotes when @name is not set, followed by '!' when @reverse is set.
def to_s
  label = @name || "<#{@sort_type}>"
  text = '"' + label + '"'
  text << '!' if @reverse
  text
end
|
86
|
+
end
|
87
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
$:.unshift File.dirname(__FILE__)
|
2
|
+
|
3
|
+
require 'spans/spans_enum.rb'
|
4
|
+
require 'spans/near_spans_enum.rb'
|
5
|
+
require 'spans/span_query.rb'
|
6
|
+
require 'spans/span_first_query.rb'
|
7
|
+
require 'spans/span_near_query.rb'
|
8
|
+
require 'spans/span_not_query.rb'
|
9
|
+
require 'spans/span_or_query.rb'
|
10
|
+
require 'spans/span_scorer.rb'
|
11
|
+
require 'spans/span_term_query.rb'
|
12
|
+
require 'spans/span_weight.rb'
|
@@ -0,0 +1,304 @@
|
|
1
|
+
module Ferret::Search::Spans
|
2
|
+
class NearSpansEnum < SpansEnum
|
3
|
+
|
4
|
+
class CellQueue < Ferret::Utils::PriorityQueue
|
5
|
+
# Orders two cells by doc, then start, then finish. When all three are
# equal, the cell with the larger index counts as the lesser one.
def less_than(o1, o2)
  return o1.doc < o2.doc unless o1.doc == o2.doc
  return o1.start < o2.start unless o1.start == o2.start
  return o1.finish < o2.finish unless o1.finish == o2.finish
  o1.index > o2.index
end
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
# Wraps a SpansEnum, and can be used to form a linked list.
|
24
|
+
class SpansCell < SpansEnum
|
25
|
+
attr_accessor :next, :index
|
26
|
+
|
27
|
+
# parent:: object whose total_length and max are updated by next?/skip_to
# spans::  the wrapped spans enumeration
# index::  stored in @index
#
# @length starts at -1, the value next?/skip_to treat as "no length has
# been added to the parent's total yet".
def initialize(parent, spans, index)
  @parent, @spans, @index = parent, spans, index
  @length = -1
end
|
33
|
+
|
34
|
+
# Moves the wrapped spans forward with next? and keeps the parent's
# bookkeeping in step: the previous length is removed from
# @parent.total_length, and on success the new length is added and this
# cell becomes @parent.max if it lies in a later doc, or in the same doc
# with a later finish.
#
# returns:: the result of @spans.next?
def next?
  @parent.total_length -= @length unless @length == -1 # subtract old length

  advanced = @spans.next? # move to next
  return advanced unless advanced

  @length = finish() - start() # compute new length
  @parent.total_length += @length # add new length to total

  if @parent.max.nil? || doc() > @parent.max.doc || # maintain max
     (doc() == @parent.max.doc && finish() > @parent.max.finish)
    @parent.max = self
  end

  advanced
end
|
53
|
+
|
54
|
+
# Same bookkeeping as next?, but the wrapped spans are moved with
# skip_to(target) instead.
#
# target::  passed to @spans.skip_to
# returns:: the result of @spans.skip_to(target)
def skip_to(target)
  @parent.total_length -= @length unless @length == -1 # subtract old length

  advanced = @spans.skip_to(target) # skip
  return advanced unless advanced

  @length = finish() - start() # compute new length
  @parent.total_length += @length # add new length to total

  if @parent.max == nil || doc() > @parent.max.doc || # maintain max
     (doc() == @parent.max.doc && finish() > @parent.max.finish)
    @parent.max = self
  end

  advanced
end
|
73
|
+
|
74
|
+
# doc, start and finish are read straight from the wrapped spans.
def doc
  @spans.doc
end

def start
  @spans.start
end

def finish
  @spans.finish
end

# The wrapped spans' string form, a '#', and this cell's index.
def to_s
  "#{@spans}##{@index}"
end
|
79
|
+
end
|
80
|
+
|
81
|
+
attr_accessor :total_length, :max
|
82
|
+
|
83
|
+
# query::  read for slop, in_order? and clauses
# reader:: passed to each clause's spans method
#
# One SpansCell is created per clause, in clause order, and kept in
# @ordered. The linked list (@first/@last) stays empty here.
def initialize(query, reader)
  @first = nil                  # linked list of spans
  @last = nil                   # sorted by doc only

  @total_length = 0             # sum of current lengths
  @max = nil                    # max element in queue

  @more = true                  # true iff not done
  @first_time = true            # true before first next?

  @query = query
  @slop = query.slop
  @in_order = query.in_order?

  clauses = query.clauses       # initialize spans & list
  @queue = CellQueue.new(clauses.length) # sorted queue of spans
  @ordered = []                 # spans in query order
  0.upto(clauses.length - 1) do |i|
    # construct clause spans and add to ordered
    @ordered << SpansCell.new(self, clauses[i].spans(reader), i)
  end
end
|
110
|
+
|
111
|
+
# Advances to the next match.
#
# The first call builds the list and the queue. Later calls advance the
# minimum cell. The loop then runs while @more holds: it moves the first
# list cell up to the last one's doc until both share a doc, rebuilds the
# queue if it went stale, and returns true as soon as at_match? holds.
#
# returns:: true when positioned at a match, false when @more became false
def next?
  if @first_time
    init_list(true)
    list_to_queue # initialize queue
    @first_time = false
  elsif @more
    @more = min.next? # trigger further scanning
    @queue.adjust_top if @more # maintain queue
  end

  while @more
    queue_stale = false

    unless min.doc == @max.doc # maintain list
      queue_to_list
      queue_stale = true
    end

    # skip to doc w/ all clauses
    while @more && @first.doc < @last.doc
      @more = @first.skip_to(@last.doc) # skip first upto last
      first_to_last # and move it to the end
      queue_stale = true
    end

    return false unless @more

    # found doc w/ all clauses
    if queue_stale # maintain the queue
      list_to_queue
      queue_stale = false
    end

    return true if at_match?()

    # trigger further scanning
    if @in_order && check_slop?()
      # There is a non ordered match within slop and an ordered match is needed.
      @more = first_non_ordered_next_to_partial_list
      partial_list_to_queue if @more
    else
      @more = min.next?
      @queue.adjust_top if @more # maintain queue
    end
  end

  false # no more matches
end
|
164
|
+
|
165
|
+
# Yields every cell of the linked list, starting at @first and following
# each cell's +next+ until it is nil/false.
def each
  node = @first
  until !node
    yield(node)
    node = node.next
  end
end
|
172
|
+
|
173
|
+
# Skips forward to +target+.
#
# On first use the list is initialised without advancing the cells, every
# listed cell is skipped to +target+ (stopping at the first one that
# cannot), and the queue is built if all succeeded. Afterwards the minimum
# cell is skipped repeatedly while its doc is below +target+.
#
# returns:: true when at_match? holds after skipping, otherwise the result
#           of next?; false when @more is false
def skip_to(target)
  if @first_time # initialize
    init_list(false)
    each do |cell|
      @more = cell.skip_to(target) # skip all
      break unless @more
    end

    list_to_queue if @more
    @first_time = false
  else # normal case
    while @more && min.doc < target # skip as needed
      @more = min.skip_to(target)
      @queue.adjust_top if @more
    end
  end

  return false unless @more
  return true if at_match?() # at a match?

  next?() # no, scan
end
|
200
|
+
|
201
|
+
# The cell currently at the top of the queue.
def min
  @queue.top
end

# doc and start come from the top-of-queue cell, finish from @max.
def doc
  min.doc
end

def start
  min.start
end

def finish
  @max.finish
end
|
206
|
+
|
207
|
+
|
208
|
+
# Returns "spans(<query>)@" followed by "START" before the first advance,
# by "doc:start-finish" while the queue is non-empty, and by "END" once
# the queue is empty.
def to_s
  text = "spans(#{@query})@"
  position =
    if @first_time
      "START"
    elsif @queue.size > 0
      "#{doc}:#{start()}-#{finish}"
    else
      "END"
    end
  text << position
  text
end
|
217
|
+
|
218
|
+
# Appends the cells of @ordered to the linked list, in order, stopping at
# the first cell for which @more is false.
#
# nxt:: when true, each cell is advanced with next? first and the result
#       is stored in @more
def init_list(nxt)
  @ordered.each do |cell|
    @more = cell.next? if nxt
    break unless @more
    add_to_list(cell) # add to list
  end
end
|
228
|
+
|
229
|
+
# Appends +cell+ to the end of the linked list. It becomes @first as well
# when the list was empty, and its +next+ is reset to nil.
def add_to_list(cell)
  if @last.nil?
    @first = cell
  else
    @last.next = cell # add next to end of list
  end
  @last = cell
  cell.next = nil
end
|
238
|
+
|
239
|
+
# Moves the head of the linked list to its tail: the old head becomes
# @last, its successor becomes @first.
def first_to_last
  head = @first
  @last.next = head # move first to end of list
  @last = head
  @first = head.next
  head.next = nil
end
|
245
|
+
|
246
|
+
# Empties the queue into a fresh linked list, in pop order.
def queue_to_list
  @first = @last = nil
  add_to_list(@queue.pop) until @queue.top == nil
end
|
252
|
+
|
253
|
+
# Creates a partial list consisting of the first non ordered cell and all
# cells popped before it, then advances that cell.
#
# Cells are popped from the queue and appended to a fresh list for as long
# as their index continues the sequence 0, 1, 2, ... The first cell that
# breaks the sequence is still appended, and its next? result is returned.
#
# returns:: the result of next? on the first non ordered cell
# raises::  RuntimeError if the queue runs empty without any cell being
#           out of order
def first_non_ordered_next_to_partial_list
  @last = @first = nil
  ordered_index = 0
  while @queue.top != nil
    cell = @queue.pop
    add_to_list(cell)
    # FIXME: continue here, rename to eg. checkOrderedMatch():
    # when check_slop?() and not ordered, repeat cell.next?().
    # when check_slop?() and ordered, add to list and repeat queue.pop()
    # without check_slop?(): no match, rebuild the queue from the partial list.
    # When queue is empty and check_slop?() and ordered there is a match.
    return cell.next? unless cell.index == ordered_index
    ordered_index += 1
  end
  # Fixed: RuntimeException is not a Ruby class, so this line used to fail
  # with a NameError instead of raising the intended error.
  raise RuntimeError, "Unexpected: ordered"
end
|
274
|
+
|
275
|
+
# Clears the queue and refills it from the linked list.
def list_to_queue
  @queue.clear # rebuild queue
  partial_list_to_queue
end
|
279
|
+
|
280
|
+
# Pushes every cell yielded by +each+ onto the queue, without clearing
# the queue first.
def partial_list_to_queue
  each do |cell|
    @queue.push(cell) # add to queue from list
  end
end
|
283
|
+
|
284
|
+
# True when the minimum cell and @max are in the same doc, check_slop?
# holds, and - if @in_order is set - match_is_ordered? holds too.
def at_match?
  return false unless min.doc == @max.doc
  return false unless check_slop?()
  !@in_order || match_is_ordered?()
end
|
288
|
+
|
289
|
+
# True when the distance from the minimum cell's start to @max's finish,
# minus @total_length, does not exceed @slop.
def check_slop?
  span_width = @max.finish - min.start
  span_width - @total_length <= @slop
end
|
293
|
+
|
294
|
+
# True when the start values of the cells in @ordered are strictly
# increasing (checked against an initial value of -1).
def match_is_ordered?
  previous = -1
  @ordered.each do |cell|
    current = cell.start
    return false if current <= previous
    previous = current
  end
  true
end
|
303
|
+
end
|
304
|
+
end
|