ferret 0.9.6 → 0.10.0
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
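The shape of this release: the pure-Ruby internals under data/lib/ferret/ are deleted wholesale and replaced by the C sources under data/ext/ (note the new index.c at +5688 lines and r_index.c at +3049), while the Ruby side shrinks to thin wrappers (ferret.rb loses most of its requires; index.rb grows into the convenience layer). The high-level API stays roughly like the sketch below; the exact option and method names are an assumption from Ferret's documentation of this era, not something this diff shows:

    require 'ferret'

    # Sketch only: the Index convenience class wraps the C-backed
    # writer/searcher that replace the removed pure-Ruby files.
    index = Ferret::Index::Index.new(:path => '/tmp/my_index')

    # Documents are plain hashes in the convenience API (assumed).
    index << {:title => 'An Example', :content => 'some searchable text'}

    # search_each yields each matching internal doc id with its score.
    index.search_each('content:searchable') do |doc_id, score|
      puts "document #{doc_id} scored #{score}"
    end

What follows are the deletion diffs for the removed analysis and document classes.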
--- data/lib/ferret/analysis/token_filters.rb (removed)
@@ -1,86 +0,0 @@
-module Ferret::Analysis
-  # A TokenFilter is a TokenStream whose input is another token stream.
-  #
-  # This is an abstract class.
-  class TokenFilter < TokenStream
-    # Close the input TokenStream.
-    def close()
-      @input.close()
-    end
-
-    protected
-      # Construct a token stream filtering the given input.
-      def initialize(input)
-        @input = input
-      end
-  end
-
-  # Normalizes token text to lower case.
-  class LowerCaseFilter < TokenFilter
-    def next()
-      t = @input.next()
-
-      if (t == nil)
-        return nil
-      end
-
-      t.text = t.text.downcase()
-
-      return t
-    end
-  end
-
-  # Removes stop words from a token stream. You will need to pass your own
-  # set of stopwords to use this stop filter. If you wish to use the default
-  # list of stopwords then use the StopAnalyzer.
-  class StopFilter < TokenFilter
-    # Constructs a filter which removes words from the input
-    # TokenStream that are named in the array of words.
-    def initialize(input, stop_set)
-      super(input)
-      @stop_set = stop_set
-    end
-
-    def StopFilter.new_with_file(input, path)
-      ws = WordListLoader.word_set_from_file(path)
-      return StopFilter.new(input, ws)
-    end
-
-    # Returns the next input Token whose text is not a stop word.
-    def next()
-      # return the first non-stop word found
-      while token = @input.next()
-        return token if !@stop_set.include?(token.text)
-      end
-      return nil
-    end
-  end
-
-  # Transforms the token stream as per the Porter stemming algorithm.
-  # Note: the input to the stemming filter must already be in lower case,
-  # so you will need to use LowerCaseFilter or LowerCaseTokenizer further
-  # down the Tokenizer chain in order for this to work properly!
-  #
-  # To use this filter with other analyzers, you'll want to write an
-  # Analyzer class that sets up the TokenStream chain as you want it.
-  # To use this with LowerCaseTokenizer, for example, you'd write an
-  # analyzer like this:
-  #
-  #   class MyAnalyzer < Analyzer
-  #     def token_stream(field, reader)
-  #       return PorterStemFilter.new(LowerCaseTokenizer.new(reader))
-  #     end
-  #   end
-  class PorterStemFilter < TokenFilter
-    # Returns the next input Token, after being stemmed
-    def next()
-      token = @input.next()
-      if (token == nil)
-        return nil
-      else
-        token.text = Stemmable.stem_porter(token.text)
-      end
-      token
-    end
-  end
-end
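For reference, these removed filters composed by wrapping one stream in another. A minimal sketch using only classes from this release's Ruby API (WhiteSpaceTokenizer appears in the tokenizers.rb diff below):

    include Ferret::Analysis

    # tokenize -> lowercase -> drop stop words (pre-0.10 pure-Ruby API)
    stop_set = WordListLoader.word_set_from_array(%w(the a an))
    stream = StopFilter.new(
      LowerCaseFilter.new(WhiteSpaceTokenizer.new("The Quick Brown Fox")),
      stop_set)
    stream.each {|token| puts token.text }   # => quick, brown, fox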
--- data/lib/ferret/analysis/token_stream.rb (removed)
@@ -1,26 +0,0 @@
-module Ferret::Analysis
-  # A TokenStream enumerates the sequence of tokens, either from
-  # fields of a document or from query text.
-  #
-  # This is an abstract class. Concrete subclasses are:
-  # * Tokenizer, a TokenStream whose input is a Reader; and
-  # * TokenFilter, a TokenStream whose input is another TokenStream.
-  class TokenStream
-    # Returns the next token in the stream, or nil at end-of-stream.
-    def next
-      raise NotImplementedError
-    end
-
-    # Releases resources associated with this stream.
-    def close
-      raise NotImplementedError
-    end
-
-    # Iterates through the tokens in the field
-    def each # :yields: token
-      while (n = self.next())
-        yield n
-      end
-    end
-  end
-end
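The contract is small: subclasses supply next and close, and each comes for free. A sketch of a conforming stream (a hypothetical class, never part of the gem):

    include Ferret::Analysis

    # Replays a fixed array of Token objects, e.g. for testing filters.
    class ArrayTokenStream < TokenStream
      def initialize(tokens)
        @tokens = tokens.dup
      end

      # nil at end-of-stream, per the contract above
      def next()
        @tokens.shift
      end

      def close()
        @tokens.clear
      end
    end

    s = ArrayTokenStream.new([Token.new("one", 0, 3), Token.new("two", 4, 7)])
    s.each {|t| puts t.text }   # => one, two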
--- data/lib/ferret/analysis/tokenizers.rb (removed)
@@ -1,112 +0,0 @@
-require 'strscan'
-
-module Ferret::Analysis
-  # A Tokenizer is a TokenStream whose input is a Reader.
-  #
-  # This is an abstract class.
-  class Tokenizer < TokenStream
-    # By default, closes the input Reader.
-    def close()
-      @input.close()
-    end
-
-    protected
-      # Construct a token stream processing the given input.
-      def initialize(input)
-        @input = input
-      end
-  end
-
-  # An abstract base class for simple regular expression oriented
-  # tokenizers. Very powerful tokenizers can be created using this class as
-  # can be seen from the StandardTokenizer class. Below is an example of a
-  # simple implementation of a LetterTokenizer using a RegExpTokenizer.
-  # Basically, a token is a sequence of alphabetic characters separated by
-  # one or more non-alphabetic characters.
-  #
-  #   class LetterTokenizer < RegExpTokenizer
-  #     def token_re()
-  #       /[[:alpha:]]+/
-  #     end
-  #   end
-  class RegExpTokenizer < Tokenizer
-
-    # Initialize with an IO implementing input such as a file.
-    #
-    # input:: must have a read(count) method which returns an array or string
-    #         of _count_ chars.
-    def initialize(input)
-      #@token_buffer = Token.new("", 0, 0)
-      if input.is_a? String
-        @ss = StringScanner.new(input)
-      else
-        @ss = StringScanner.new(input.read())
-      end
-    end
-
-    # Returns the next token in the stream, or nil at end-of-stream.
-    def next()
-      if @ss.scan_until(token_re)
-        term = @ss.matched
-        term_end = @ss.pos
-        term_start = term_end - term.size
-      else
-        return nil
-      end
-
-      #return @token_buffer.set!(normalize(term), term_start, term_end)
-      return Token.new(normalize(term), term_start, term_end)
-    end
-
-    def close()
-      @ss = nil
-    end
-
-    protected
-      # returns the regular expression used to find the next token
-      TOKEN_RE = /[[:alpha:]]+/
-      def token_re
-        TOKEN_RE
-      end
-
-      # Called on each token to normalize it before it is added to the
-      # token stream. The default implementation does nothing. Subclasses
-      # may use this to, e.g., lowercase tokens.
-      def normalize(str) return str end
-  end
-
-
-  # A LetterTokenizer is a tokenizer that divides text at non-letters.
-  # That's to say, it defines tokens as maximal strings of adjacent letters,
-  # as defined by the regular expression _/[[:alpha:]]+/_.
-  class LetterTokenizer < RegExpTokenizer
-    protected
-      # Collects only characters which satisfy the regular expression
-      # _/[[:alpha:]]+/_.
-      TOKEN_RE = /[[:alpha:]]+/
-      def token_re
-        TOKEN_RE
-      end
-  end
-
-  # LowerCaseTokenizer performs the function of LetterTokenizer
-  # and LowerCaseFilter together. It divides text at non-letters and converts
-  # them to lower case.
-  class LowerCaseTokenizer < LetterTokenizer
-    protected
-      def normalize(str)
-        return str.downcase
-      end
-  end
-
-  # A WhiteSpaceTokenizer is a tokenizer that divides text at whitespace.
-  # Adjacent sequences of non-whitespace characters form tokens.
-  class WhiteSpaceTokenizer < RegExpTokenizer
-    protected
-      # Collects only characters which are not spaces, tabs or carriage returns
-      TOKEN_RE = /\S+/
-      def token_re
-        TOKEN_RE
-      end
-  end
-end
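As the RegExpTokenizer comments say, a new tokenizer is one regular expression away. A sketch of a hypothetical subclass that keeps digits as well as letters, so tokens break on punctuation only:

    include Ferret::Analysis

    class AlphanumericTokenizer < RegExpTokenizer
      protected
        # runs of letters or digits form a single token
        TOKEN_RE = /[[:alnum:]]+/
        def token_re
          TOKEN_RE
        end
    end

    AlphanumericTokenizer.new("ferret 0.10.0").each {|t| puts t.text }
    # => ferret, 0, 10, 0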
--- data/lib/ferret/analysis/word_list_loader.rb (removed)
@@ -1,27 +0,0 @@
-require 'set'
-module Ferret::Analysis
-  # Loader for text files that represent a list of stopwords.
-  module WordListLoader
-    # Loads a text file and adds every line as an entry to a Set (omitting
-    # leading and trailing whitespace). Every line of the file should
-    # contain only one word. The words need to be in lowercase if you make
-    # use of an Analyzer which uses LowerCaseFilter.
-    #
-    # path::   path to file containing the wordlist
-    # return:: A Set with the file's words
-    def WordListLoader.word_set_from_file(path)
-      result = Set.new()
-      File.open(path) do |word_file|
-        # we have to strip the end of line characters
-        word_file.each {|line| result << line[0..-2] }
-      end
-      return result
-    end
-
-    def WordListLoader.word_set_from_array(word_array)
-      result = Set.new()
-      word_array.each {|word| result << word }
-      return result
-    end
-  end
-end
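The wordlist format is one lowercase word per line, which is what StopFilter.new_with_file (in the token_filters.rb diff above) expects. A short usage sketch:

    include Ferret::Analysis

    File.open("stopwords.txt", "w") {|f| f.puts "the", "a", "an" }

    stop_set = WordListLoader.word_set_from_file("stopwords.txt")
    stop_set.include?("the")   # => true

    # or hand the file straight to a filter:
    stream = StopFilter.new_with_file(
      LowerCaseTokenizer.new("An Otter and the Ferret"), "stopwords.txt")
    stream.each {|t| puts t.text }   # => otter, and, ferret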
--- data/lib/ferret/document/document.rb (removed)
@@ -1,152 +0,0 @@
-module Ferret::Document
-  # Documents are the unit of indexing and search.
-  #
-  # A Document is a set of fields. Each field has a name and a textual
-  # value. A field may be stored (Field#stored?()) with the document, in
-  # which case it is returned with search hits on the document. Thus each
-  # document should typically contain one or more stored fields which
-  # uniquely identify it.
-  #
-  # Note that fields which are _not_ Field#stored?() are _not_ available in
-  # documents retrieved from the index, e.g. with Hits#doc, Searcher#doc or
-  # IndexReader#document.
-  #
-  # Several fields may be added with the same name. In this case, if the
-  # fields are indexed, their text is treated as though appended for the
-  # purposes of search.
-  #
-  # Note that add, like the remove_field(s) methods, only makes sense
-  # prior to adding a document to an index. These methods cannot be used
-  # to change the content of an existing index! In order to achieve this,
-  # a document has to be deleted from an index and a new changed version
-  # of that document has to be added.
-  class Document
-    attr_accessor :boost
-
-    # Constructs a new document with no fields.
-    def initialize()
-      # Values are multiplied into the value of Field#boost of each field in
-      # this document. Thus, this attribute in effect sets a default boost
-      # for the fields of this document.
-      #
-      # The default value is 1.0.
-      #
-      # Note: This value is not stored directly with the document in the
-      # index. Documents returned from IndexReader#document and Hits#doc
-      # may thus not have the same value present as when this document was
-      # indexed.
-      @boost = 1.0
-      @fields = {}
-    end
-
-    # Returns an array of all fields. Note that it is possible for two
-    # fields to appear with the same field name. These will be concatenated
-    # in the index.
-    def all_fields
-      @fields.values.flatten
-    end
-
-    # Returns the number of distinct fields held within the document. This
-    # counts fields which have multiple entries as one.
-    def field_count()
-      return @fields.size
-    end
-
-    # Returns the number of entries held within the document. This counts
-    # all sections, so for fields which have multiple entries, each entry
-    # is counted.
-    def entry_count()
-      return @fields.values.flatten.size
-    end
-
-    # Adds a field to a document. Several fields may be added with the same
-    # name. In this case, if the fields are indexed, their text is treated
-    # as though appended for the purposes of search.
-    #
-    # Note that add, like the remove_field(s) methods, only makes sense
-    # prior to adding a document to an index. These methods cannot be used
-    # to change the content of an existing index! In order to achieve this,
-    # a document has to be deleted from an index and a new changed version
-    # of that document has to be added.
-    def add_field(field)
-      (@fields[field.name.to_s] ||= []) << field
-    end
-    alias :<< :add_field
-
-    # Removes the first field of this name if it exists.
-    def remove_field(name)
-      @fields[name.to_s].delete_at(0)
-    end
-
-    # Removes all fields with the given name from the document.
-    #
-    # If there is no field with the specified name, the document remains
-    # unchanged.
-    #
-    # Note that the remove_field(s) methods, like the add method, only make
-    # sense prior to adding a document to an index. These methods cannot be
-    # used to change the content of an existing index! In order to achieve
-    # this, a document has to be deleted from an index and a new changed
-    # version of that document has to be added.
-    def remove_fields(name)
-      @fields.delete(name.to_s)
-    end
-
-    # Returns the first field with the given name.
-    # This method can return _nil_.
-    #
-    # name::   the name of the field
-    # Return:: a _Field_
-    def field(name)
-      @fields[name.to_s] ? @fields[name.to_s][0] : nil
-    end
-
-    # Returns an array of all fields with the given name.
-    # This method can return _nil_.
-    #
-    # name::   the name of the field
-    # Return:: a _Field_ array
-    def fields(name)
-      @fields[name.to_s]
-    end
-
-    # Returns the values of the field specified as the method parameter,
-    # joined into a single string. This method can return _nil_.
-    #
-    # name::   the name of the field
-    # Return:: a _String_ of the field's values
-    def values(name)
-      return nil if @fields[name.to_s].nil?
-      @fields[name.to_s].map {|f| f.data if not f.binary? }.join(" ")
-    end
-    alias :[] :values
-
-    # Sets the data in field +field_name+ to +data+. If there is more than
-    # one field of that name then it will set the data in the first field
-    # of that name. If there is no field of that name a new one is created.
-    def []=(field_name, data)
-      field = field(field_name.to_s)
-      if field
-        field.data = data
-      else
-        add_field(Field.new(field_name.to_s, data))
-      end
-    end
-
-    # Returns an array of the binary values of the field specified as the
-    # method parameter.
-    #
-    # name::   the name of the field
-    # Return:: an _Array_ of binary field values
-    def binaries(name)
-      binaries = []
-      @fields[name.to_s].each {|f| binaries << f.data if f.binary? }
-      return binaries
-    end
-
-    # Prints the fields of a document for human consumption.
-    def to_s()
-      return "Document{\n  #{@fields.values.join("\n  ")}\n}"
-    end
-  end
-end
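A sketch of the removed API in use: a Document maps each field name to an array of Fields, which is what the field_count/entry_count distinction above is about (Field itself is defined in the next diff):

    include Ferret::Document

    doc = Document.new
    doc << Field.new("title", "Ferret 0.9", Field::Store::YES,
                     Field::Index::TOKENIZED)
    doc << Field.new("tag", "release", Field::Store::YES,
                     Field::Index::TOKENIZED)
    doc << Field.new("tag", "ruby", Field::Store::YES,
                     Field::Index::TOKENIZED)

    doc.field_count   # => 2  (distinct names: title, tag)
    doc.entry_count   # => 3  (every entry counted)
    doc["tag"]        # => "release ruby"  (values joined with a space)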
--- data/lib/ferret/document/field.rb (removed)
@@ -1,312 +0,0 @@
-module Ferret::Document
-  # A field is a section of a Document. Each field has two parts, a name
-  # and a value. Values may be free text, provided as a String or as a
-  # Reader, or they may be atomic keywords, which are not further processed.
-  # Such keywords may be used to represent dates, URLs, etc. Fields are
-  # optionally stored in the index, so that they may be returned with hits
-  # on the document.
-  class Field
-
-    # This value will be
-    # multiplied into the score of all hits on this field of this
-    # document.
-    #
-    # The boost is multiplied by Document#boost of the document
-    # containing this field. If a document has multiple fields with the same
-    # name, all such values are multiplied together. This product is then
-    # multiplied by the value Similarity#length_norm(String,int), and
-    # rounded by Similarity#encode_norm(float) before it is stored in the
-    # index. One should attempt to ensure that this product does not overflow
-    # the range of that encoding.
-    #
-    # See Document#boost=
-    # See Similarity#length_norm(String, int)
-    # See Similarity#encode_norm(float)
-    #
-    # Note: this value is not stored directly with the document in the index.
-    # Documents returned from IndexReader#document(int) and
-    # Hits#doc(int) may thus not have the same value present as when this field
-    # was indexed.
-    attr_accessor :boost, :data
-    attr_reader :name
-
-    # True iff the value of the field is to be stored in the index for
-    # return with search hits. It is an error for this to be true if a
-    # field is Reader-valued.
-    def stored?() return @stored end
-
-    # True iff the value of the field is to be indexed, so that it may be
-    # searched on.
-    def indexed?() return @indexed end
-
-    # True iff the value of the field should be tokenized as text prior to
-    # indexing. Un-tokenized fields are indexed as a single word and may
-    # not be Reader-valued.
-    def tokenized?() return @tokenized end
-
-    # True if the field is to be stored as a binary value. This can be used
-    # to store images or other binary data in the index if you wish.
-    def binary?() return @binary end
-
-    # True if you want to compress the data that you store. This is a good
-    # idea for really large text fields. The Ruby Zlib library is used to
-    # do the compression.
-    def compressed?() return @compressed end
-
-    # True iff the term or terms used to index this field are stored as a
-    # term vector, available from IndexReader#term_freq_vector(). These
-    # methods do not provide access to the original content of the field,
-    # only to terms used to index it. If the original content must be
-    # preserved, use the _stored_ attribute instead.
-    #
-    # See IndexReader#term_freq_vector()
-    def store_term_vector?() return @store_term_vector end
-
-    # True if the positions of the indexed terms in this field are stored.
-    def store_positions?() return @store_position end
-
-    # True if the offsets of this field are stored. The offsets are the
-    # positions of the start and end characters of the token in the whole
-    # field string.
-    def store_offsets?() return @store_offset end
-
-    # True if the norms are not stored for this field. No norms means that
-    # index-time boosting and field length normalization will be disabled.
-    # The benefit is less memory usage as norms take up one byte per
-    # indexed field for every document in the index.
-    def omit_norms?() return @omit_norms end
-
-    class Store < Ferret::Utils::Parameter
-      # Store the original field value in the index in a compressed form.
-      # This is useful for long documents and for binary valued fields.
-      COMPRESS = Store.new("COMPRESS")
-
-      # Store the original field value in the index. This is useful for
-      # short texts like a document's title which should be displayed with
-      # the results. The value is stored in its original form, i.e. no
-      # analyzer is used before it is stored.
-      YES = Store.new("YES")
-
-      # Do not store the field value in the index.
-      NO = Store.new("NO")
-    end
-
-    class Index < Ferret::Utils::Parameter
-      # Do not index the field value. This field can thus not be searched,
-      # but one can still access its contents provided it is stored
-      # (see Field::Store).
-      NO = Index.new("NO")
-
-      # Index the field's value so it can be searched. An Analyzer will be
-      # used to tokenize and possibly further normalize the text before its
-      # terms will be stored in the index. This is useful for common text.
-      TOKENIZED = Index.new("TOKENIZED")
-
-      # Index the field's value without using an Analyzer, so it can be
-      # searched. As no analyzer is used the value will be stored as a
-      # single term. This is useful for unique Ids like product numbers.
-      UNTOKENIZED = Index.new("UNTOKENIZED")
-
-      # Index the field's value without an Analyzer, and disable the
-      # storing of norms. No norms means that index-time boosting and field
-      # length normalization will be disabled. The benefit is less memory
-      # usage as norms take up one byte per indexed field for every
-      # document in the index.
-      NO_NORMS = Index.new("NO_NORMS")
-    end
-
-    class TermVector < Ferret::Utils::Parameter
-      # Do not store term vectors.
-      NO = TermVector.new("NO")
-
-      # Store the term vectors of each document. A term vector is a list
-      # of the document's terms and their number of occurrences in that
-      # document.
-      YES = TermVector.new("YES")
-
-      # Store the term vector + token position information.
-      #
-      # See #YES
-      WITH_POSITIONS = TermVector.new("WITH_POSITIONS")
-
-      # Store the term vector + token offset information.
-      #
-      # See #YES
-      WITH_OFFSETS = TermVector.new("WITH_OFFSETS")
-
-      # Store the term vector + token position and offset information.
-      #
-      # See #YES, #WITH_POSITIONS and #WITH_OFFSETS
-      WITH_POSITIONS_OFFSETS = TermVector.new("WITH_POSITIONS_OFFSETS")
-    end
-
-    # Create a field by specifying its name, value and how it will
-    # be saved in the index.
-    #
-    # name::        The name of the field
-    # value::       The string to process
-    # store::       Whether _value_ should be stored in the index
-    # index::       Whether the field should be indexed, and if so, if it
-    #               should be tokenized before indexing
-    #
-    # term_vector:: Whether a term vector should be stored. An ArgumentError
-    #               is raised if the field is neither stored nor indexed, or
-    #               if it is not indexed but a term vector is requested.
-    #
-    # binary::      Whether you want to store binary data in this field.
-    #               Default is false
-    # boost::       the boost for this field. Default is 1.0. A larger
-    #               number makes this field more important.
-    def initialize(name,
-                   value,
-                   store = Store::YES,
-                   index = Index::UNTOKENIZED,
-                   term_vector = TermVector::NO,
-                   binary = false,
-                   boost = 1.0)
-      if (index == Index::NO and store == Store::NO)
-        raise ArgumentError, "it doesn't make sense to have a field that " +
-          "is neither indexed nor stored"
-      end
-      if (index == Index::NO && term_vector != TermVector::NO)
-        raise ArgumentError, "cannot store term vector information for a " +
-          "field that is not indexed"
-      end
-
-      # The name of the field (e.g., "date", "subject", "title", or "body")
-      @name = name.to_s
-
-      # the one and only data object for all different kinds of field values
-      @data = value
-      self.store = store
-      self.index = index
-      self.term_vector = term_vector
-      @binary = binary
-      @boost = boost
-    end
-
-    def store=(store)
-      case store
-      when Store::YES
-        @stored = true
-        @compressed = false
-      when Store::COMPRESS
-        @stored = true
-        @compressed = true
-      when Store::NO
-        @stored = false
-        @compressed = false
-      else
-        raise "unknown store parameter " + store.to_s
-      end
-    end
-
-    def index=(index)
-      @omit_norms = false
-      case index
-      when Index::NO
-        @indexed = false
-        @tokenized = false
-      when Index::TOKENIZED
-        @indexed = true
-        @tokenized = true
-      when Index::UNTOKENIZED
-        @indexed = true
-        @tokenized = false
-      when Index::NO_NORMS
-        @indexed = true
-        @tokenized = false
-        @omit_norms = true
-      else
-        raise "unknown index parameter " + index.to_s
-      end
-    end
-
-    def term_vector=(term_vector)
-      case term_vector
-      when TermVector::NO
-        @store_term_vector = false
-        @store_position = false
-        @store_offset = false
-      when TermVector::YES
-        @store_term_vector = true
-        @store_position = false
-        @store_offset = false
-      when TermVector::WITH_POSITIONS
-        @store_term_vector = true
-        @store_position = true
-        @store_offset = false
-      when TermVector::WITH_OFFSETS
-        @store_term_vector = true
-        @store_position = false
-        @store_offset = true
-      when TermVector::WITH_POSITIONS_OFFSETS
-        @store_term_vector = true
-        @store_position = true
-        @store_offset = true
-      else
-        raise "unknown term_vector parameter " + term_vector.to_s
-      end
-    end
-
-    # Returns the string value of the data that is stored in this field.
-    def string_value
-      if @data.instance_of? String
-        return @data
-      elsif @data.respond_to? :read
-        return @data.read()
-      else
-        # if it is a binary object try to return a string representation
-        return @data.to_s
-      end
-    end
-
-    # If the data is stored as a binary, just return it.
-    def binary_value
-      return @data
-    end
-
-    # Returns the data that is stored in this field as a Reader.
-    def reader_value
-      if @data.respond_to? :read
-        return @data
-      elsif @data.instance_of? String
-        return Ferret::Utils::StringHelper::StringReader.new(@data)
-      else
-        # if it is a binary object try to return a string representation
-        return Ferret::Utils::StringHelper::StringReader.new(@data.to_s)
-      end
-    end
-
-    # Create a stored field with a binary value. Optionally the value
-    # may be compressed. But it obviously won't be tokenized or
-    # term vectored or anything like that.
-    #
-    # name::  The name of the field
-    # value:: The binary value
-    # store:: How _value_ should be stored (compressed or not)
-    def Field.new_binary_field(name, value, stored)
-      if (stored == Store::NO)
-        raise ArgumentError, "binary values can't be unstored"
-      end
-      Field.new(name, value, stored, Index::NO, TermVector::NO, true)
-    end
-
-    # Prints a Field for human consumption.
-    def to_s()
-      str = ""
-      if (@stored)
-        str << "stored"
-        str << (@compressed ? "/compressed," : "/uncompressed,")
-      end
-      str << "indexed," if (@indexed)
-      str << "tokenized," if (@tokenized)
-      str << "store_term_vector," if (@store_term_vector)
-      str << "store_offsets," if (@store_offset)
-      str << "store_positions," if (@store_position)
-      str << "omit_norms," if (@omit_norms)
-      str << "binary," if (@binary)
-      str << "<#{@name}:#{@binary ? '=bin_data=' : @data}>"
-    end
-  end
-end
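Finally, a sketch of how the three parameter classes combined in this removed API (the thumb.jpg read is a placeholder for any binary payload):

    include Ferret::Document

    # a big text field: stored compressed, analyzed, full term vectors
    body = Field.new("body", "some long text ...", Field::Store::COMPRESS,
                     Field::Index::TOKENIZED,
                     Field::TermVector::WITH_POSITIONS_OFFSETS)
    body.compressed?        # => true
    body.store_positions?   # => true

    # a key field: stored and searchable as a single term
    id = Field.new("id", "prod-0042", Field::Store::YES,
                   Field::Index::UNTOKENIZED)
    id.tokenized?           # => false

    # binary payloads must be stored and are never indexed
    bytes = File.open("thumb.jpg", "rb") {|f| f.read }
    img = Field.new_binary_field("thumb", bytes, Field::Store::COMPRESS)
    img.binary?             # => true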