ferret 0.1.0
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
data/lib/ferret/analysis/standard_tokenizer.rb
@@ -0,0 +1,65 @@
if __FILE__ == $0
  module Ferret
  end
  $:.unshift File.dirname(__FILE__)
  require 'token_stream'
  require 'tokenizers'
  require 'token'
end

module Ferret::Analysis
  # The standard tokenizer is an advanced tokenizer which tokenizes most
  # words correctly as well as tokenizing things like email addresses, web
  # addresses, phone numbers, etc.

  class StandardTokenizer < RETokenizer
    ALPHA = /[[:alpha:]]+/
    APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
    ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
    P = /[_\/.,-]/
    HASDIGIT = /\w*\d\w*/

    protected

    # The regular expression used to match the next token.
    def token_re()
      #/#{NUM}|#{EMAIL}|#{ACRONYM}\w*|#{C0MPANY}|#{APOSTROPHE}|\w+/
      # This is a simplified version of the original Lucene standard
      # tokenizer. I think it works better. I hope so anyway. Any way to
      # do this more neatly?
      /[[:alpha:]]+(('[[:alpha:]]+)+
                    |\.([[:alpha:]]\.)+
                    |(@|\&)\w+([-.]\w+)*
                   )
        |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
            |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
            |(\.\w+)+
            |
           )
      /x
    end

    # Stem the 's and remove the '.'s from acronyms.
    def normalize(str)
      if str =~ /^#{ACRONYM}$/
        str.gsub!(/\./, '')
      elsif str =~ /^#{APOSTROPHE}$/
        str.gsub!(/'[sS]$/, '')
      end
      str
    end
  end
end

# Add this so we can play around with the standard tokenizer
if __FILE__ == $0
  st = "\033[7m"
  en = "\033[m"

  $stdin.each do |line|
    stk = Ferret::Analysis::StandardTokenizer.new(line)
    while tk = stk.next()
      puts " <" + tk.term_text + "> from " + tk.start_offset.to_s + " to " + tk.end_offset.to_s
    end
  end
end
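
A quick sketch of how the tokenizer above is meant to be used (assuming the gem is installed and require 'ferret' loads the analysis classes); the exact tokens produced depend on the regular expression in token_re:

  require 'ferret'
  include Ferret::Analysis

  # Emails and acronyms survive as single tokens; normalize() strips the
  # dots from acronyms, so "I.B.M." comes back as "IBM".
  StandardTokenizer.new("Email dave@example.com about I.B.M.").each do |token|
    puts "#{token.term_text} (#{token.start_offset}..#{token.end_offset})"
  end
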
data/lib/ferret/analysis/token.rb
@@ -0,0 +1,79 @@
module Ferret::Analysis
  # A Token is an occurrence of a term from the text of a field. It consists
  # of a term's text, the start and end offset of the term in the text of the
  # field, and a type string.
  #
  # The start and end offsets permit applications to re-associate a token with
  # its source text, e.g., to display highlighted query terms in a document
  # browser, or to show matching text fragments in a KWIC (KeyWord In Context)
  # display, etc.
  #
  # The type is an interned string, assigned by a lexical analyzer (a.k.a.
  # tokenizer), naming the lexical or syntactic class that the token belongs
  # to. For example an end of sentence marker token might be implemented with
  # type "eos". The default token type is "word".
  #
  # start_offset:: is the position of the first character corresponding to
  #                this token in the source text
  # end_offset::   is equal to one greater than the position of the last
  #                character corresponding to this token. Note that the
  #                difference between @end_offset and @start_offset may not be
  #                equal to @term_text.length(), as the term text may have been
  #                altered by a stemmer or some other filter.
  class Token
    include Comparable
    attr_accessor :term_text
    attr_reader :position_increment, :start_offset, :end_offset, :type

    # Constructs a Token with the given term text, and start and end offsets.
    # The type defaults to "word".
    def initialize(txt, so, eo, typ="word", pos_inc=1)
      @term_text = txt
      @start_offset = so
      @end_offset = eo
      @type = typ # lexical type
      @position_increment = pos_inc
    end

    # Tokens are sorted by the position in the text at which they occur, i.e.
    # the start_offset. If two tokens have the same start offset (see
    # position_increment=) then they are sorted by the end_offset and then
    # lexically by the token text.
    def <=>(o)
      r = @start_offset <=> o.start_offset
      return r if r != 0
      r = @end_offset <=> o.end_offset
      return r if r != 0
      r = @term_text <=> o.term_text
      return r
    end

    # Set the position increment. This determines the position of this token
    # relative to the previous Token in a TokenStream, used in phrase
    # searching.
    #
    # The default value is one.
    #
    # Some common uses for this are:
    #
    # * Set it to zero to put multiple terms in the same position. This is
    #   useful if, e.g., a word has multiple stems. Searches for phrases
    #   including either stem will match. In this case, all but the first
    #   stem's increment should be set to zero: the increment of the first
    #   instance should be one. Repeating a token with an increment of zero
    #   can also be used to boost the scores of matches on that token.
    #
    # * Set it to values greater than one to inhibit exact phrase matches.
    #   If, for example, one does not want phrases to match across removed
    #   stop words, then one could build a stop word filter that removes stop
    #   words and also sets the increment to the number of stop words removed
    #   before each non-stop word. Then exact phrase queries will only match
    #   when the terms occur with no intervening stop words.
    def position_increment=(pos_inc)
      if (pos_inc < 0)
        raise ArgumentError, "Increment must be zero or greater: " + pos_inc.to_s
      end
      @position_increment = pos_inc
    end
  end
end
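
A brief illustration of the position_increment semantics described above (a hypothetical snippet, not part of the gem itself):

  require 'ferret'
  include Ferret::Analysis

  # "running" and its stem "run" share one position, so a phrase query
  # matching either word at that slot would succeed.
  original = Token.new("running", 0, 7)   # position_increment defaults to 1
  stem     = Token.new("run", 0, 7)
  stem.position_increment = 0

  # Tokens compare by start_offset, then end_offset, then term text.
  p original <=> stem   # => 1 ("running" sorts after "run")
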
data/lib/ferret/analysis/token_filters.rb
@@ -0,0 +1,86 @@
module Ferret::Analysis
  # A TokenFilter is a TokenStream whose input is another token stream.
  #
  # This is an abstract class.
  class TokenFilter < TokenStream
    # Close the input TokenStream.
    def close()
      @input.close()
    end

    protected
    # Construct a token stream filtering the given input.
    def initialize(input)
      @input = input
    end
  end

  # Normalizes token text to lower case.
  class LowerCaseFilter < TokenFilter
    def next()
      t = @input.next()

      if (t == nil)
        return nil
      end

      t.term_text = t.term_text.downcase()

      return t
    end
  end

  # Removes stop words from a token stream. You will need to pass your own
  # set of stopwords to use this stop filter. If you wish to use the default
  # list of stopwords then use the StopAnalyzer.
  class StopFilter < TokenFilter
    # Constructs a filter which removes words from the input
    # TokenStream that are named in the array of words.
    def initialize(input, stop_set)
      super(input)
      @stop_set = stop_set
    end

    def StopFilter.new_with_file(input, path)
      ws = WordListLoader.word_set_from_file(path)
      return StopFilter.new(input, ws)
    end

    # Returns the next input Token whose term_text is not a stop word.
    def next()
      # return the first non-stop word found
      while token = @input.next()
        return token if ! @stop_set.include?(token.term_text)
      end
      return nil
    end
  end

  # Transforms the token stream as per the Porter stemming algorithm.
  # Note: the input to the stemming filter must already be in lower case,
  # so you will need to use LowerCaseFilter or LowerCaseTokenizer further
  # down the Tokenizer chain in order for this to work properly!
  #
  # To use this filter with other analyzers, you'll want to write an
  # Analyzer class that sets up the TokenStream chain as you want it.
  # To use this with LowerCaseTokenizer, for example, you'd write an
  # analyzer like this:
  #
  #   class MyAnalyzer < Analyzer
  #     def token_stream(field, reader)
  #       return PorterStemFilter.new(LowerCaseTokenizer.new(reader))
  #     end
  #   end
  class PorterStemFilter < TokenFilter
    # Returns the next input Token, after being stemmed
    def next()
      token = @input.next()
      if (token == nil)
        return nil
      else
        token.term_text = Stemmable.stem_porter(token.term_text)
      end
      token
    end
  end
end
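
A minimal sketch of chaining these filters over a tokenizer (the stop word list here is only an illustration; WordListLoader and WhiteSpaceTokenizer come from the neighbouring analysis files):

  require 'ferret'
  include Ferret::Analysis

  stop_words = WordListLoader.word_set_from_array(["the", "and", "a"])

  stream = StopFilter.new(
    LowerCaseFilter.new(WhiteSpaceTokenizer.new("The Quick AND the Dead")),
    stop_words)

  terms = []
  stream.each { |t| terms << t.term_text }
  p terms   # => ["quick", "dead"]
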
data/lib/ferret/analysis/token_stream.rb
@@ -0,0 +1,26 @@
module Ferret::Analysis
  # A TokenStream enumerates the sequence of tokens, either from
  # fields of a document or from query text.
  #
  # This is an abstract class. Concrete subclasses are:
  # * Tokenizer, a TokenStream whose input is a Reader; and
  # * TokenFilter, a TokenStream whose input is another TokenStream.
  class TokenStream
    # Returns the next token in the stream, or nil at end of stream.
    def next
      raise NotImplementedError
    end

    # Releases resources associated with this stream.
    def close
      raise NotImplementedError
    end

    # Iterates through the tokens in the field
    def each # :yields: token
      while (n = self.next())
        yield n
      end
    end
  end
end
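
Since TokenStream only requires next() and close() (each() is built on top of next()), a concrete stream can be very small. A purely illustrative subclass that replays a fixed array of tokens:

  require 'ferret'

  # Replays a fixed list of tokens; next() returns nil once the list is
  # empty, which is what each() uses as its end-of-stream signal.
  class ArrayTokenStream < Ferret::Analysis::TokenStream
    def initialize(tokens)
      @tokens = tokens.dup
    end

    def next
      @tokens.shift
    end

    def close
      @tokens.clear
    end
  end

  tokens = [Ferret::Analysis::Token.new("hello", 0, 5),
            Ferret::Analysis::Token.new("world", 6, 11)]
  ArrayTokenStream.new(tokens).each { |t| puts t.term_text }   # "hello", "world"
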
data/lib/ferret/analysis/tokenizers.rb
@@ -0,0 +1,107 @@
require 'strscan'

module Ferret::Analysis
  # A Tokenizer is a TokenStream whose input is a Reader.
  #
  # This is an abstract class.
  class Tokenizer < TokenStream
    # By default, closes the input Reader.
    def close()
      @input.close()
    end

    protected
    # Construct a token stream processing the given input.
    def initialize(input)
      @input = input
    end
  end

  # An abstract base class for simple regular expression oriented
  # tokenizers. Very powerful tokenizers can be created using this class as
  # can be seen from the StandardTokenizer class. Below is an example of a
  # simple implementation of a LetterTokenizer using an RETokenizer.
  # Basically, a token is a sequence of alphabetic characters separated by
  # one or more non-alphabetic characters.
  #
  #   class LetterTokenizer < RETokenizer
  #     def token_re()
  #       /[a-zA-Z]+/
  #     end
  #   end
  class RETokenizer < Tokenizer

    # Initialize with an IO implementing input such as a file.
    #
    # input:: must have a read(count) method which returns an array or string
    #         of _count_ chars.
    def initialize(input)
      if input.is_a? String
        @ss = StringScanner.new(input)
      else
        @ss = StringScanner.new(input.read())
      end
    end

    # Returns the next token in the stream, or nil at end of stream.
    def next()
      if @ss.scan_until(token_re)
        term = @ss.matched
        term_end = @ss.pos
        term_start = term_end - term.size
      else
        return nil
      end

      return Token.new(normalize(term), term_start, term_end)
    end

    def close()
      @ss = nil
    end

    protected
    # Returns the regular expression used to find the next token.
    def token_re
      /[a-zA-Z]+/
    end

    # Called on each token to normalize it before it is added to the
    # token stream. The default implementation does nothing. Subclasses may
    # use this to, e.g., lowercase tokens.
    def normalize(str) return str end
  end


  # A LetterTokenizer is a tokenizer that divides text at non-letters.
  # That is to say, it defines tokens as maximal strings of adjacent letters,
  # as defined by the regular expression _/[a-zA-Z]+/_.
  class LetterTokenizer < RETokenizer
    protected
    # Collects only characters which satisfy the regular expression
    # _/[a-zA-Z]+/_.
    def token_re()
      /[a-zA-Z]+/
    end
  end

  # LowerCaseTokenizer performs the function of LetterTokenizer
  # and LowerCaseFilter together. It divides text at non-letters and converts
  # them to lower case.
  class LowerCaseTokenizer < LetterTokenizer
    protected
    def normalize(str)
      return str.downcase
    end
  end

  # A WhiteSpaceTokenizer is a tokenizer that divides text at whitespace.
  # Adjacent sequences of non-whitespace characters form tokens.
  class WhiteSpaceTokenizer < RETokenizer
    protected
    # Collects only characters which are not spaces, tabs or carriage returns.
    def token_re()
      /\S+/
    end
  end
end
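
Following the LetterTokenizer pattern above, a custom tokenizer only needs to supply token_re (and optionally normalize). For example, a sketch of a tokenizer that only emits runs of digits:

  require 'ferret'

  class DigitTokenizer < Ferret::Analysis::RETokenizer
    protected
    # Tokens are maximal runs of digits.
    def token_re
      /\d+/
    end
  end

  DigitTokenizer.new("room 101, floor 3").each do |t|
    puts "#{t.term_text} [#{t.start_offset}, #{t.end_offset})"
  end
  # prints: 101 [5, 8)
  #         3 [16, 17)
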
data/lib/ferret/analysis/word_list_loader.rb
@@ -0,0 +1,27 @@
require 'set'
module Ferret::Analysis
  # Loader for text files that represent a list of stopwords.
  module WordListLoader
    # Loads a text file and adds every line as an entry to a Set (omitting
    # leading and trailing whitespace). Every line of the file should contain
    # only one word. The words need to be in lowercase if you make use of an
    # Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
    #
    # path::   path to file containing the wordlist
    # return:: a Set with the file's words
    def WordListLoader.word_set_from_file(path)
      result = Set.new()
      File.open(path) do |word_file|
        # we have to strip the end of line characters
        word_file.each {|line| result << line[0..-2] }
      end
      return result
    end

    def WordListLoader.word_set_from_array(word_array)
      result = Set.new()
      word_array.each {|word| result << word }
      return result
    end
  end
end
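
A short illustration of the two loaders above (the file name is made up for the example):

  require 'ferret'
  include Ferret::Analysis

  # From an in-memory array:
  stops = WordListLoader.word_set_from_array(%w(a an the))
  stops.include?("the")   # => true

  # From a file with one word per line (hypothetical path):
  # stops = WordListLoader.word_set_from_file("stopwords.txt")
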
data/lib/ferret/document/document.rb
@@ -0,0 +1,152 @@
module Ferret::Document
  # Documents are the unit of indexing and search.
  #
  # A Document is a set of fields. Each field has a name and a textual
  # value. A field may be stored (see Field#stored?()) with the document, in
  # which case it is returned with search hits on the document. Thus each
  # document should typically contain one or more stored fields which
  # uniquely identify it.
  #
  # Note that fields which are _not_ stored (Field#stored?()) are _not_
  # available in documents retrieved from the index, e.g. with Hits#doc,
  # Searcher#doc or IndexReader#document.
  #
  # Several fields may be added with the same name. In this case, if the
  # fields are indexed, their text is treated as though appended for the
  # purposes of search.
  #
  # Note that add, like the remove_field(s) methods, only makes sense prior
  # to adding a document to an index. These methods cannot be used to change
  # the content of an existing index! In order to achieve this, a document
  # has to be deleted from an index and a new changed version of that
  # document has to be added.
  class Document
    attr_accessor :boost

    # Constructs a new document with no fields.
    def initialize()
      # Values are multiplied into the value of Field#boost of each field in
      # this document. Thus, this attribute in effect sets a default boost
      # for the fields of this document.
      #
      # The default value is 1.0.
      #
      # Note: This value is not stored directly with the document in the
      # index. Documents returned from IndexReader#document and Hits#doc
      # may thus not have the same value present as when this document was
      # indexed.
      @boost = 1.0
      @fields = {}
    end

    # Returns an array of all fields. Note that it is possible for two
    # fields to appear with the same field name. These will be concatenated
    # in the index.
    def all_fields
      @fields.values.flatten
    end

    # Returns the number of distinct fields held within the document. This
    # counts fields which have multiple entries as one.
    def field_count()
      return @fields.size
    end

    # Returns the number of entries held within the document. This counts
    # all sections, so for fields which have multiple entries, each entry
    # is counted.
    def entry_count()
      return @fields.values.flatten.size
    end

    # Adds a field to a document. Several fields may be added with the same
    # name. In this case, if the fields are indexed, their text is treated
    # as though appended for the purposes of search.
    #
    # Note that add, like the remove_field(s) methods, only makes sense prior
    # to adding a document to an index. These methods cannot be used to
    # change the content of an existing index! In order to achieve this, a
    # document has to be deleted from an index and a new changed version of
    # that document has to be added.
    def add_field(field)
      (@fields[field.name] ||= []) << field
    end
    alias :<< :add_field

    # Removes the first field of this name if it exists.
    def remove_field(name)
      @fields[name].delete_at(0)
    end

    # Removes all fields with the given name from the document.
    #
    # If there is no field with the specified name, the document remains
    # unchanged.
    #
    # Note that the remove_field(s) methods, like the add method, only make
    # sense prior to adding a document to an index. These methods cannot be
    # used to change the content of an existing index! In order to achieve
    # this, a document has to be deleted from an index and a new changed
    # version of that document has to be added.
    def remove_fields(name)
      @fields.delete(name)
    end

    # Returns the first field with the given name.
    # This method can return _nil_.
    #
    # name::   the name of the field
    # Return:: the first _Field_ with the given name, or _nil_
    def field(name)
      @fields[name] ? @fields[name][0] : nil
    end

    # Returns an array of all fields with the given name.
    # This method can return _nil_.
    #
    # name::   the name of the field
    # Return:: a _Field_ array
    def fields(name)
      @fields[name]
    end

    # Returns the values of all non-binary fields with the given name, joined
    # into a single string. This method can return _nil_.
    #
    # name::   the name of the field
    # Return:: a _String_ of field values
    def values(name)
      return nil if @fields[name].nil?
      @fields[name].map {|f| f.data if not f.binary? }.join(" ")
    end
    alias :[] :values

    # Sets the data in the field named +field_name+ to +data+. If there is
    # more than one field of that name then it will set the data in the
    # first field of that name.
    def []=(field_name, data)
      field = field(field_name)
      raise ArgumentError, "Field does not exist" unless field
      field.data = data
    end

    # Returns an array of the binary values of the fields with the given
    # name.
    #
    # name::   the name of the field
    # Return:: an _Array_ of binary field values
    def binaries(name)
      binaries = []
      @fields[name].each {|f| binaries << f.data if f.binary? }
      return binaries
    end

    # Prints the fields of a document for human consumption.
    def to_s()
      field_str = ""
      @fields.each_key { |name| field_str += name + " " }
      field_str[-1] = ">"
      return "Document<" + field_str
    end
  end
end
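
A sketch of the Document API above in use. The real Field class lives in data/lib/ferret/document/field.rb (not shown in this hunk), so a stand-in with the same duck type (name, data/data= and binary?) is used here purely for illustration:

  require 'ferret'

  # Hypothetical stand-in for Ferret::Document::Field, exposing only the
  # methods Document itself calls.
  ExampleField = Struct.new(:name, :data) do
    def binary?() false end
  end

  doc = Ferret::Document::Document.new
  doc << ExampleField.new("title", "An Indexing Library")
  doc << ExampleField.new("author", "Dave")
  doc << ExampleField.new("author", "Balmain")

  doc.field_count    # => 2 distinct field names
  doc.entry_count    # => 3 entries in total
  doc["author"]      # => "Dave Balmain" (non-binary values joined with a space)

  doc["title"] = "Ferret"    # updates the data of the first "title" field
  doc.field("title").data    # => "Ferret"
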