ferret 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +20 -0
- data/README +109 -0
- data/Rakefile +275 -0
- data/TODO +9 -0
- data/TUTORIAL +197 -0
- data/ext/extconf.rb +3 -0
- data/ext/ferret.c +23 -0
- data/ext/ferret.h +85 -0
- data/ext/index_io.c +543 -0
- data/ext/priority_queue.c +227 -0
- data/ext/ram_directory.c +316 -0
- data/ext/segment_merge_queue.c +41 -0
- data/ext/string_helper.c +42 -0
- data/ext/tags +240 -0
- data/ext/term.c +261 -0
- data/ext/term_buffer.c +299 -0
- data/ext/util.c +12 -0
- data/lib/ferret.rb +41 -0
- data/lib/ferret/analysis.rb +11 -0
- data/lib/ferret/analysis/analyzers.rb +93 -0
- data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
- data/lib/ferret/analysis/token.rb +79 -0
- data/lib/ferret/analysis/token_filters.rb +86 -0
- data/lib/ferret/analysis/token_stream.rb +26 -0
- data/lib/ferret/analysis/tokenizers.rb +107 -0
- data/lib/ferret/analysis/word_list_loader.rb +27 -0
- data/lib/ferret/document.rb +2 -0
- data/lib/ferret/document/document.rb +152 -0
- data/lib/ferret/document/field.rb +304 -0
- data/lib/ferret/index.rb +26 -0
- data/lib/ferret/index/compound_file_io.rb +343 -0
- data/lib/ferret/index/document_writer.rb +288 -0
- data/lib/ferret/index/field_infos.rb +259 -0
- data/lib/ferret/index/fields_io.rb +175 -0
- data/lib/ferret/index/index.rb +228 -0
- data/lib/ferret/index/index_file_names.rb +33 -0
- data/lib/ferret/index/index_reader.rb +462 -0
- data/lib/ferret/index/index_writer.rb +488 -0
- data/lib/ferret/index/multi_reader.rb +363 -0
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
- data/lib/ferret/index/segment_infos.rb +130 -0
- data/lib/ferret/index/segment_merge_info.rb +47 -0
- data/lib/ferret/index/segment_merge_queue.rb +16 -0
- data/lib/ferret/index/segment_merger.rb +337 -0
- data/lib/ferret/index/segment_reader.rb +380 -0
- data/lib/ferret/index/segment_term_enum.rb +178 -0
- data/lib/ferret/index/segment_term_vector.rb +58 -0
- data/lib/ferret/index/term.rb +49 -0
- data/lib/ferret/index/term_buffer.rb +88 -0
- data/lib/ferret/index/term_doc_enum.rb +283 -0
- data/lib/ferret/index/term_enum.rb +52 -0
- data/lib/ferret/index/term_info.rb +41 -0
- data/lib/ferret/index/term_infos_io.rb +312 -0
- data/lib/ferret/index/term_vector_offset_info.rb +20 -0
- data/lib/ferret/index/term_vectors_io.rb +552 -0
- data/lib/ferret/query_parser.rb +274 -0
- data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
- data/lib/ferret/search.rb +49 -0
- data/lib/ferret/search/boolean_clause.rb +100 -0
- data/lib/ferret/search/boolean_query.rb +303 -0
- data/lib/ferret/search/boolean_scorer.rb +294 -0
- data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
- data/lib/ferret/search/conjunction_scorer.rb +99 -0
- data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
- data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
- data/lib/ferret/search/explanation.rb +41 -0
- data/lib/ferret/search/field_cache.rb +216 -0
- data/lib/ferret/search/field_doc.rb +31 -0
- data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
- data/lib/ferret/search/filter.rb +11 -0
- data/lib/ferret/search/filtered_query.rb +130 -0
- data/lib/ferret/search/filtered_term_enum.rb +79 -0
- data/lib/ferret/search/fuzzy_query.rb +153 -0
- data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
- data/lib/ferret/search/hit_collector.rb +34 -0
- data/lib/ferret/search/hit_queue.rb +11 -0
- data/lib/ferret/search/index_searcher.rb +173 -0
- data/lib/ferret/search/match_all_docs_query.rb +104 -0
- data/lib/ferret/search/multi_phrase_query.rb +204 -0
- data/lib/ferret/search/multi_term_query.rb +65 -0
- data/lib/ferret/search/non_matching_scorer.rb +22 -0
- data/lib/ferret/search/phrase_positions.rb +55 -0
- data/lib/ferret/search/phrase_query.rb +217 -0
- data/lib/ferret/search/phrase_scorer.rb +153 -0
- data/lib/ferret/search/prefix_query.rb +47 -0
- data/lib/ferret/search/query.rb +111 -0
- data/lib/ferret/search/query_filter.rb +51 -0
- data/lib/ferret/search/range_filter.rb +103 -0
- data/lib/ferret/search/range_query.rb +139 -0
- data/lib/ferret/search/req_excl_scorer.rb +125 -0
- data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
- data/lib/ferret/search/score_doc.rb +38 -0
- data/lib/ferret/search/score_doc_comparator.rb +114 -0
- data/lib/ferret/search/scorer.rb +91 -0
- data/lib/ferret/search/similarity.rb +278 -0
- data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
- data/lib/ferret/search/sort.rb +105 -0
- data/lib/ferret/search/sort_comparator.rb +60 -0
- data/lib/ferret/search/sort_field.rb +87 -0
- data/lib/ferret/search/spans.rb +12 -0
- data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
- data/lib/ferret/search/spans/span_first_query.rb +79 -0
- data/lib/ferret/search/spans/span_near_query.rb +108 -0
- data/lib/ferret/search/spans/span_not_query.rb +130 -0
- data/lib/ferret/search/spans/span_or_query.rb +176 -0
- data/lib/ferret/search/spans/span_query.rb +25 -0
- data/lib/ferret/search/spans/span_scorer.rb +74 -0
- data/lib/ferret/search/spans/span_term_query.rb +105 -0
- data/lib/ferret/search/spans/span_weight.rb +84 -0
- data/lib/ferret/search/spans/spans_enum.rb +44 -0
- data/lib/ferret/search/term_query.rb +128 -0
- data/lib/ferret/search/term_scorer.rb +181 -0
- data/lib/ferret/search/top_docs.rb +24 -0
- data/lib/ferret/search/top_field_docs.rb +17 -0
- data/lib/ferret/search/weight.rb +54 -0
- data/lib/ferret/search/wildcard_query.rb +26 -0
- data/lib/ferret/search/wildcard_term_enum.rb +61 -0
- data/lib/ferret/stemmers.rb +1 -0
- data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
- data/lib/ferret/store.rb +5 -0
- data/lib/ferret/store/buffered_index_io.rb +191 -0
- data/lib/ferret/store/directory.rb +139 -0
- data/lib/ferret/store/fs_store.rb +338 -0
- data/lib/ferret/store/index_io.rb +259 -0
- data/lib/ferret/store/ram_store.rb +282 -0
- data/lib/ferret/utils.rb +7 -0
- data/lib/ferret/utils/bit_vector.rb +105 -0
- data/lib/ferret/utils/date_tools.rb +138 -0
- data/lib/ferret/utils/number_tools.rb +91 -0
- data/lib/ferret/utils/parameter.rb +41 -0
- data/lib/ferret/utils/priority_queue.rb +120 -0
- data/lib/ferret/utils/string_helper.rb +47 -0
- data/lib/ferret/utils/weak_key_hash.rb +51 -0
- data/rake_utils/code_statistics.rb +106 -0
- data/setup.rb +1551 -0
- data/test/benchmark/tb_ram_store.rb +76 -0
- data/test/benchmark/tb_rw_vint.rb +26 -0
- data/test/longrunning/tc_numbertools.rb +60 -0
- data/test/longrunning/tm_store.rb +19 -0
- data/test/test_all.rb +9 -0
- data/test/test_helper.rb +6 -0
- data/test/unit/analysis/tc_analyzer.rb +21 -0
- data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
- data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
- data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
- data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
- data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
- data/test/unit/analysis/tc_stop_filter.rb +14 -0
- data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
- data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
- data/test/unit/analysis/tc_word_list_loader.rb +32 -0
- data/test/unit/document/tc_document.rb +47 -0
- data/test/unit/document/tc_field.rb +80 -0
- data/test/unit/index/tc_compound_file_io.rb +107 -0
- data/test/unit/index/tc_field_infos.rb +119 -0
- data/test/unit/index/tc_fields_io.rb +167 -0
- data/test/unit/index/tc_index.rb +140 -0
- data/test/unit/index/tc_index_reader.rb +622 -0
- data/test/unit/index/tc_index_writer.rb +57 -0
- data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
- data/test/unit/index/tc_segment_infos.rb +74 -0
- data/test/unit/index/tc_segment_term_docs.rb +17 -0
- data/test/unit/index/tc_segment_term_enum.rb +60 -0
- data/test/unit/index/tc_segment_term_vector.rb +71 -0
- data/test/unit/index/tc_term.rb +22 -0
- data/test/unit/index/tc_term_buffer.rb +57 -0
- data/test/unit/index/tc_term_info.rb +19 -0
- data/test/unit/index/tc_term_infos_io.rb +192 -0
- data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
- data/test/unit/index/tc_term_vectors_io.rb +108 -0
- data/test/unit/index/th_doc.rb +244 -0
- data/test/unit/query_parser/tc_query_parser.rb +84 -0
- data/test/unit/search/tc_filter.rb +113 -0
- data/test/unit/search/tc_fuzzy_query.rb +136 -0
- data/test/unit/search/tc_index_searcher.rb +188 -0
- data/test/unit/search/tc_search_and_sort.rb +98 -0
- data/test/unit/search/tc_similarity.rb +37 -0
- data/test/unit/search/tc_sort.rb +48 -0
- data/test/unit/search/tc_sort_field.rb +27 -0
- data/test/unit/search/tc_spans.rb +153 -0
- data/test/unit/store/tc_fs_store.rb +84 -0
- data/test/unit/store/tc_ram_store.rb +35 -0
- data/test/unit/store/tm_store.rb +180 -0
- data/test/unit/store/tm_store_lock.rb +68 -0
- data/test/unit/ts_analysis.rb +16 -0
- data/test/unit/ts_document.rb +4 -0
- data/test/unit/ts_index.rb +18 -0
- data/test/unit/ts_query_parser.rb +3 -0
- data/test/unit/ts_search.rb +10 -0
- data/test/unit/ts_store.rb +6 -0
- data/test/unit/ts_utils.rb +10 -0
- data/test/unit/utils/tc_bit_vector.rb +65 -0
- data/test/unit/utils/tc_date_tools.rb +50 -0
- data/test/unit/utils/tc_number_tools.rb +59 -0
- data/test/unit/utils/tc_parameter.rb +40 -0
- data/test/unit/utils/tc_priority_queue.rb +62 -0
- data/test/unit/utils/tc_string_helper.rb +21 -0
- data/test/unit/utils/tc_weak_key_hash.rb +25 -0
- metadata +251 -0
data/lib/ferret/analysis/standard_tokenizer.rb
@@ -0,0 +1,65 @@
+if __FILE__ == $0
+  module Ferret
+  end
+  $:.unshift File.dirname(__FILE__)
+  require 'token_stream'
+  require 'tokenizers'
+  require 'token'
+end
+
+module Ferret::Analysis
+  # The standard tokenizer is an advanced tokenizer which tokenizes most
+  # words correctly as well as tokenizing things like email addresses, web
+  # addresses, phone numbers, etc.
+
+  class StandardTokenizer < RETokenizer
+    ALPHA      = /[[:alpha:]]+/
+    APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
+    ACRONYM    = /#{ALPHA}\.(#{ALPHA}\.)+/
+    P          = /[_\/.,-]/
+    HASDIGIT   = /\w*\d\w*/
+
+    protected
+
+      # Returns the regular expression used to match words, acronyms, email and web addresses, etc.
+      def token_re()
+        #/#{NUM}|#{EMAIL}|#{ACRONYM}\w*|#{C0MPANY}|#{APOSTROPHE}|\w+/
+        # This is a simplified version of the original Lucene standard
+        # tokenizer. I think it works better. I hope so anyway. Any way to
+        # do this more neatly?
+        /[[:alpha:]]+(('[[:alpha:]]+)+
+                      |\.([[:alpha:]]\.)+
+                      |(@|\&)\w+([-.]\w+)*
+                     )
+        |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
+            |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
+            |(\.\w+)+
+            |
+           )
+        /x
+      end
+
+      # stem the 's and remove the '.'s from acronyms
+      def normalize(str)
+        if str =~ /^#{ACRONYM}$/
+          str.gsub!(/\./, '')
+        elsif str =~ /^#{APOSTROPHE}$/
+          str.gsub!(/'[sS]$/, '')
+        end
+        str
+      end
+  end
+end
+
+# Add this so we can play around with the standard tokenizer
+if __FILE__ == $0
+  st = "\033[7m"
+  en = "\033[m"
+
+  $stdin.each do |line|
+    stk = Ferret::Analysis::StandardTokenizer.new(line)
+    while tk = stk.next()
+      puts " <" + tk.term_text + "> from " + tk.start_offset.to_s + " to " + tk.end_offset.to_s
+    end
+  end
+end
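For reference, the tokenizer above can be driven directly from a string, since the RETokenizer base class (see the tokenizers.rb hunk further down) accepts either a String or an IO-like object. A minimal sketch, assuming the gem is loaded with require 'ferret'; the sample text is illustrative only:

    require 'ferret'

    stk = Ferret::Analysis::StandardTokenizer.new("Send mail to info@example.com, A.S.A.P.")
    while tk = stk.next()
      # each Token carries the matched text plus its start and end offsets
      puts "#{tk.term_text} (#{tk.start_offset}..#{tk.end_offset})"
    end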
data/lib/ferret/analysis/token.rb
@@ -0,0 +1,79 @@
+module Ferret::Analysis
+  # A Token is an occurrence of a term from the text of a field. It consists
+  # of a term's text, the start and end offset of the term in the text of the
+  # field, and a type string.
+  #
+  # The start and end offsets permit applications to re-associate a token with
+  # its source text, e.g., to display highlighted query terms in a document
+  # browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+  # display, etc.
+  #
+  # The type is an interned string, assigned by a lexical analyzer (a.k.a.
+  # tokenizer), naming the lexical or syntactic class that the token belongs
+  # to. For example an end of sentence marker token might be implemented with
+  # type "eos". The default token type is "word".
+  #
+  # start_offset:: is the position of the first character corresponding to
+  #                this token in the source text
+  # end_offset::   is equal to one greater than the position of the last
+  #                character corresponding to this token. Note that the
+  #                difference between @end_offset and @start_offset may not be
+  #                equal to @term_text.length(), as the term text may have been
+  #                altered by a stemmer or some other filter.
+  class Token
+    include Comparable
+    attr_accessor :term_text
+    attr_reader :position_increment, :start_offset, :end_offset, :type
+
+    # Constructs a Token with the given term text, and start & end offsets.
+    # The type defaults to "word".
+    def initialize(txt, so, eo, typ="word", pos_inc=1)
+      @term_text = txt
+      @start_offset = so
+      @end_offset = eo
+      @type = typ # lexical type
+      @position_increment = pos_inc
+    end
+
+    # Tokens are sorted by the position in the text at which they occur, i.e.
+    # the start_offset. If two tokens have the same start offset (see
+    # position_increment=) then they are sorted by the end_offset and then
+    # lexically by the token text.
+    def <=>(o)
+      r = @start_offset <=> o.start_offset
+      return r if r != 0
+      r = @end_offset <=> o.end_offset
+      return r if r != 0
+      r = @term_text <=> o.term_text
+      return r
+    end
+
+    # Set the position increment. This determines the position of this token
+    # relative to the previous Token in a TokenStream, used in phrase
+    # searching.
+    #
+    # The default value is one.
+    #
+    # Some common uses for this are:
+    #
+    # * Set it to zero to put multiple terms in the same position. This is
+    #   useful if, e.g., a word has multiple stems. Searches for phrases
+    #   including either stem will match. In this case, all but the first
+    #   stem's increment should be set to zero: the increment of the first
+    #   instance should be one. Repeating a token with an increment of zero
+    #   can also be used to boost the scores of matches on that token.
+    #
+    # * Set it to values greater than one to inhibit exact phrase matches.
+    #   If, for example, one does not want phrases to match across removed
+    #   stop words, then one could build a stop word filter that removes stop
+    #   words and also sets the increment to the number of stop words removed
+    #   before each non-stop word. Then exact phrase queries will only match
+    #   when the terms occur with no intervening stop words.
+    def position_increment=(pos_inc)
+      if (pos_inc < 0)
+        raise ArgumentError, "Increment must be zero or greater: #{pos_inc}"
+      end
+      @position_increment = pos_inc
+    end
+  end
+end
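As a quick illustration of the ordering and position_increment semantics documented above (a sketch, not taken from the gem's own tests):

    include Ferret::Analysis

    quick = Token.new("quick", 4, 9)    # term text, start offset, end offset
    fast  = Token.new("fast", 4, 9)     # a synonym occupying the same offsets
    fast.position_increment = 0         # zero increment stacks it on the same position
    [quick, fast].sort.map { |t| t.term_text }   # => ["fast", "quick"] (ties broken lexically)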
data/lib/ferret/analysis/token_filters.rb
@@ -0,0 +1,86 @@
+module Ferret::Analysis
+  # A TokenFilter is a TokenStream whose input is another token stream.
+  #
+  # This is an abstract class.
+  class TokenFilter < TokenStream
+    # Close the input TokenStream.
+    def close()
+      @input.close()
+    end
+
+    protected
+      # Construct a token stream filtering the given input.
+      def initialize(input)
+        @input = input
+      end
+  end
+
+  # Normalizes token text to lower case.
+  class LowerCaseFilter < TokenFilter
+    def next()
+      t = @input.next()
+
+      if (t == nil)
+        return nil
+      end
+
+      t.term_text = t.term_text.downcase()
+
+      return t
+    end
+  end
+
+  # Removes stop words from a token stream. You will need to pass your own
+  # set of stopwords to use this stop filter. If you wish to use the default
+  # list of stopwords then use the StopAnalyzer.
+  class StopFilter < TokenFilter
+    # Constructs a filter which removes words from the input
+    # TokenStream that are named in the array of words.
+    def initialize(input, stop_set)
+      super(input)
+      @stop_set = stop_set
+    end
+
+    def StopFilter.new_with_file(input, path)
+      ws = WordListLoader.word_set_from_file(path)
+      return StopFilter.new(input, ws)
+    end
+
+    # Returns the next input Token whose term_text is not a stop word.
+    def next()
+      # return the first non-stop word found
+      while token = @input.next()
+        return token if ! @stop_set.include?(token.term_text)
+      end
+      return nil
+    end
+  end
+
+  # Transforms the token stream as per the Porter stemming algorithm.
+  # Note: the input to the stemming filter must already be in lower case,
+  # so you will need to use LowerCaseFilter or LowerCaseTokenizer further
+  # down the Tokenizer chain in order for this to work properly!
+  #
+  # To use this filter with other analyzers, you'll want to write an
+  # Analyzer class that sets up the TokenStream chain as you want it.
+  # To use this with LowerCaseTokenizer, for example, you'd write an
+  # analyzer like this:
+  #
+  #   class MyAnalyzer < Analyzer
+  #     def token_stream(field, reader)
+  #       return PorterStemFilter.new(LowerCaseTokenizer.new(reader))
+  #     end
+  #   end
+  class PorterStemFilter < TokenFilter
+    # Returns the next input Token, after being stemmed
+    def next()
+      token = @input.next()
+      if (token == nil)
+        return nil
+      else
+        token.term_text = Stemmable.stem_porter(token.term_text)
+      end
+      token
+    end
+  end
+end
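The filters above are designed to be chained, each wrapping the TokenStream below it. A minimal sketch of such a chain (class names are from the hunks in this diff; the input string is illustrative):

    include Ferret::Analysis

    stop_set = WordListLoader.word_set_from_array(["the", "and"])
    stream = StopFilter.new(
      LowerCaseFilter.new(WhiteSpaceTokenizer.new("The Quick AND the Dead")),
      stop_set)
    stream.each { |t| print t.term_text, " " }   # prints: quick dead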
data/lib/ferret/analysis/token_stream.rb
@@ -0,0 +1,26 @@
+module Ferret::Analysis
+  # A TokenStream enumerates the sequence of tokens, either from
+  # fields of a document or from query text.
+  #
+  # This is an abstract class. Concrete subclasses are:
+  # * Tokenizer, a TokenStream whose input is a Reader; and
+  # * TokenFilter, a TokenStream whose input is another TokenStream.
+  class TokenStream
+    # Returns the next token in the stream, or nil at EOS.
+    def next
+      raise NotImplementedError
+    end
+
+    # Releases resources associated with this stream.
+    def close
+      raise NotImplementedError
+    end
+
+    # Iterates through the tokens in the field
+    def each # :yields: token
+      while (n = self.next())
+        yield n
+      end
+    end
+  end
+end
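Since TokenStream only fixes the next/close/each contract, any object honouring it can sit in an analysis chain. A hypothetical subclass (not part of the gem) that replays a prepared array of tokens:

    class ArrayTokenStream < Ferret::Analysis::TokenStream
      def initialize(tokens)
        @tokens = tokens.dup
      end

      def next
        @tokens.shift    # returns nil once exhausted, signalling EOS
      end

      def close
        @tokens.clear
      end
    end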
data/lib/ferret/analysis/tokenizers.rb
@@ -0,0 +1,107 @@
+require 'strscan'
+
+module Ferret::Analysis
+  # A Tokenizer is a TokenStream whose input is a Reader.
+  #
+  # This is an abstract class.
+  class Tokenizer < TokenStream
+    # By default, closes the input Reader.
+    def close()
+      @input.close()
+    end
+
+    protected
+      # Construct a token stream processing the given input.
+      def initialize(input)
+        @input = input
+      end
+  end
+
+  # An abstract base class for simple regular expression oriented
+  # tokenizers. Very powerful tokenizers can be created using this class as
+  # can be seen from the StandardTokenizer class. Below is an example of a
+  # simple implementation of a LetterTokenizer using an RETokenizer.
+  # Basically, a token is a sequence of alphabetic characters separated by
+  # one or more non-alphabetic characters.
+  #
+  #   class LetterTokenizer < RETokenizer
+  #     def token_re()
+  #       /[a-zA-Z]+/
+  #     end
+  #   end
+  class RETokenizer < Tokenizer
+
+    # Initialize with an IO implementing input such as a file.
+    #
+    # input:: must have a read(count) method which returns an array or string
+    #         of _count_ chars.
+    def initialize(input)
+      if input.is_a? String
+        @ss = StringScanner.new(input)
+      else
+        @ss = StringScanner.new(input.read())
+      end
+    end
+
+    # Returns the next token in the stream, or nil at EOS.
+    def next()
+      if @ss.scan_until(token_re)
+        term = @ss.matched
+        term_end = @ss.pos
+        term_start = term_end - term.size
+      else
+        return nil
+      end
+
+      return Token.new(normalize(term), term_start, term_end)
+    end
+
+    def close()
+      @ss = nil
+    end
+
+    protected
+      # returns the regular expression used to find the next token
+      def token_re
+        /[a-zA-Z]+/
+      end
+
+      # Called on each token to normalize it before it is added to the
+      # token stream. The default implementation does nothing. Subclasses may
+      # use this to, e.g., lowercase tokens.
+      def normalize(str); return str; end
+  end
+
+
+  # A LetterTokenizer is a tokenizer that divides text at non-letters.
+  # That's to say, it defines tokens as maximal strings of adjacent letters,
+  # as defined by the regular expression _/[a-zA-Z]+/_.
+  class LetterTokenizer < RETokenizer
+    protected
+      # Collects only characters which satisfy the regular expression
+      # _/[a-zA-Z]+/_.
+      def token_re()
+        /[a-zA-Z]+/
+      end
+  end
+
+  # LowerCaseTokenizer performs the function of LetterTokenizer
+  # and LowerCaseFilter together. It divides text at non-letters and converts
+  # them to lower case.
+  class LowerCaseTokenizer < LetterTokenizer
+    protected
+      def normalize(str)
+        return str.downcase
+      end
+  end
+
+  # A WhiteSpaceTokenizer is a tokenizer that divides text at whitespace.
+  # Adjacent sequences of non-whitespace characters form tokens.
+  class WhiteSpaceTokenizer < RETokenizer
+    protected
+      # Collects only characters which are not spaces, tabs or carriage returns
+      def token_re()
+        /\S+/
+      end
+  end
+end
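Subclassing RETokenizer therefore only requires overriding token_re (and optionally normalize), exactly as the LetterTokenizer example in the comment shows. Another hypothetical subclass, purely for illustration, that emits runs of digits:

    class NumberTokenizer < Ferret::Analysis::RETokenizer
      protected
        def token_re
          /\d+/
        end
    end

    nt = NumberTokenizer.new("room 101, floor 3")
    puts nt.next.term_text   # => 101
    puts nt.next.term_text   # => 3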
data/lib/ferret/analysis/word_list_loader.rb
@@ -0,0 +1,27 @@
+require 'set'
+module Ferret::Analysis
+  # Loader for text files that represent a list of stopwords.
+  module WordListLoader
+    # Loads a text file and adds every line as an entry to a Set (omitting
+    # leading and trailing whitespace). Every line of the file should contain only
+    # one word. The words need to be in lowercase if you make use of an
+    # Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
+    #
+    # path:: path to file containing the wordlist
+    # return:: A Set with the file's words
+    def WordListLoader.word_set_from_file(path)
+      result = Set.new()
+      File.open(path) do |word_file|
+        # we have to strip the end of line characters
+        word_file.each {|line| result << line[0..-2] }
+      end
+      return result
+    end
+
+    def WordListLoader.word_set_from_array(word_array)
+      result = Set.new()
+      word_array.each {|word| result << word }
+      return result
+    end
+  end
+end
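Both loader methods return a Set suitable for passing straight to StopFilter. A small sketch (the file path is illustrative):

    include Ferret::Analysis

    stop_set = WordListLoader.word_set_from_array(%w(a an the))
    stop_set.include?("the")    # => true

    # or, with one word per line in a text file:
    # stop_set = WordListLoader.word_set_from_file("stop_words.txt")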
data/lib/ferret/document/document.rb
@@ -0,0 +1,152 @@
+module Ferret::Document
+  # Documents are the unit of indexing and search.
+  #
+  # A Document is a set of fields. Each field has a name and a textual
+  # value. A field may be stored (Field#stored?()) with the document, in which case
+  # it is returned with search hits on the document. Thus each document
+  # should typically contain one or more stored fields which uniquely
+  # identify it.
+  #
+  # Note that fields which are _not_ Field#stored?() are _not_ available in
+  # documents retrieved from the index, e.g. with Hits#doc, Searcher#doc or
+  # IndexReader#document.
+  #
+  # Several fields may be added with the same name. In this case, if the
+  # fields are indexed, their text is treated as though appended for the
+  # purposes of search.
+  #
+  # Note that add, like the remove_field(s) methods, only makes sense prior to
+  # adding a document to an index. These methods cannot be used to change
+  # the content of an existing index! In order to achieve this, a document
+  # has to be deleted from an index and a new changed version of that
+  # document has to be added.
+  class Document
+    attr_accessor :boost
+
+    # Constructs a new document with no fields.
+    def initialize()
+      # @boost is multiplied into the value of Field#boost of each field in
+      # this document. Thus, it in effect sets a default boost for
+      # the fields of this document.
+      #
+      # The default value is 1.0.
+      #
+      # Note: This value is not stored directly with the document in the
+      # index. Documents returned from IndexReader#document and Hits#doc
+      # may thus not have the same value present as when this document was
+      # indexed.
+      @boost = 1.0
+      @fields = {}
+    end
+
+    # Returns an array of all fields. Note that it is possible for two
+    # fields to appear with the same field name. These will be concatenated
+    # in the index.
+    def all_fields
+      @fields.values.flatten
+    end
+
+    # Returns the number of distinct fields held within the document. This
+    # counts fields which have multiple entries as one.
+    def field_count()
+      return @fields.size
+    end
+
+    # Returns the number of entries held within the document. This counts
+    # all sections, so for fields which have multiple entries each entry
+    # is counted.
+    def entry_count()
+      return @fields.values.flatten.size
+    end
+
+    # Adds a field to a document. Several fields may be added with the same
+    # name. In this case, if the fields are indexed, their text is treated
+    # as though appended for the purposes of search.
+    #
+    # Note that add, like the remove_field(s) methods, only makes sense prior
+    # to adding a document to an index. These methods cannot be used to
+    # change the content of an existing index! In order to achieve this, a
+    # document has to be deleted from an index and a new changed version of
+    # that document has to be added.
+    def add_field(field)
+      (@fields[field.name] ||= []) << field
+    end
+    alias :<< :add_field
+
+    # Removes the first field of this name if it exists.
+    def remove_field(name)
+      @fields[name].delete_at(0)
+    end
+
+    # Removes all fields with the given name from the document.
+    #
+    # If there is no field with the specified name, the document remains
+    # unchanged.
+    #
+    # Note that the remove_field(s) methods, like the add method, only make
+    # sense prior to adding a document to an index. These methods cannot be
+    # used to change the content of an existing index! In order to achieve
+    # this, a document has to be deleted from an index and a new changed
+    # version of that document has to be added.
+    def remove_fields(name)
+      @fields.delete(name)
+    end
+
+    # Returns the first field with the given name.
+    # This method can return _nil_.
+    #
+    # name:: the name of the field
+    # Return:: the first _Field_ with that name
+    def field(name)
+      @fields[name] ? @fields[name][0] : nil
+    end
+
+    # Returns an array of all fields with the given name.
+    # This method can return _nil_.
+    #
+    # name:: the name of the field
+    # Return:: a _Field_ array
+    def fields(name)
+      @fields[name]
+    end
+
+    # Returns a string of the values of the field specified as the method
+    # parameter, joined with spaces. This method can return _nil_.
+    #
+    # name:: the name of the field
+    # Return:: a _String_ of field values
+    def values(name)
+      return nil if @fields[name].nil?
+      @fields[name].map {|f| f.data if not f.binary? }.join(" ")
+    end
+    alias :[] :values
+
+    # Sets the data in field +field_name+ to +data+. If there is more than one
+    # field of that name then it will set the data in the first field of that
+    # name.
+    def []=(field_name, data)
+      field = field(field_name)
+      raise ArgumentError, "Field does not exist" unless field
+      field.data = data
+    end
+
+    # Returns an array of the binary data of the field specified as the method
+    # parameter. This method can return _nil_.
+    #
+    # name:: the name of the field
+    # Return:: an _Array_ of binary field values
+    def binaries(name)
+      binaries = []
+      @fields[name].each {|f| binaries << f.data if f.binary? }
+      return binaries
+    end
+
+    # Prints the fields of a document for human consumption.
+    def to_s()
+      field_str = ""
+      @fields.each_key { |name| field_str += name + " " }
+      field_str[-1] = ">"
+      return "Document<" + field_str
+    end
+  end
+end
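To show how the accessors above interact, here is a short sketch. The Field class lives in data/lib/ferret/document/field.rb, which is not shown in this diff, so a minimal stand-in exposing the #name, #data and #binary? interface that Document relies on is used purely for illustration:

    # Hypothetical stand-in for Ferret::Document::Field (see field.rb).
    StubField = Struct.new(:name, :data) do
      def binary?; false; end
    end

    doc = Ferret::Document::Document.new
    doc << StubField.new("title", "Ferret")
    doc << StubField.new("tag", "search")
    doc << StubField.new("tag", "ruby")

    doc.field_count    # => 2  (distinct field names)
    doc.entry_count    # => 3  (every entry counted)
    doc["tag"]         # => "search ruby"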