ferret 0.1.0

Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
data/lib/ferret/analysis/standard_tokenizer.rb
@@ -0,0 +1,65 @@
+if __FILE__ == $0
+  module Ferret
+  end
+  $:.unshift File.dirname(__FILE__)
+  require 'token_stream'
+  require 'tokenizers'
+  require 'token'
+end
+
+module Ferret::Analysis
+  # The standard tokenizer is an advanced tokenizer which tokenizes most
+  # words correctly as well as tokenizing things like email addresses, web
+  # addresses, phone numbers, etc.
+
+  class StandardTokenizer < RETokenizer
+    ALPHA      = /[[:alpha:]]+/
+    APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
+    ACRONYM    = /#{ALPHA}\.(#{ALPHA}\.)+/
+    P          = /[_\/.,-]/
+    HASDIGIT   = /\w*\d\w*/
+
+    protected
+
+    # Returns the regular expression used to match words, apostrophes, acronyms, email and web addresses, numbers, etc.
+    def token_re()
+      #/#{NUM}|#{EMAIL}|#{ACRONYM}\w*|#{C0MPANY}|#{APOSTROPHE}|\w+/
+      # This is a simplified version of the original Lucene standard
+      # tokenizer. I think it works better. I hope so anyway. Any way to
+      # do this more neatly?
+      /[[:alpha:]]+(('[[:alpha:]]+)+
+                    |\.([[:alpha:]]\.)+
+                    |(@|\&)\w+([-.]\w+)*
+                    )
+        |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
+            |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
+            |(\.\w+)+
+            |
+            )
+      /x
+    end
+
+    # stem the 's and remove the '.'s from acronyms
+    def normalize(str)
+      if str =~ /^#{ACRONYM}$/
+        str.gsub!(/\./, '')
+      elsif str =~ /^#{APOSTROPHE}$/
+        str.gsub!(/'[sS]$/, '')
+      end
+      str
+    end
+  end
+end
+
+# Add this so we can play around with the standard tokenizer
+if __FILE__ == $0
+  st = "\033[7m"
+  en = "\033[m"
+
+  $stdin.each do |line|
+    stk = Ferret::Analysis::StandardTokenizer.new(line)
+    while tk = stk.next()
+      puts " <" + tk.term_text + "> from " + tk.start_offset.to_s + " to " + tk.end_offset.to_s
+    end
+  end
+end
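
For a quick feel for what the tokenizer above produces, here is a small usage sketch. It is not part of the gem's files; it assumes the gem is installed and loaded via require 'ferret', and the sample text is arbitrary.

    require 'ferret'

    tk = Ferret::Analysis::StandardTokenizer.new("Send mail to dave@example.com today")
    while t = tk.next
      puts "#{t.term_text} (#{t.start_offset}..#{t.end_offset})"
    end
    # expected tokens: Send, mail, to, dave@example.com, today --
    # the address survives as a single token and nothing is lowercased here
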
data/lib/ferret/analysis/token.rb
@@ -0,0 +1,79 @@
+module Ferret::Analysis
+  # A Token is an occurrence of a term from the text of a field.  It consists
+  # of a term's text, the start and end offset of the term in the text of the
+  # field, and a type string.
+  #
+  # The start and end offsets permit applications to re-associate a token with
+  # its source text, e.g., to display highlighted query terms in a document
+  # browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+  # display, etc.
+  #
+  # The type is an interned string, assigned by a lexical analyzer (a.k.a.
+  # tokenizer), naming the lexical or syntactic class that the token belongs
+  # to.  For example an end of sentence marker token might be implemented with
+  # type "eos".  The default token type is "word".
+  #
+  # start_offset:: is the position of the first character corresponding to
+  #                this token in the source text
+  # end_offset::   is equal to one greater than the position of the last
+  #                character corresponding to this token.  Note that the
+  #                difference between @end_offset and @start_offset may not be
+  #                equal to @term_text.length(), as the term text may have been
+  #                altered by a stemmer or some other filter.
+  class Token
+    include Comparable
+    attr_accessor :term_text
+    attr_reader :position_increment, :start_offset, :end_offset, :type
+
+    # Constructs a Token with the given term text, and start & end offsets.
+    # The type defaults to "word."
+    def initialize(txt, so, eo, typ="word", pos_inc=1)
+      @term_text = txt
+      @start_offset = so
+      @end_offset = eo
+      @type = typ # lexical type
+      @position_increment = pos_inc
+    end
+
+    # Tokens are sorted by the position in the text at which they occur, i.e.
+    # the start_offset.  If two tokens have the same start offset (see
+    # position_increment=) then they are sorted by the end_offset and then
+    # lexically by the token text.
+    def <=>(o)
+      r = @start_offset <=> o.start_offset
+      return r if r != 0
+      r = @end_offset <=> o.end_offset
+      return r if r != 0
+      r = @term_text <=> o.term_text
+      return r
+    end
+
+    # Set the position increment.  This determines the position of this token
+    # relative to the previous Token in a TokenStream, used in phrase
+    # searching.
+    #
+    # The default value is one.
+    #
+    # Some common uses for this are:
+    #
+    # * Set it to zero to put multiple terms in the same position.  This is
+    #   useful if, e.g., a word has multiple stems.  Searches for phrases
+    #   including either stem will match.  In this case, all but the first
+    #   stem's increment should be set to zero: the increment of the first
+    #   instance should be one.  Repeating a token with an increment of zero
+    #   can also be used to boost the scores of matches on that token.
+    #
+    # * Set it to values greater than one to inhibit exact phrase matches.
+    #   If, for example, one does not want phrases to match across removed
+    #   stop words, then one could build a stop word filter that removes stop
+    #   words and also sets the increment to the number of stop words removed
+    #   before each non-stop word.  Then exact phrase queries will only match
+    #   when the terms occur with no intervening stop words.
+    def position_increment=(pos_inc)
+      if (pos_inc < 0)
+        raise ArgumentError, "Increment must be zero or greater: #{pos_inc}"
+      end
+      @position_increment = pos_inc
+    end
+  end
+end
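
As a concrete illustration of the position_increment semantics documented above, here is a minimal sketch (not from the gem; the token text and offsets are arbitrary) that places a synonym at the same position as the original term:

    require 'ferret'

    original = Ferret::Analysis::Token.new("quick", 4, 9)   # increment defaults to 1
    synonym  = Ferret::Analysis::Token.new("fast", 4, 9)
    synonym.position_increment = 0   # same position as "quick", so a phrase
                                     # query will match either spelling
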
data/lib/ferret/analysis/token_filters.rb
@@ -0,0 +1,86 @@
+module Ferret::Analysis
+  # A TokenFilter is a TokenStream whose input is another token stream.
+  #
+  # This is an abstract class.
+  class TokenFilter < TokenStream
+    # Close the input TokenStream.
+    def close()
+      @input.close()
+    end
+
+    protected
+    # Construct a token stream filtering the given input.
+    def initialize(input)
+      @input = input
+    end
+  end
+
+  # Normalizes token text to lower case.
+  class LowerCaseFilter < TokenFilter
+    def next()
+      t = @input.next()
+
+      if (t == nil)
+        return nil
+      end
+
+      t.term_text = t.term_text.downcase()
+
+      return t
+    end
+  end
+
+  # Removes stop words from a token stream.  You will need to pass your own
+  # set of stopwords to use this stop filter.  If you wish to use the default
+  # list of stopwords then use the StopAnalyzer.
+  class StopFilter < TokenFilter
+    # Constructs a filter which removes words from the input
+    # TokenStream that are named in the array of words.
+    def initialize(input, stop_set)
+      super(input)
+      @stop_set = stop_set
+    end
+
+    def StopFilter.new_with_file(input, path)
+      ws = WordListLoader.word_set_from_file(path)
+      return StopFilter.new(input, ws)
+    end
+
+    # Returns the next input Token whose term_text is not a stop word.
+    def next()
+      # return the first non-stop word found
+      while token = @input.next()
+        return token if ! @stop_set.include?(token.term_text)
+      end
+      return nil
+    end
+  end
+
+  # Transforms the token stream as per the Porter stemming algorithm.
+  # Note: the input to the stemming filter must already be in lower case,
+  # so you will need to use LowerCaseFilter or LowerCaseTokenizer further
+  # down the Tokenizer chain in order for this to work properly!
+  #
+  # To use this filter with other analyzers, you'll want to write an
+  # Analyzer class that sets up the TokenStream chain as you want it.
+  # To use this with LowerCaseTokenizer, for example, you'd write an
+  # analyzer like this:
+  #
+  #   class MyAnalyzer < Analyzer
+  #     def token_stream(field, reader)
+  #       return PorterStemFilter.new(LowerCaseTokenizer.new(reader))
+  #     end
+  #   end
+  class PorterStemFilter < TokenFilter
+    # Returns the next input Token, after being stemmed
+    def next()
+      token = @input.next()
+      if (token == nil)
+        return nil
+      else
+        token.term_text = Stemmable.stem_porter(token.term_text)
+      end
+      token
+    end
+  end
+end
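
Putting the filters together, a short sketch (ours, not part of this release) of a typical chain built from the classes above plus the tokenizers and word list loader defined later in this changeset:

    require 'ferret'
    include Ferret::Analysis

    stops  = WordListLoader.word_set_from_array(["the", "and"])
    stream = StopFilter.new(LowerCaseFilter.new(LetterTokenizer.new("The Quick AND the Dead")), stops)
    stream.each { |t| print t.term_text, " " }   # expected output: quick dead
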
data/lib/ferret/analysis/token_stream.rb
@@ -0,0 +1,26 @@
+module Ferret::Analysis
+  # A TokenStream enumerates the sequence of tokens, either from
+  # fields of a document or from query text.
+  #
+  # This is an abstract class.  Concrete subclasses are:
+  # * Tokenizer, a TokenStream whose input is a Reader; and
+  # * TokenFilter, a TokenStream whose input is another TokenStream.
+  class TokenStream
+    # Returns the next token in the stream, or nil at EOS.
+    def next
+      raise NotImplementedError
+    end
+
+    # Releases resources associated with this stream.
+    def close
+      raise NotImplementedError
+    end
+
+    # Iterates through the tokens in the field
+    def each # :yields: token
+      while (n = self.next())
+        yield n
+      end
+    end
+  end
+end
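
To make the contract concrete, here is a toy TokenStream subclass (ours, not part of the gem) that walks a fixed array, showing how #each simply drives #next until it returns nil:

    require 'ferret'
    include Ferret::Analysis

    class ArrayTokenStream < TokenStream
      def initialize(tokens)
        @tokens = tokens.dup
      end
      def next
        @tokens.shift          # nil once the array is empty, which ends #each
      end
      def close
        @tokens.clear
      end
    end

    ts = ArrayTokenStream.new([Token.new("hello", 0, 5), Token.new("world", 6, 11)])
    ts.each { |t| puts t.term_text }   # => hello, world
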
data/lib/ferret/analysis/tokenizers.rb
@@ -0,0 +1,107 @@
+require 'strscan'
+
+module Ferret::Analysis
+  # A Tokenizer is a TokenStream whose input is a Reader.
+  #
+  # This is an abstract class.
+  class Tokenizer < TokenStream
+    # By default, closes the input Reader.
+    def close()
+      @input.close()
+    end
+
+    protected
+    # Construct a token stream processing the given input.
+    def initialize(input)
+      @input = input
+    end
+  end
+
+  # An abstract base class for simple regular expression oriented
+  # tokenizers.  Very powerful tokenizers can be created using this class as
+  # can be seen from the StandardTokenizer class.  Below is an example of a
+  # simple implementation of a LetterTokenizer using an RETokenizer.
+  # Basically, a token is a sequence of alphabetic characters separated by
+  # one or more non-alphabetic characters.
+  #
+  #   class LetterTokenizer < RETokenizer
+  #     def token_re()
+  #       /[a-zA-Z]+/
+  #     end
+  #   end
+  class RETokenizer < Tokenizer
+
+    # Initialize with an IO-like input such as a file, or with a String.
+    #
+    # input:: must have a read(count) method which returns an array or string
+    #         of _count_ chars.
+    def initialize(input)
+      if input.is_a? String
+        @ss = StringScanner.new(input)
+      else
+        @ss = StringScanner.new(input.read())
+      end
+    end
+
+    # Returns the next token in the stream, or nil at EOS.
+    def next()
+      if @ss.scan_until(token_re)
+        term = @ss.matched
+        term_end = @ss.pos
+        term_start = term_end - term.size
+      else
+        return nil
+      end
+
+      return Token.new(normalize(term), term_start, term_end)
+    end
+
+    def close()
+      @ss = nil
+    end
+
+    protected
+    # returns the regular expression used to find the next token
+    def token_re
+      /[a-zA-Z]+/
+    end
+
+    # Called on each token to normalize it before it is added to the token
+    # stream.  The default implementation does nothing.  Subclasses may
+    # use this to, e.g., lowercase tokens.
+    def normalize(str) return str end
+  end
+
+
+  # A LetterTokenizer is a tokenizer that divides text at non-letters.
+  # That's to say, it defines tokens as maximal strings of adjacent letters,
+  # as defined by the regular expression _/[a-zA-Z]+/_.
+  class LetterTokenizer < RETokenizer
+    protected
+    # Collects only characters which satisfy the regular expression
+    # _/[a-zA-Z]+/_.
+    def token_re()
+      /[a-zA-Z]+/
+    end
+  end
+
+  # LowerCaseTokenizer performs the function of LetterTokenizer
+  # and LowerCaseFilter together.  It divides text at non-letters and converts
+  # them to lower case.
+  class LowerCaseTokenizer < LetterTokenizer
+    protected
+    def normalize(str)
+      return str.downcase
+    end
+  end
+
+  # A WhiteSpaceTokenizer is a tokenizer that divides text at whitespace.
+  # Adjacent sequences of non-whitespace characters form tokens.
+  class WhiteSpaceTokenizer < RETokenizer
+    protected
+    # Collects only characters which are not spaces, tabs or carriage returns
+    def token_re()
+      /\S+/
+    end
+  end
+end
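
The RETokenizer pattern above makes new tokenizers almost one-liners. As a hedged sketch (the NumberTokenizer name is ours, not part of the gem), a tokenizer that only keeps runs of digits:

    require 'ferret'

    class NumberTokenizer < Ferret::Analysis::RETokenizer
      protected
      # match maximal runs of digits
      def token_re
        /\d+/
      end
    end

    nt = NumberTokenizer.new("room 101, floor 3")
    while t = nt.next
      puts "#{t.term_text} [#{t.start_offset},#{t.end_offset}]"
    end
    # expected: 101 [5,8] and 3 [16,17]
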
data/lib/ferret/analysis/word_list_loader.rb
@@ -0,0 +1,27 @@
+require 'set'
+module Ferret::Analysis
+  # Loader for text files that represent a list of stopwords.
+  module WordListLoader
+    # Loads a text file and adds every line as an entry to a Set (omitting
+    # leading and trailing whitespace).  Every line of the file should contain
+    # only one word.  The words need to be in lowercase if you make use of an
+    # Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
+    #
+    # path::   path to file containing the wordlist
+    # return:: a Set holding the file's words
+    def WordListLoader.word_set_from_file(path)
+      result = Set.new()
+      File.open(path) do |word_file|
+        # strip the end of line characters and any surrounding whitespace
+        word_file.each {|line| result << line.strip }
+      end
+      return result
+    end
+
+    def WordListLoader.word_set_from_array(word_array)
+      result = Set.new()
+      word_array.each {|word| result << word }
+      return result
+    end
+  end
+end
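
Usage is straightforward; a small sketch (the stop-word file path below is hypothetical):

    require 'ferret'
    include Ferret::Analysis

    stops = WordListLoader.word_set_from_file("config/stopwords.txt")
    stops.include?("the")    # => true, provided "the" is a line in that file

    # or build the set directly in code
    stops = WordListLoader.word_set_from_array(%w(a an and of the))
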
data/lib/ferret/document.rb
@@ -0,0 +1,2 @@
+require 'ferret/document/field'
+require 'ferret/document/document'
data/lib/ferret/document/document.rb
@@ -0,0 +1,152 @@
+module Ferret::Document
+  # Documents are the unit of indexing and search.
+  #
+  # A Document is a set of fields.  Each field has a name and a textual
+  # value.  A field may be stored (see Field#stored?) with the document, in
+  # which case it is returned with search hits on the document.  Thus each
+  # document should typically contain one or more stored fields which
+  # uniquely identify it.
+  #
+  # Note that fields which are _not_ stored (Field#stored?) are _not_
+  # available in documents retrieved from the index, e.g. with Hits#doc,
+  # Searcher#doc or IndexReader#document.
+  #
+  # Several fields may be added with the same name.  In this case, if the
+  # fields are indexed, their text is treated as though appended for the
+  # purposes of search.
+  #
+  # Note that the add method, like the remove_field(s) methods, only makes
+  # sense prior to adding a document to an index.  These methods cannot be
+  # used to change the content of an existing index!  In order to achieve
+  # this, a document has to be deleted from an index and a new changed
+  # version of that document has to be added.
+  class Document
+    attr_accessor :boost
+
+    # Constructs a new document with no fields.
+    def initialize()
+      # Values are multiplied into the value of Field#boost of each field in
+      # this document.  Thus, this method in effect sets a default boost for
+      # the fields of this document.
+      #
+      # The default value is 1.0.
+      #
+      # Note: This value is not stored directly with the document in the
+      # index.  Documents returned from IndexReader#document and Hits#doc
+      # may thus not have the same value present as when this document was
+      # indexed.
+      @boost = 1.0
+      @fields = {}
+    end
+
+    # Returns an array of all fields.  Note that it is possible for two
+    # fields to appear with the same field name.  These will be concatenated
+    # in the index.
+    def all_fields
+      @fields.values.flatten
+    end
+
+    # Returns the number of distinct fields held within the document.  This
+    # counts fields which have multiple entries as one.
+    def field_count()
+      return @fields.size
+    end
+
+    # Returns the number of entries held within the document.  This counts
+    # all sections, so for fields which have multiple entries, each entry
+    # is counted.
+    def entry_count()
+      return @fields.values.flatten.size
+    end
+
+    # Adds a field to a document.  Several fields may be added with the same
+    # name.  In this case, if the fields are indexed, their text is treated
+    # as though appended for the purposes of search.
+    #
+    # Note that the add method, like the remove_field(s) methods, only makes
+    # sense prior to adding a document to an index.  These methods cannot be
+    # used to change the content of an existing index!  In order to achieve
+    # this, a document has to be deleted from an index and a new changed
+    # version of that document has to be added.
+    def add_field(field)
+      (@fields[field.name] ||= []) << field
+    end
+    alias :<< :add_field
+
+    # Removes the first field of this name if it exists.
+    def remove_field(name)
+      @fields[name].delete_at(0)
+    end
+
+    # Removes all fields with the given name from the document.
+    #
+    # If there is no field with the specified name, the document remains
+    # unchanged.
+    #
+    # Note that the remove_field(s) methods, like the add method, only make
+    # sense prior to adding a document to an index.  These methods cannot be
+    # used to change the content of an existing index!  In order to achieve
+    # this, a document has to be deleted from an index and a new changed
+    # version of that document has to be added.
+    def remove_fields(name)
+      @fields.delete(name)
+    end
+
+    # Returns the first field with the given name.
+    # This method can return _nil_.
+    #
+    # name::   the name of the field
+    # Return:: the first _Field_ with that name
+    def field(name)
+      @fields[name] ? @fields[name][0] : nil
+    end
+
+    # Returns an array of all fields with the given name.
+    # This method can return _nil_.
+    #
+    # name::   the name of the field
+    # Return:: a _Field_ array
+    def fields(name)
+      @fields[name]
+    end
+
+    # Returns all the (non-binary) values of the field with the given name,
+    # joined into a single string.  This method can return _nil_.
+    #
+    # name::   the name of the field
+    # Return:: a _String_ of field values
+    def values(name)
+      return nil if @fields[name].nil?
+      @fields[name].map {|f| f.data if not f.binary? }.join(" ")
+    end
+    alias :[] :values
+
+    # Sets the data in the field named +field_name+ to +data+.  If there is
+    # more than one field of that name then it will set the data in the first
+    # field of that name.
+    def []=(field_name, data)
+      field = field(field_name)
+      raise ArgumentError, "Field does not exist" unless field
+      field.data = data
+    end
+
+    # Returns an array of the binary values of the field with the given name.
+    # This method can return _nil_.
+    #
+    # name::   the name of the field
+    # Return:: an _Array_ of binary field values
+    def binaries(name)
+      binaries = []
+      @fields[name].each {|f| binaries << f.data if f.binary? }
+      return binaries
+    end
+
+    # Prints the fields of a document for human consumption.
+    def to_s()
+      field_str = ""
+      @fields.each_key { |name| field_str += name + " " }
+      field_str[-1] = ">"
+      return "Document<" + field_str
+    end
+  end
+end
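
Finally, a short sketch exercising the Document API above. To avoid depending on the Field class's constructor (defined in data/lib/ferret/document/field.rb, not shown in this section), it uses a stand-in object with the three attributes Document actually touches, so treat it as illustrative only:

    require 'ferret'

    # stand-in for Ferret::Document::Field; only name, data and binary? are used here
    class FakeField
      attr_reader :name
      attr_accessor :data
      def initialize(name, data)
        @name, @data = name, data
      end
      def binary?
        false
      end
    end

    doc = Ferret::Document::Document.new
    doc << FakeField.new("title", "An Index-Free Adventure")
    doc << FakeField.new("tag", "ruby")
    doc << FakeField.new("tag", "search")

    doc.field_count   # => 2  (distinct field names)
    doc.entry_count   # => 3  (every entry is counted)
    doc["tag"]        # => "ruby search"
    doc.to_s          # => "Document<title tag>" (field order may vary)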