ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,65 @@
+ if __FILE__ == $0
+   module Ferret
+   end
+   $:.unshift File.dirname(__FILE__)
+   require 'token_stream'
+   require 'tokenizers'
+   require 'token'
+ end
+
+ module Ferret::Analysis
+   # The standard tokenizer is an advanced tokenizer which tokenizes most
+   # words correctly as well as tokenizing things like email addresses, web
+   # addresses, phone numbers, etc.
+
+   class StandardTokenizer < RETokenizer
+     ALPHA = /[[:alpha:]]+/
+     APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
+     ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
+     P = /[_\/.,-]/
+     HASDIGIT = /\w*\d\w*/
+
+     protected
+
+     # Collects only characters which are not spaces, tabs or carriage returns
+     def token_re()
+       #/#{NUM}|#{EMAIL}|#{ACRONYM}\w*|#{C0MPANY}|#{APOSTROPHE}|\w+/
+       # This is a simplified version of the original Lucene standard
+       # tokenizer. I think it works better. I hope so anyway. Any way to
+       # do this more neatly?
+       /[[:alpha:]]+(('[[:alpha:]]+)+
+                     |\.([[:alpha:]]\.)+
+                     |(@|\&)\w+([-.]\w+)*
+                    )
+       |\w+(([\-._]\w+)*\@\w+([-.]\w+)+
+           |#{P}#{HASDIGIT}(#{P}\w+#{P}#{HASDIGIT})*(#{P}\w+)?
+           |(\.\w+)+
+           |
+           )
+       /x
+     end
+
+     # stem the 's and remove the '.'s from acronyms
+     def normalize(str)
+       if str =~ /^#{ACRONYM}$/
+         str.gsub!(/\./, '')
+       elsif str =~ /^#{APOSTROPHE}$/
+         str.gsub!(/'[sS]$/, '')
+       end
+       str
+     end
+   end
+ end
+
+ # Add this so we can play around with the standard tokenizer
+ if __FILE__ == $0
+   st = "\033[7m"
+   en = "\033[m"
+
+   $stdin.each do |line|
+     stk = Ferret::Analysis::StandardTokenizer.new(line)
+     while tk = stk.next()
+       puts " <" + tk.term_text + "> from " + tk.start_offset.to_s + " to " + tk.end_offset.to_s
+     end
+   end
+   end
+ end
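
The normalize hook above is what sets StandardTokenizer apart from a plain RETokenizer: acronyms lose their dots and a possessive 's is stripped. A minimal usage sketch, assuming the gem is installed and require 'ferret' pulls in the analysis classes:

    require 'ferret'

    stk = Ferret::Analysis::StandardTokenizer.new("Ask I.B.M.'s help desk or mail dave@example.com")
    while tk = stk.next()
      puts "<#{tk.term_text}> #{tk.start_offset}..#{tk.end_offset}"
    end
    # Expected, per the ACRONYM and APOSTROPHE rules above: "I.B.M." is
    # emitted as "IBM", and the email address survives as a single token.
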
@@ -0,0 +1,79 @@
+ module Ferret::Analysis
+   # A Token is an occurrence of a term from the text of a field. It consists
+   # of a term's text, the start and end offset of the term in the text of the
+   # field, and a type string.
+   #
+   # The start and end offsets permit applications to re-associate a token with
+   # its source text, e.g., to display highlighted query terms in a document
+   # browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+   # display, etc.
+   #
+   # The type is an interned string, assigned by a lexical analyzer (a.k.a.
+   # tokenizer), naming the lexical or syntactic class that the token belongs
+   # to. For example an end of sentence marker token might be implemented with
+   # type "eos". The default token type is "word".
+   #
+   # start_offset:: is the position of the first character corresponding to
+   #                this token in the source text
+   # end_offset::   is equal to one greater than the position of the last
+   #                character corresponding to this token. Note that the
+   #                difference between @end_offset and @start_offset may not be
+   #                equal to @term_text.length(), as the term text may have been
+   #                altered by a stemmer or some other filter.
+   class Token
+     include Comparable
+     attr_accessor :term_text
+     attr_reader :position_increment, :start_offset, :end_offset, :type
+
+     # Constructs a Token with the given term text, and start & end offsets.
+     # The type defaults to "word."
+     def initialize(txt, so, eo, typ="word", pos_inc=1)
+       @term_text = txt
+       @start_offset = so
+       @end_offset = eo
+       @type = typ # lexical type
+       @position_increment = pos_inc
+     end
+
+     # Tokens are sorted by the position in the text at which they occur, i.e.
+     # the start_offset. If two tokens have the same start offset (see
+     # position_increment=), then they are sorted by the end_offset and then
+     # lexically by the token text.
+     def <=>(o)
+       r = @start_offset <=> o.start_offset
+       return r if r != 0
+       r = @end_offset <=> o.end_offset
+       return r if r != 0
+       r = @term_text <=> o.term_text
+       return r
+     end
+
+     # Set the position increment. This determines the position of this token
+     # relative to the previous Token in a TokenStream, used in phrase
+     # searching.
+     #
+     # The default value is one.
+     #
+     # Some common uses for this are:
+     #
+     # * Set it to zero to put multiple terms in the same position. This is
+     #   useful if, e.g., a word has multiple stems. Searches for phrases
+     #   including either stem will match. In this case, all but the first
+     #   stem's increment should be set to zero: the increment of the first
+     #   instance should be one. Repeating a token with an increment of zero
+     #   can also be used to boost the scores of matches on that token.
+     #
+     # * Set it to values greater than one to inhibit exact phrase matches.
+     #   If, for example, one does not want phrases to match across removed
+     #   stop words, then one could build a stop word filter that removes stop
+     #   words and also sets the increment to the number of stop words removed
+     #   before each non-stop word. Then exact phrase queries will only match
+     #   when the terms occur with no intervening stop words.
+     def position_increment=(pos_inc)
+       if (pos_inc < 0)
+         raise ArgumentError, "Increment must be zero or greater: " + pos_inc.to_s
+       end
+       @position_increment = pos_inc
+     end
+   end
+ end
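
Token mixes in Comparable and exposes position_increment=, so the zero-increment trick described in the comment above can be sketched directly. The words and offsets here are made up for illustration:

    require 'ferret'
    include Ferret::Analysis

    t1 = Token.new("fast", 10, 14)
    t2 = Token.new("quick", 10, 14)  # e.g. a synonym injected by a filter
    t2.position_increment = 0        # occupies the same position as t1

    puts t1 <=> t2                   # => -1: offsets tie, so the term text decides
    puts t2.position_increment       # => 0
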
@@ -0,0 +1,86 @@
+ module Ferret::Analysis
+   # A TokenFilter is a TokenStream whose input is another token stream.
+   #
+   # This is an abstract class.
+   class TokenFilter < TokenStream
+     # Close the input TokenStream.
+     def close()
+       @input.close()
+     end
+
+     protected
+     # Construct a token stream filtering the given input.
+     def initialize(input)
+       @input = input
+     end
+   end
+
+   # Normalizes token text to lower case.
+   class LowerCaseFilter < TokenFilter
+     def next()
+       t = @input.next()
+
+       if (t == nil)
+         return nil
+       end
+
+       t.term_text = t.term_text.downcase()
+
+       return t
+     end
+   end
+
+   # Removes stop words from a token stream. You will need to pass your own
+   # set of stopwords to use this stop filter. If you wish to use the default
+   # list of stopwords then use the StopAnalyzer.
+   class StopFilter < TokenFilter
+     # Constructs a filter which removes words from the input
+     # TokenStream that are named in the array of words.
+     def initialize(input, stop_set)
+       super(input)
+       @stop_set = stop_set
+     end
+
+     def StopFilter.new_with_file(input, path)
+       ws = WordListLoader.word_set_from_file(path)
+       return StopFilter.new(input, ws)
+     end
+
+     # Returns the next input Token whose term_text is not a stop word.
+     def next()
+       # return the first non-stop word found
+       while token = @input.next()
+         return token if ! @stop_set.include?(token.term_text)
+       end
+       return nil
+     end
+   end
+
+   # Transforms the token stream as per the Porter stemming algorithm.
+   # Note: the input to the stemming filter must already be in lower case,
+   # so you will need to use LowerCaseFilter or LowerCaseTokenizer further
+   # down the Tokenizer chain in order for this to work properly!
+   #
+   # To use this filter with other analyzers, you'll want to write an
+   # Analyzer class that sets up the TokenStream chain as you want it.
+   # To use this with LowerCaseTokenizer, for example, you'd write an
+   # analyzer like this:
+   #
+   #   class MyAnalyzer < Analyzer
+   #     def token_stream(field, reader)
+   #       return PorterStemFilter.new(LowerCaseTokenizer.new(reader))
+   #     end
+   #   end
+   class PorterStemFilter < TokenFilter
+     # Returns the next input Token, after being stemmed
+     def next()
+       token = @input.next()
+       if (token == nil)
+         return nil
+       else
+         token.term_text = Stemmable.stem_porter(token.term_text)
+       end
+       token
+     end
+   end
+ end
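
Because every filter is itself a TokenStream, analyzers are built by nesting constructors. A sketch of a lower-case, stop-word, Porter-stemming chain; the two-word stop list is made up, and require 'ferret' is assumed to load the analysis and stemmer code:

    require 'ferret'
    include Ferret::Analysis

    stop_set = WordListLoader.word_set_from_array(["the", "and"])
    ts = PorterStemFilter.new(
           StopFilter.new(
             LowerCaseFilter.new(WhiteSpaceTokenizer.new("The Runners and the Walkers")),
             stop_set))
    ts.each {|token| puts token.term_text }   # expected: "runner", then "walker"
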
@@ -0,0 +1,26 @@
+ module Ferret::Analysis
+   # A TokenStream enumerates the sequence of tokens, either from
+   # fields of a document or from query text.
+   #
+   # This is an abstract class. Concrete subclasses are:
+   # * Tokenizer, a TokenStream whose input is a Reader; and
+   # * TokenFilter, a TokenStream whose input is another TokenStream.
+   class TokenStream
+     # Returns the next token in the stream, or nil at EOS.
+     def next
+       raise NotImplementedError
+     end
+
+     # Releases resources associated with this stream.
+     def close
+       raise NotImplementedError
+     end
+
+     # Iterates through the tokens in the field.
+     def each # :yields: token
+       while (n = self.next())
+         yield n
+       end
+     end
+   end
+ end
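
TokenStream itself is only a contract: next() returns a Token or nil at end of stream, and each() is built on top of next(). A toy, purely illustrative subclass (ArrayTokenStream is not part of the package):

    require 'ferret'
    include Ferret::Analysis

    # Serves tokens from an in-memory array; nil from Array#shift marks EOS.
    class ArrayTokenStream < TokenStream
      def initialize(tokens)
        @tokens = tokens.dup
      end

      def next
        @tokens.shift
      end

      def close
        @tokens = nil
      end
    end

    ArrayTokenStream.new([Token.new("hello", 0, 5), Token.new("world", 6, 11)]).each do |t|
      puts t.term_text
    end
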
@@ -0,0 +1,107 @@
+ require 'strscan'
+
+ module Ferret::Analysis
+   # A Tokenizer is a TokenStream whose input is a Reader.
+   #
+   # This is an abstract class.
+   class Tokenizer < TokenStream
+     # By default, closes the input Reader.
+     def close()
+       @input.close()
+     end
+
+     protected
+     # Construct a token stream processing the given input.
+     def initialize(input)
+       @input = input
+     end
+   end
+
+   # An abstract base class for simple regular expression oriented
+   # tokenizers. Very powerful tokenizers can be created using this class as
+   # can be seen from the StandardTokenizer class. Below is an example of a
+   # simple implementation of a LetterTokenizer using an RETokenizer.
+   # Basically, a token is a sequence of alphabetic characters separated by
+   # one or more non-alphabetic characters.
+   #
+   #   class LetterTokenizer < RETokenizer
+   #     def token_re()
+   #       /[a-zA-Z]+/
+   #     end
+   #   end
+   class RETokenizer < Tokenizer
+
+     # Initialize with an IO implementing input such as a file.
+     #
+     # input:: must have a read(count) method which returns an array or string
+     #         of _count_ chars.
+     def initialize(input)
+       if input.is_a? String
+         @ss = StringScanner.new(input)
+       else
+         @ss = StringScanner.new(input.read())
+       end
+     end
+
+     # Returns the next token in the stream, or nil at EOS.
+     def next()
+       if @ss.scan_until(token_re)
+         term = @ss.matched
+         term_end = @ss.pos
+         term_start = term_end - term.size
+       else
+         return nil
+       end
+
+       return Token.new(normalize(term), term_start, term_end)
+     end
+
+     def close()
+       @ss = nil
+     end
+
+     protected
+     # returns the regular expression used to find the next token
+     def token_re
+       /[a-zA-Z]+/
+     end
+
+     # Called on each token to normalize it before it is added to the
+     # token stream. The default implementation does nothing. Subclasses
+     # may use this to, e.g., lowercase tokens.
+     def normalize(str) return str end
+   end
+
+
+   # A LetterTokenizer is a tokenizer that divides text at non-letters.
+   # That's to say, it defines tokens as maximal strings of adjacent letters,
+   # as defined by the regular expression _/[a-zA-Z]+/_.
+   class LetterTokenizer < RETokenizer
+     protected
+     # Collects only characters which satisfy the regular expression
+     # _/[a-zA-Z]+/_.
+     def token_re()
+       /[a-zA-Z]+/
+     end
+   end
+
+   # LowerCaseTokenizer performs the function of LetterTokenizer
+   # and LowerCaseFilter together. It divides text at non-letters and converts
+   # the tokens to lower case.
+   class LowerCaseTokenizer < LetterTokenizer
+     protected
+     def normalize(str)
+       return str.downcase
+     end
+   end
+
+   # A WhiteSpaceTokenizer is a tokenizer that divides text at whitespace.
+   # Adjacent sequences of non-whitespace characters form tokens.
+   class WhiteSpaceTokenizer < RETokenizer
+     protected
+     # Collects only characters which are not spaces, tabs or carriage returns
+     def token_re()
+       /\S+/
+     end
+   end
+ end
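
Subclassing RETokenizer only requires overriding token_re (and optionally normalize). A hypothetical digit-only tokenizer, for illustration:

    require 'ferret'

    class NumberTokenizer < Ferret::Analysis::RETokenizer
      protected
        def token_re
          /\d+/
        end
    end

    NumberTokenizer.new("call 555 1234 now").each do |tk|
      puts "#{tk.term_text} @ #{tk.start_offset}"   # 555 @ 5, then 1234 @ 9
    end
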
@@ -0,0 +1,27 @@
+ require 'set'
+ module Ferret::Analysis
+   # Loader for text files that represent a list of stopwords.
+   module WordListLoader
+     # Loads a text file and adds every line as an entry to a Set (omitting
+     # leading and trailing whitespace). Every line of the file should contain
+     # only one word. The words need to be in lowercase if you make use of an
+     # Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
+     #
+     # path::   path to file containing the wordlist
+     # return:: a Set with the file's words
+     def WordListLoader.word_set_from_file(path)
+       result = Set.new()
+       File.open(path) do |word_file|
+         # we have to strip the end of line characters
+         word_file.each {|line| result << line[0..-2] }
+       end
+       return result
+     end
+
+     def WordListLoader.word_set_from_array(word_array)
+       result = Set.new()
+       word_array.each {|word| result << word }
+       return result
+     end
+   end
+ end
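
word_set_from_file expects one word per line and strips the trailing newline from each line before adding it to the set. A small sketch; the stopwords.txt file name is made up:

    require 'ferret'

    File.open("stopwords.txt", "w") {|f| f.puts("the", "and", "of") }
    stop_set = Ferret::Analysis::WordListLoader.word_set_from_file("stopwords.txt")
    puts stop_set.include?("and")    # => true
    puts stop_set.size               # => 3
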
@@ -0,0 +1,2 @@
+ require 'ferret/document/field'
+ require 'ferret/document/document'
@@ -0,0 +1,152 @@
+ module Ferret::Document
+   # Documents are the unit of indexing and search.
+   #
+   # A Document is a set of fields. Each field has a name and a textual
+   # value. A field may be Field#stored?() with the document, in which case
+   # it is returned with search hits on the document. Thus each document
+   # should typically contain one or more stored fields which uniquely
+   # identify it.
+   #
+   # Note that fields which are _not_ Field#stored?() are _not_ available in
+   # documents retrieved from the index, e.g. with Hits#doc, Searcher#doc or
+   # IndexReader#document.
+   #
+   # Several fields may be added with the same name. In this case, if the
+   # fields are indexed, their text is treated as though appended for the
+   # purposes of search.
+   #
+   # Note that the add method, like the remove_field(s) methods, only makes
+   # sense prior to adding a document to an index. These methods cannot be
+   # used to change the content of an existing index! In order to achieve
+   # this, a document has to be deleted from an index and a new changed
+   # version of that document has to be added.
+   class Document
+     attr_accessor :boost
+
+     # Constructs a new document with no fields.
+     def initialize()
+       # Values are multiplied into the value of Field#boost of each field in
+       # this document. Thus, this method in effect sets a default boost for
+       # the fields of this document.
+       #
+       # The default value is 1.0.
+       #
+       # Note: This value is not stored directly with the document in the
+       # index. Documents returned from IndexReader#document and Hits#doc
+       # may thus not have the same value present as when this document was
+       # indexed.
+       @boost = 1.0
+       @fields = {}
+     end
+
+     # Returns an array of all fields. Note that it is possible for two
+     # fields to appear with the same field name. These will be concatenated
+     # in the index.
+     def all_fields
+       @fields.values.flatten
+     end
+
+     # Returns the number of distinct fields held within the document. This
+     # counts fields which have multiple entries as one.
+     def field_count()
+       return @fields.size
+     end
+
+     # Returns the number of entries held within the document. This counts
+     # all entries, so for fields which have multiple entries, each entry
+     # is counted.
+     def entry_count()
+       return @fields.values.flatten.size
+     end
+
+     # Adds a field to a document. Several fields may be added with the same
+     # name. In this case, if the fields are indexed, their text is treated
+     # as though appended for the purposes of search.
+     #
+     # Note that the add method, like the remove_field(s) methods, only makes
+     # sense prior to adding a document to an index. These methods cannot be
+     # used to change the content of an existing index! In order to achieve
+     # this, a document has to be deleted from an index and a new changed
+     # version of that document has to be added.
+     def add_field(field)
+       (@fields[field.name] ||= []) << field
+     end
+     alias :<< :add_field
+
+     # Removes the first field of this name if it exists.
+     def remove_field(name)
+       @fields[name].delete_at(0)
+     end
+
+     # Removes all fields with the given name from the document.
+     #
+     # If there is no field with the specified name, the document remains
+     # unchanged.
+     #
+     # Note that the remove_field(s) methods, like the add method, only make
+     # sense prior to adding a document to an index. These methods cannot be
+     # used to change the content of an existing index! In order to achieve
+     # this, a document has to be deleted from an index and a new changed
+     # version of that document has to be added.
+     def remove_fields(name)
+       @fields.delete(name)
+     end
+
+     # Returns the first field with the given name.
+     # This method can return _nil_.
+     #
+     # name::   the name of the field
+     # Return:: the first _Field_ with that name, or nil
+     def field(name)
+       @fields[name] ? @fields[name][0] : nil
+     end
+
+     # Returns an array of all fields with the given name.
+     # This method can return _nil_.
+     #
+     # name::   the name of the field
+     # Return:: a _Field_ array
+     def fields(name)
+       @fields[name]
+     end
+
+     # Returns the values of the field specified as the method parameter,
+     # joined into a single string. This method can return _nil_.
+     #
+     # name::   the name of the field
+     # Return:: a _String_ of field values
+     def values(name)
+       return nil if @fields[name].nil?
+       @fields[name].map {|f| f.data if not f.binary? }.join(" ")
+     end
+     alias :[] :values
+
+     # Sets the data in field +field_name+ to +data+. If there is more than
+     # one field of that name then it will set the data in the first field
+     # of that name.
+     def []=(field_name, data)
+       field = field(field_name)
+       raise ArgumentError, "Field does not exist" unless field
+       field.data = data
+     end
+
+     # Returns an array of the binary data of the fields with the given
+     # name. This method can return _nil_.
+     #
+     # name::   the name of the field
+     # Return:: an _Array_ of binary field values
+     def binaries(name)
+       binaries = []
+       @fields[name].each {|f| binaries << f.data if f.binary? }
+       return binaries
+     end
+
+     # Prints the fields of a document for human consumption.
+     def to_s()
+       field_str = ""
+       @fields.each_key { |name| field_str += name + " " }
+       field_str[-1] = ">"
+       return "Document<" + field_str
+     end
+   end
+ end
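
Document only relies on a field object that responds to name, data and binary?; the real Field class (with its store/index options) lives in data/lib/ferret/document/field.rb, which is not shown in this hunk, so the stand-in class below is purely illustrative:

    require 'ferret'

    # Illustrative stand-in for Ferret::Document::Field.
    class FakeField
      attr_reader :name
      attr_accessor :data
      def initialize(name, data)
        @name, @data = name, data
      end
      def binary?() false end
    end

    doc = Ferret::Document::Document.new
    doc << FakeField.new("title", "The Pickwick Papers")
    doc << FakeField.new("author", "Charles Dickens")
    doc << FakeField.new("author", "Boz")

    puts doc.field_count    # => 2 distinct field names
    puts doc.entry_count    # => 3 entries in total
    puts doc["author"]      # => "Charles Dickens Boz"
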