ferret 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
 - data/README +6 -5
 - data/Rakefile +34 -13
 - data/TODO +1 -0
 - data/TUTORIAL +1 -1
 - data/ext/analysis.c +87 -70
 - data/ext/analysis.h +18 -6
 - data/ext/array.c +1 -2
 - data/ext/array.h +1 -1
 - data/ext/bitvector.c +10 -6
 - data/ext/bitvector.h +2 -2
 - data/ext/compound_io.c +30 -27
 - data/ext/document.c +15 -15
 - data/ext/document.h +5 -5
 - data/ext/except.c +2 -0
 - data/ext/except.h +25 -23
 - data/ext/extconf.rb +1 -0
 - data/ext/ferret.c +10 -8
 - data/ext/ferret.h +9 -8
 - data/ext/field.c +29 -25
 - data/ext/filter.c +52 -14
 - data/ext/frtio.h +13 -0
 - data/ext/fs_store.c +115 -170
 - data/ext/global.c +9 -8
 - data/ext/global.h +17 -13
 - data/ext/hash.c +13 -19
 - data/ext/hash.h +11 -11
 - data/ext/hashset.c +5 -7
 - data/ext/hashset.h +9 -8
 - data/ext/helper.c +1 -1
 - data/ext/helper.h +2 -1
 - data/ext/inc/except.h +25 -23
 - data/ext/inc/lang.h +11 -1
 - data/ext/ind.c +33 -21
 - data/ext/index.h +44 -39
 - data/ext/index_io.c +61 -57
 - data/ext/index_rw.c +418 -361
 - data/ext/lang.c +10 -0
 - data/ext/lang.h +11 -1
 - data/ext/nix_io.c +135 -0
 - data/ext/priorityqueue.c +16 -16
 - data/ext/priorityqueue.h +9 -6
 - data/ext/q_boolean.c +128 -76
 - data/ext/q_const_score.c +20 -20
 - data/ext/q_filtered_query.c +20 -20
 - data/ext/q_fuzzy.c +37 -23
 - data/ext/q_match_all.c +15 -19
 - data/ext/q_multi_phrase.c +87 -46
 - data/ext/q_parser.c +247 -119
 - data/ext/q_phrase.c +86 -52
 - data/ext/q_prefix.c +25 -14
 - data/ext/q_range.c +59 -14
 - data/ext/q_span.c +263 -172
 - data/ext/q_term.c +62 -51
 - data/ext/q_wildcard.c +24 -13
 - data/ext/r_analysis.c +328 -80
 - data/ext/r_doc.c +11 -6
 - data/ext/r_index_io.c +40 -32
 - data/ext/r_qparser.c +15 -14
 - data/ext/r_search.c +270 -152
 - data/ext/r_store.c +32 -17
 - data/ext/ram_store.c +38 -22
 - data/ext/search.c +617 -87
 - data/ext/search.h +227 -163
 - data/ext/similarity.c +54 -45
 - data/ext/similarity.h +3 -3
 - data/ext/sort.c +132 -53
 - data/ext/store.c +21 -2
 - data/ext/store.h +14 -14
 - data/ext/tags +4322 -232
 - data/ext/term.c +140 -109
 - data/ext/termdocs.c +74 -60
 - data/ext/vector.c +181 -152
 - data/ext/w32_io.c +150 -0
 - data/lib/ferret.rb +1 -1
 - data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
 - data/lib/ferret/document/field.rb +1 -1
 - data/lib/ferret/index/field_infos.rb +1 -1
 - data/lib/ferret/index/term.rb +1 -1
 - data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
 - data/lib/ferret/search.rb +1 -0
 - data/lib/ferret/search/boolean_query.rb +0 -4
 - data/lib/ferret/search/index_searcher.rb +21 -8
 - data/lib/ferret/search/multi_phrase_query.rb +7 -0
 - data/lib/ferret/search/multi_searcher.rb +261 -0
 - data/lib/ferret/search/phrase_query.rb +1 -1
 - data/lib/ferret/search/query.rb +34 -5
 - data/lib/ferret/search/sort.rb +7 -3
 - data/lib/ferret/search/sort_field.rb +8 -4
 - data/lib/ferret/store/fs_store.rb +13 -6
 - data/lib/ferret/store/index_io.rb +0 -14
 - data/lib/ferret/store/ram_store.rb +3 -2
 - data/lib/rferret.rb +1 -1
 - data/test/unit/analysis/ctc_analyzer.rb +131 -0
 - data/test/unit/analysis/ctc_tokenstream.rb +98 -9
 - data/test/unit/index/tc_index.rb +40 -1
 - data/test/unit/index/tc_term.rb +7 -0
 - data/test/unit/index/th_doc.rb +8 -0
 - data/test/unit/query_parser/tc_query_parser.rb +6 -4
 - data/test/unit/search/rtc_sort_field.rb +6 -6
 - data/test/unit/search/tc_index_searcher.rb +8 -0
 - data/test/unit/search/tc_multi_searcher.rb +275 -0
 - data/test/unit/search/tc_multi_searcher2.rb +126 -0
 - data/test/unit/search/tc_search_and_sort.rb +66 -0
 - metadata +31 -26
 - data/test/unit/query_parser/rtc_query_parser.rb +0 -138
 
    
data/lib/ferret/search/query.rb    CHANGED

@@ -62,15 +62,44 @@ module Ferret::Search
 
     # Expert: called when re-writing queries under MultiSearcher.
     #
-    #
-    #
+    # Create a single query suitable for use by all subsearchers (in 1-1
+    # correspondence with queries). This is an optimization of the OR of
+    # all queries. We handle the common optimization cases of equal
+    # queries and overlapping clauses of boolean OR queries (as generated
+    # by MultiTermQuery.rewrite() and RangeQuery.rewrite()).
+    # Be careful overriding this method as queries[0] determines which
+    # method will be called and is not necessarily of the same type as
+    # the other queries.
     def combine(queries)
+      uniques = Set.new
       queries.each do |query|
-
-
+        clauses = []
+        # check if we can split the query into clauses
+        splittable = query.respond_to? :clauses
+        if splittable
+          splittable = query.coord_disabled?
+          clauses = query.clauses
+          clauses.each do |clause|
+            splittable = clause.occur == BooleanClause::Occur::SHOULD
+            break unless splittable
+          end
+        end
+        if splittable
+          clauses.each { |clause| uniques << clause.query }
+        else
+          uniques << query
        end
      end
-      return
+      # optimization: if we have just one query, just return it
+      if uniques.size == 1
+        uniques.each { |query| return query }
+      end
+
+      result = BooleanQuery.new(true)
+      uniques.each do |query|
+        result.add_query(query, BooleanClause::Occur::SHOULD)
+      end
+      return result
     end
 
     # Expert: adds all terms occuring in this query to the terms set
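The rewritten Query#combine above is what lets MultiSearcher fold the per-index rewrites of a query back into one query: identical queries collapse to a single instance, and the SHOULD clauses of coord-disabled BooleanQuerys (the shape produced by MultiTermQuery and RangeQuery rewriting) are merged into one OR query. A minimal usage sketch, assuming TermQuery, Term and BooleanQuery keep their usual Ferret::Search / Ferret::Index constructors (the field and term values here are made up):

    require 'ferret'
    include Ferret::Search
    include Ferret::Index

    q_ruby = TermQuery.new(Term.new("content", "ruby"))
    q_gem  = TermQuery.new(Term.new("content", "gem"))

    # identical queries collapse to the query itself
    q_ruby.combine([q_ruby, q_ruby])   # => q_ruby

    # coord-disabled OR queries have their SHOULD clauses merged into a
    # single BooleanQuery holding the unique sub-queries
    b1 = BooleanQuery.new(true)
    b1.add_query(q_ruby, BooleanClause::Occur::SHOULD)
    b2 = BooleanQuery.new(true)
    b2.add_query(q_gem, BooleanClause::Occur::SHOULD)
    b1.combine([b1, b2])               # => a BooleanQuery ORing q_ruby and q_gem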
    
data/lib/ferret/search/sort.rb    CHANGED

@@ -84,8 +84,12 @@ module Ferret::Search
       fields = fields.map {|field| field.is_a?(Symbol) ? field.to_s : field}
       if fields[0].is_a?(String)
         @fields = fields.map do |field|
-
-
+          if (field.is_a?(String))
+            next SortField.new(field, {:sort_type => SortField::SortType::AUTO,
+                                       :reverse => reverse})
+          else
+            next field
+          end
         end
       end
       doc_sort_added = false
@@ -102,7 +106,7 @@ module Ferret::Search
     INDEX_ORDER = Sort.new(SortField::FIELD_DOC)
 
     def to_s()
-      return @fields.map {|field| "#{field}"}.join(", ")
+      return "Sort[" + @fields.map {|field| "#{field}"}.join(", ") + "]"
     end
   end
 end
data/lib/ferret/search/sort_field.rb    CHANGED

@@ -20,11 +20,11 @@ module Ferret::Search
 
       # Sort by document score (relevancy).  Sort values are Float and higher
       # values are at the front.
-      SCORE = SortType.new("
+      SCORE = SortType.new("SCORE")
 
       # Sort by document number (order).  Sort values are Integer and lower
       # values are at the front.
-      DOC = SortType.new("
+      DOC = SortType.new("DOC")
 
       # Guess sort type of sort based on field contents. We try parsing the
       # field as an integer and then as a floating point number. If we are
@@ -37,7 +37,7 @@ module Ferret::Search
 
       # Sort using term values as encoded Integers.  Sort values are Integer
       # and lower values are at the front.
-      INTEGER = SortType.new("
+      INTEGER = SortType.new("integer", lambda{|str| str.to_i})
 
       # Sort using term values as encoded Floats.  Sort values are Float and
       # lower values are at the front.
@@ -79,7 +79,11 @@ module Ferret::Search
     FIELD_DOC = SortField.new(nil, {:sort_type => SortType::DOC})
 
     def to_s()
-
+      if @name
+        buffer = "#@name:<#@sort_type>"
+      else
+        buffer = "<#{@sort_type}>"
+      end
       buffer << '!' if @reverse
       return buffer
     end
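Together these hunks mean a Sort built from bare field names wraps each name in an AUTO-typed SortField, and both Sort#to_s and SortField#to_s now produce readable output. A small sketch, assuming Sort.new accepts an array of field names plus an optional reverse flag, as the constructor code above implies (the :date, :title and "count" fields are illustrative):

    include Ferret::Search

    sort = Sort.new([:date, :title])   # symbols become strings, then AUTO SortFields
    sort.to_s                          # => something like "Sort[date:<auto>, title:<auto>, <DOC>]"

    # an explicit SortField: descending integer sort on a hypothetical "count" field
    count = SortField.new("count", {:sort_type => SortField::SortType::INTEGER,
                                    :reverse   => true})
    count.to_s                         # => "count:<integer>!" given the to_s above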
data/lib/ferret/store/fs_store.rb    CHANGED

@@ -102,13 +102,13 @@ module Ferret::Store
         # delete all the files
         refresh_dir
         each do |fname|
-
+          FileUtils.rm_rf(dir_path(fname))
         end
         # clear all the locks
         refresh_lock_dir
         @lock_dir.each do |lock_fname|
           next if lock_fname == '.' or lock_fname == '..'
-
+          FileUtils.rm_rf(@lock_dir.path + '/' + lock_fname)
         end
       end
     end
@@ -159,7 +159,13 @@ module Ferret::Store
     # This replacement should be atomic.
     def rename(from, to)
       synchronize do
-
+        begin
+          File.rename(dir_path(from), dir_path(to))
+        rescue
+          # try again, this time forcing the delete
+          FileUtils.rm_rf(dir_path(to))
+          FileUtils.cp(dir_path(from), dir_path(to))
+        end
       end
     end
 
@@ -208,11 +214,11 @@ module Ferret::Store
       def initialize(lock_file)
         @lock_file = lock_file
         #@clean = FSLock.make_finalizer(lock_file)
-        @clean = lambda {
+        @clean = lambda { FileUtils.rm_rf(lock_file)}
       end
 
       def FSLock.make_finalizer(lock_file)
-        lambda {
+        lambda { FileUtils.rm_rf(lock_file)}
       end
 
       # obtain the lock on the data source
@@ -238,7 +244,7 @@ module Ferret::Store
       def release
         return if FSDirectory.locks_disabled?
         begin
-
+          FileUtils.rm_rf(@lock_file)
           ObjectSpace.undefine_finalizer(self)
         rescue SystemCallError
           # maybe we tried to release a lock that wasn't locked. This
@@ -364,6 +370,7 @@ module Ferret::Store
       # This method is only used by the c extension to free the directory
       def close_internal
       end
+
       #end private
     end
 end
data/lib/ferret/store/index_io.rb    CHANGED

@@ -213,20 +213,6 @@ module Ferret::Store
       last = start + length
       (start ... last).each do |i|
         write_byte(buf[i])
-#          code = buf[i]
-#          if code >= 0x01 and code <= 0x7F
-#            write_byte(code)
-#          else
-#            # We need to write unicode characters. ToDo: test that this works.
-#            if code > 0x80 and code <= 0x7FF or code == 0
-#              write_byte(0xC0 | code >> 6)
-#              write_byte(0x80 | code & 0x3F)
-#            else
-#              write_byte(0xE0 | (code >> 12))
-#              write_byte(0x80 | ((code >> 6) & 0x3F))
-#              write_byte(0x80 | (code & 0x3F))
-#            end
-#          end
       end
     end
 
data/lib/ferret/store/ram_store.rb    CHANGED

@@ -159,9 +159,10 @@ module Ferret::Store
         flush()
         last_buffer_number = (@file.length / BUFFER_SIZE).to_i
         last_buffer_offset = @file.length % BUFFER_SIZE
-
+
+        (0..last_buffer_number).each do |i|
           len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE)
-          output.write_bytes(
+          output.write_bytes(@file.buffers[i], len)
         end
       end
 
data/lib/rferret.rb    CHANGED

data/test/unit/analysis/ctc_analyzer.rb    CHANGED

@@ -399,3 +399,134 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
     assert(! t.next())
   end
 end
+
+class RegExpAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_reg_exp_analyzer()
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    a = RegExpAnalyzer.new()
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one_Two three")
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    t = t2
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    a = RegExpAnalyzer.new(/\w{2,}/, false)
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one Two three")
+    assert_equal(Token.new('DBalmain', 0, 8), t.next)
+    assert_equal(Token.new('gmail', 9, 14), t.next)
+    assert_equal(Token.new('com', 15, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('mail', 27, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http', 55, 59), t.next)
+    assert_equal(Token.new('www', 62, 65), t.next)
+    assert_equal(Token.new('google', 66, 72), t.next)
+    assert_equal(Token.new('com', 73, 76), t.next)
+    assert_equal(Token.new('RESULT_3', 77, 85), t.next)
+    assert_equal(Token.new('html', 86, 90), t.next)
+    assert_equal(Token.new('123', 98, 101), t.next)
+    assert_equal(Token.new('1235', 102, 106), t.next)
+    assert_equal(Token.new('ASD', 107, 110), t.next)
+    assert_equal(Token.new('1234', 111, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('Rob', 119, 122), t.next)
+    assert(! t.next())
+    assert_equal(Token.new("one", 0, 3), t2.next())
+    assert_equal(Token.new("Two", 4, 7), t2.next())
+    assert_equal(Token.new("three", 8, 13), t2.next())
+    assert(! t2.next())
+    a = RegExpAnalyzer.new() do |str|
+      if str =~ /^[[:alpha:]]\.([[:alpha:]]\.)+$/
+        str.gsub!(/\./, '')
+      elsif str =~ /'[sS]$/
+        str.gsub!(/'[sS]$/, '')
+      end
+      str
+    end
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one's don't T.N.T.")
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('tnt', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob', 119, 124), t.next)
+    assert(! t.next())
+    assert_equal(Token.new("one", 0, 5), t2.next())
+    assert_equal(Token.new("don't", 6, 11), t2.next())
+    assert_equal(Token.new("tnt", 12, 18), t2.next())
+    assert(! t2.next())
+  end
+end
+
+module Ferret::Analysis
+  class StemmingStandardAnalyzer < StandardAnalyzer
+    def token_stream(field, text)
+      StemFilter.new(super)
+    end
+  end
+end
+
+class CustomAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_custom_filter()
+    input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    a = StemmingStandardAnalyzer.new()
+    t = a.token_stream("fieldname", input)
+    assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('www.google.com/result', 55, 84), t.next)
+    assert_equal(Token.new('tnt', 86, 91), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+    assert_equal(Token.new('23', 111, 113), t.next)
+    assert_equal(Token.new('áägç', 117, 124), t.next)
+    assert_equal(Token.new('êëì', 126, 132), t.next)
+    assert_equal(Token.new('úøã', 134, 140), t.next)
+    assert_equal(Token.new('öîí', 142, 148), t.next)
+    assert(! t.next())
+    input = "Debate Debates DEBATED DEBating Debater";
+    t = a.token_stream("fieldname", input)
+    assert_equal(Token.new("debat", 0, 6), t.next)
+    assert_equal(Token.new("debat", 7, 14), t.next)
+    assert_equal(Token.new("debat", 15, 22), t.next)
+    assert_equal(Token.new("debat", 23, 31), t.next)
+    assert_equal(Token.new("debat", 32, 39), t.next)
+    assert(! t.next())
+    input = "Dêbate dêbates DÊBATED DÊBATing dêbater";
+    t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
+    assert_equal(Token.new("dêbate", 0, 7), t.next)
+    assert_equal(Token.new("dêbate", 8, 16), t.next)
+    assert_equal(Token.new("dêbate", 17, 25), t.next)
+    assert_equal(Token.new("dêbate", 26, 35), t.next)
+    assert_equal(Token.new("dêbater", 36, 44), t.next)
+    assert(! t.next())
+  end
+end
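The new tests above exercise RegExpAnalyzer (and the matching RegExpTokenizer further down), which accept an optional token pattern and an optional block that post-processes each match, so a custom analyzer no longer needs its own tokenizer class. A minimal sketch of the block form outside a test case, reusing the acronym/possessive rules asserted above (the field name and sample text are made up):

    require 'ferret'
    include Ferret::Analysis

    # strip the dots from acronyms and the trailing 's from possessives
    analyzer = RegExpAnalyzer.new() do |str|
      if str =~ /^[[:alpha:]]\.([[:alpha:]]\.)+$/
        str.gsub!(/\./, '')
      elsif str =~ /'[sS]$/
        str.gsub!(/'[sS]$/, '')
      end
      str
    end

    stream = analyzer.token_stream('content', "T.N.T. is Rob's favourite")
    while token = stream.next
      puts token.text
    end
    # expected to print: tnt, is, rob, favourite (lower-cased by the default analyzer)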
data/test/unit/analysis/ctc_tokenstream.rb    CHANGED

@@ -205,7 +205,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
   include Ferret::Analysis
 
   def test_standard_tokenizer()
-    input = 'DBalmán@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/
+    input = 'DBalmán@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     t = StandardTokenizer.new(input)
     assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
     assert_equal(Token.new('is', 19, 21), t.next)
@@ -214,7 +214,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('Address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/
+    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
     assert_equal(Token.new('TNT', 86, 91), t.next)
     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -235,7 +235,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/
+    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
     assert_equal(Token.new('tnt', 86, 91), t.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -247,6 +247,97 @@ class StandardTokenizerTest < Test::Unit::TestCase
   end
 end
 
+class RegExpTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  ALPHA      = /[[:alpha:]_-]+/
+  APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
+  ACRONYM    = /#{ALPHA}\.(#{ALPHA}\.)+/
+  ACRONYM_WORD    = /^#{ACRONYM}$/
+  APOSTROPHE_WORD = /^#{APOSTROPHE}$/
+
+  def test_reg_exp_tokenizer()
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    t = RegExpTokenizer.new(input)
+    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/RESULT_3.html', 55, 90), t.next)
+    assert_equal(Token.new('T.N.T.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-ASD-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('Rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = LowerCaseFilter.new(RegExpTokenizer.new(input))
+    t2 = LowerCaseFilter.new(RegExpTokenizer.new(input, /\w{2,}/))
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    assert_equal(Token.new('dbalmain', 0, 8), t2.next)
+    assert_equal(Token.new('gmail', 9, 14), t2.next)
+    assert_equal(Token.new('com', 15, 18), t2.next)
+    assert_equal(Token.new('is', 19, 21), t2.next)
+    assert_equal(Token.new('my', 22, 24), t2.next)
+    assert_equal(Token.new('mail', 27, 31), t2.next)
+    assert_equal(Token.new('52', 32, 34), t2.next)
+    assert_equal(Token.new('address', 40, 47), t2.next)
+    assert_equal(Token.new('23', 49, 51), t2.next)
+    assert_equal(Token.new('http', 55, 59), t2.next)
+    assert_equal(Token.new('www', 62, 65), t2.next)
+    assert_equal(Token.new('google', 66, 72), t2.next)
+    assert_equal(Token.new('com', 73, 76), t2.next)
+    assert_equal(Token.new('result_3', 77, 85), t2.next)
+    assert_equal(Token.new('html', 86, 90), t2.next)
+    assert_equal(Token.new('123', 98, 101), t2.next)
+    assert_equal(Token.new('1235', 102, 106), t2.next)
+    assert_equal(Token.new('asd', 107, 110), t2.next)
+    assert_equal(Token.new('1234', 111, 115), t2.next)
+    assert_equal(Token.new('23', 116, 118), t2.next)
+    assert_equal(Token.new('rob', 119, 122), t2.next)
+    assert(! t2.next())
+    t = RegExpTokenizer.new(input) do |str|
+      if str =~ ACRONYM_WORD
+        str.gsub!(/\./, '')
+      elsif str =~ APOSTROPHE_WORD
+        str.gsub!(/'[sS]$/, '')
+      end
+      str
+    end
+    t = LowerCaseFilter.new(t)
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('tnt', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob', 119, 124), t.next)
+    assert(! t.next())
+  end
+end
+
 class StopFilterTest < Test::Unit::TestCase
   include Ferret::Analysis
 
@@ -383,11 +474,9 @@ module Ferret::Analysis
   def next()
     t = @input.next()
 
-    if (t
-      return nil
-    end
+    return nil if (t.nil?)
 
-    t.text = t.text
+    t.text = t.text.capitalize
 
     return t
   end
@@ -402,7 +491,7 @@ class CustomFilterTest < Test::Unit::TestCase
     t = CapitalizeFilter.new(AsciiLetterTokenizer.new(input))
     assert_equal(Token.new("This", 0, 4), t.next)
     assert_equal(Token.new("Text", 5, 9), t.next)
-    assert_equal(Token.new("
+    assert_equal(Token.new("Should", 10, 16), t.next)
     assert_equal(Token.new("Be", 17, 19), t.next)
     assert_equal(Token.new("Capitalized", 20, 31), t.next)
    assert_equal(Token.new("I", 36, 37), t.next)
@@ -412,7 +501,7 @@ class CustomFilterTest < Test::Unit::TestCase
     t = StemFilter.new(CapitalizeFilter.new(AsciiLetterTokenizer.new(input)))
     assert_equal(Token.new("This", 0, 4), t.next)
     assert_equal(Token.new("Text", 5, 9), t.next)
-    assert_equal(Token.new("
+    assert_equal(Token.new("Should", 10, 16), t.next)
     assert_equal(Token.new("Be", 17, 19), t.next)
     assert_equal(Token.new("Capit", 20, 31), t.next)
     assert_equal(Token.new("I", 36, 37), t.next)