RubyGems - ferret - Versions diffs - 0.11.6 → 0.11.8.4 - Mend

ferret 0.11.6 → 0.11.8.4

Files changed (185)

data/README +10 -22
data/RELEASE_CHANGES +137 -0
data/RELEASE_NOTES +60 -0
data/Rakefile +379 -274
data/TODO +100 -8
data/bin/ferret-browser +0 -0
data/ext/BZLIB_blocksort.c +1094 -0
data/ext/BZLIB_bzlib.c +1578 -0
data/ext/BZLIB_compress.c +672 -0
data/ext/BZLIB_crctable.c +104 -0
data/ext/BZLIB_decompress.c +626 -0
data/ext/BZLIB_huffman.c +205 -0
data/ext/BZLIB_randtable.c +84 -0
data/ext/{api.c → STEMMER_api.c} +7 -10
data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
data/ext/analysis.c +276 -121
data/ext/analysis.h +190 -143
data/ext/api.h +3 -4
data/ext/array.c +5 -3
data/ext/array.h +52 -43
data/ext/bitvector.c +38 -482
data/ext/bitvector.h +446 -124
data/ext/bzlib.h +282 -0
data/ext/bzlib_private.h +503 -0
data/ext/compound_io.c +23 -22
data/ext/config.h +21 -11
data/ext/document.c +43 -40
data/ext/document.h +31 -21
data/ext/except.c +20 -38
data/ext/except.h +89 -76
data/ext/extconf.rb +3 -2
data/ext/ferret.c +49 -35
data/ext/ferret.h +14 -11
data/ext/field_index.c +262 -0
data/ext/field_index.h +52 -0
data/ext/filter.c +11 -10
data/ext/fs_store.c +65 -47
data/ext/global.c +245 -165
data/ext/global.h +252 -54
data/ext/hash.c +200 -243
data/ext/hash.h +205 -163
data/ext/hashset.c +118 -96
data/ext/hashset.h +110 -82
data/ext/header.h +19 -19
data/ext/helper.c +11 -10
data/ext/helper.h +14 -6
data/ext/index.c +745 -366
data/ext/index.h +503 -529
data/ext/internal.h +1020 -0
data/ext/lang.c +10 -0
data/ext/lang.h +35 -15
data/ext/mempool.c +5 -4
data/ext/mempool.h +30 -22
data/ext/modules.h +35 -7
data/ext/multimapper.c +43 -2
data/ext/multimapper.h +32 -23
data/ext/posh.c +0 -0
data/ext/posh.h +4 -38
data/ext/priorityqueue.c +10 -12
data/ext/priorityqueue.h +33 -21
data/ext/q_boolean.c +22 -9
data/ext/q_const_score.c +3 -2
data/ext/q_filtered_query.c +15 -12
data/ext/q_fuzzy.c +147 -135
data/ext/q_match_all.c +3 -2
data/ext/q_multi_term.c +28 -32
data/ext/q_parser.c +451 -173
data/ext/q_phrase.c +158 -79
data/ext/q_prefix.c +16 -18
data/ext/q_range.c +363 -31
data/ext/q_span.c +130 -141
data/ext/q_term.c +21 -21
data/ext/q_wildcard.c +19 -23
data/ext/r_analysis.c +369 -242
data/ext/r_index.c +421 -434
data/ext/r_qparser.c +142 -92
data/ext/r_search.c +790 -407
data/ext/r_store.c +44 -44
data/ext/r_utils.c +264 -96
data/ext/ram_store.c +29 -23
data/ext/scanner.c +895 -0
data/ext/scanner.h +36 -0
data/ext/scanner_mb.c +6701 -0
data/ext/scanner_utf8.c +4415 -0
data/ext/search.c +210 -87
data/ext/search.h +556 -488
data/ext/similarity.c +17 -16
data/ext/similarity.h +51 -44
data/ext/sort.c +157 -354
data/ext/stem_ISO_8859_1_hungarian.h +16 -0
data/ext/stem_ISO_8859_2_romanian.h +16 -0
data/ext/stem_UTF_8_hungarian.h +16 -0
data/ext/stem_UTF_8_romanian.h +16 -0
data/ext/stem_UTF_8_turkish.h +16 -0
data/ext/stopwords.c +287 -278
data/ext/store.c +57 -51
data/ext/store.h +308 -286
data/ext/symbol.c +10 -0
data/ext/symbol.h +23 -0
data/ext/term_vectors.c +14 -293
data/ext/threading.h +22 -22
data/ext/win32.h +12 -4
data/lib/ferret.rb +2 -1
data/lib/ferret/browser.rb +1 -1
data/lib/ferret/field_symbol.rb +94 -0
data/lib/ferret/index.rb +221 -34
data/lib/ferret/number_tools.rb +6 -6
data/lib/ferret/version.rb +3 -0
data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
data/test/test_helper.rb +7 -2
data/test/test_installed.rb +1 -0
data/test/threading/thread_safety_index_test.rb +10 -1
data/test/threading/thread_safety_read_write_test.rb +4 -7
data/test/threading/thread_safety_test.rb +0 -0
data/test/unit/analysis/tc_analyzer.rb +29 -27
data/test/unit/analysis/tc_token_stream.rb +23 -16
data/test/unit/index/tc_index.rb +116 -11
data/test/unit/index/tc_index_reader.rb +27 -27
data/test/unit/index/tc_index_writer.rb +10 -0
data/test/unit/index/th_doc.rb +38 -21
data/test/unit/search/tc_filter.rb +31 -10
data/test/unit/search/tc_index_searcher.rb +6 -0
data/test/unit/search/tm_searcher.rb +53 -1
data/test/unit/store/tc_fs_store.rb +40 -2
data/test/unit/store/tc_ram_store.rb +0 -0
data/test/unit/store/tm_store.rb +0 -0
data/test/unit/store/tm_store_lock.rb +7 -6
data/test/unit/tc_field_symbol.rb +26 -0
data/test/unit/ts_analysis.rb +0 -0
data/test/unit/ts_index.rb +0 -0
data/test/unit/ts_store.rb +0 -0
data/test/unit/ts_utils.rb +0 -0
data/test/unit/utils/tc_number_tools.rb +0 -0
data/test/utils/content_generator.rb +226 -0
metadata +262 -221
data/ext/inc/lang.h +0 -48
data/ext/inc/threading.h +0 -31
data/ext/stem_ISO_8859_1_english.c +0 -1156
data/ext/stem_ISO_8859_1_french.c +0 -1276
data/ext/stem_ISO_8859_1_italian.c +0 -1091
data/ext/stem_ISO_8859_1_norwegian.c +0 -296
data/ext/stem_ISO_8859_1_spanish.c +0 -1119
data/ext/stem_ISO_8859_1_swedish.c +0 -307
data/ext/stem_UTF_8_danish.c +0 -344
data/ext/stem_UTF_8_english.c +0 -1176
data/ext/stem_UTF_8_french.c +0 -1296
data/ext/stem_UTF_8_italian.c +0 -1113
data/ext/stem_UTF_8_norwegian.c +0 -302
data/ext/stem_UTF_8_portuguese.c +0 -1055
data/ext/stem_UTF_8_russian.c +0 -709
data/ext/stem_UTF_8_spanish.c +0 -1137
data/ext/stem_UTF_8_swedish.c +0 -313
data/lib/ferret_version.rb +0 -3

data/lib/ferret/number_tools.rb CHANGED

@@ -145,12 +145,12 @@ class String
   def get_lex_format(len)
     case len
-    when  0.. 3: ""
-    when  4.. 5: "%Y"
-    when  6.. 7: "%Y%m"
-    when  8.. 9: "%Y%m%d"
-    when 10..11: "%Y%m%d%H"
-    when 12..13: "%Y%m%d%H%M"
+    when  0.. 3 then ""
+    when  4.. 5 then "%Y"
+    when  6.. 7 then "%Y%m"
+    when  8.. 9 then "%Y%m%d"
+    when 10..11 then "%Y%m%d%H"
+    when 12..13 then "%Y%m%d%H%M"
     else "%Y%m%d%H%M%S"
     end
   end

data/lib/ferret/version.rb ADDED

@@ -0,0 +1,3 @@
+module Ferret
+  VERSION = '0.11.8.4'
+end

data/test/{unit → long_running}/largefile/tc_largefile.rb RENAMED

@@ -25,7 +25,7 @@ class SampleLargeTest < Test::Unit::TestCase
   end
   def test_read_file_after_two_gigs
-    assert @index.reader[RECORDS - 5].load.is_a?Hash
+    assert @index.reader[RECORDS - 5].load.is_a?(Hash)
   end
   def create_index!

data/test/test_helper.rb CHANGED

@@ -1,6 +1,11 @@
 $:.unshift File.dirname(__FILE__)
-$:.unshift File.join(File.dirname(__FILE__), '../lib')
-$:.unshift File.join(File.dirname(__FILE__), '../ext')
+if $test_installed_gem
+  require 'rubygems'
+  require 'ferret'
+else
+  $:.unshift File.join(File.dirname(__FILE__), '../lib')
+  $:.unshift File.join(File.dirname(__FILE__), '../ext')
+end
 ENV['LANG'] = "en_US.UTF-8"
 ENV['LC_CTYPE'] = "en_US.UTF-8"

data/test/test_installed.rb ADDED

@@ -0,0 +1 @@
+$test_installed_gem = true

data/test/threading/thread_safety_index_test.rb CHANGED

@@ -1,3 +1,5 @@
+$:.unshift('.')
+require 'monitor'
 require File.dirname(__FILE__) + "/../test_helper"
 require File.dirname(__FILE__) + "/number_to_spoken.rb"
 require 'thread'
@@ -21,6 +23,7 @@ class IndexThreadSafetyTest < Test::Unit::TestCase
   def indexing_thread()
     index = Index.new(:path => INDEX_DIR,
                       :analyzer => ANALYZER,
+                      :auto_flush => true,
                       :default_field => :content)
     ITERATIONS.times do
@@ -37,6 +40,10 @@ class IndexThreadSafetyTest < Test::Unit::TestCase
       end
       index.commit
     end
+  rescue Exception => e
+    puts e
+    puts e.backtrace
+    raise 'hell'
   end
   def do_optimize(index)
@@ -74,6 +81,8 @@ class IndexThreadSafetyTest < Test::Unit::TestCase
       threads << Thread.new { indexing_thread }
     end
-    threads.each {|t| t.join}
+    threads.each {|t|
+      t.join
+    }
   end
 end

data/test/threading/thread_safety_read_write_test.rb CHANGED

@@ -1,20 +1,19 @@
 require File.dirname(__FILE__) + "/../test_helper"
-require File.dirname(__FILE__) + "/../utils/number_to_spoken.rb"
+require File.dirname(__FILE__) + "/number_to_spoken.rb"
 require 'thread'
 class IndexThreadSafetyReadWriteTest < Test::Unit::TestCase
   include Ferret::Index
-  include Ferret::Document
   INDEX_DIR = File.expand_path(File.join(File.dirname(__FILE__), "index"))
   ITERATIONS = 10000
   ANALYZER = Ferret::Analysis::Analyzer.new()
   def setup
-    @index = Index.new(:path => 'index2',
+    @index = Index.new(:path => INDEX_DIR,
                        :create => true,
                        :analyzer => ANALYZER,
-                       :default_field => 'contents')
+                       :default_field => :content)
   end
   def search_thread()
@@ -42,10 +41,8 @@ class IndexThreadSafetyReadWriteTest < Test::Unit::TestCase
   end
   def do_add_doc
-    d = Document.new()
     n = rand(0xFFFFFFFF)
-    d << Field.new("id", n.to_s, Field::Store::YES, Field::Index::UNTOKENIZED)
-    d << Field.new("contents", n.to_spoken, Field::Store::NO, Field::Index::TOKENIZED)
+    d = {:id => n.to_s, :content => n.to_spoken}
     puts("Adding #{n}")
     begin
       @index << d

data/test/threading/thread_safety_test.rb CHANGED

File without changes

data/test/unit/analysis/tc_analyzer.rb CHANGED

@@ -1,10 +1,12 @@
+# encoding: utf-8
 require File.dirname(__FILE__) + "/../../test_helper"
 class AnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_analyzer()
-    input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$'
+    input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#!$'
     a = Analyzer.new()
     t = a.token_stream("fieldname", input)
     t2 = a.token_stream("fieldname", input)
@@ -44,7 +46,7 @@ class AsciiLetterAnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_letter_analyzer()
-    input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$'
+    input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#!$'
     a = AsciiLetterAnalyzer.new()
     t = a.token_stream("fieldname", input)
     t2 = a.token_stream("fieldname", input)
@@ -85,7 +87,7 @@ class LetterAnalyzerTest < Test::Unit::TestCase
   def test_letter_analyzer()
     Ferret.locale = ""
-    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     a = LetterAnalyzer.new(false)
     t = a.token_stream("fieldname", input)
     t2 = a.token_stream("fieldname", input)
@@ -137,7 +139,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_white_space_analyzer()
-    input = 'DBalmain@gmail.com is My E-Mail 52   #$ ADDRESS. 23#@$'
+    input = 'DBalmain@gmail.com is My E-Mail 52   #$ ADDRESS. 23#!$'
     a = AsciiWhiteSpaceAnalyzer.new()
     t = a.token_stream("fieldname", input)
     t2 = a.token_stream("fieldname", input)
@@ -148,7 +150,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert(! t.next())
     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t2.next)
     assert_equal(Token.new('is', 19, 21), t2.next)
@@ -157,7 +159,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t2.next)
     assert_equal(Token.new('#$', 37, 39), t2.next)
     assert_equal(Token.new('ADDRESS.', 40, 48), t2.next)
-    assert_equal(Token.new('23#@$', 49, 54), t2.next)
+    assert_equal(Token.new('23#!$', 49, 54), t2.next)
     assert(! t2.next())
     a = AsciiWhiteSpaceAnalyzer.new(true)
     t = a.token_stream("fieldname", input)
@@ -168,7 +170,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('address.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert(! t.next())
   end
 end
@@ -177,7 +179,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_white_space_analyzer()
-    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     a = WhiteSpaceAnalyzer.new()
     t = a.token_stream("fieldname", input)
     t2 = a.token_stream("fieldname", input)
@@ -188,7 +190,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('address.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert_equal(Token.new('ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ', 55, 86), t.next)
     assert(! t.next())
     assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t2.next)
@@ -198,7 +200,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t2.next)
     assert_equal(Token.new('#$', 37, 39), t2.next)
     assert_equal(Token.new('address.', 40, 48), t2.next)
-    assert_equal(Token.new('23#@$', 49, 54), t2.next)
+    assert_equal(Token.new('23#!$', 49, 54), t2.next)
     assert_equal(Token.new('ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ', 55, 86), t2.next)
     assert(! t2.next())
     a = WhiteSpaceAnalyzer.new(true)
@@ -210,7 +212,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('address.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
     assert(! t.next())
   end
@@ -220,7 +222,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_standard_analyzer()
-    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
     a = AsciiStandardAnalyzer.new()
     t = a.token_stream("fieldname", input)
     t2 = a.token_stream("fieldname", input)
@@ -231,7 +233,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
     assert_equal(Token.new('tnt', 86, 91), t.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
     assert(! t.next())
@@ -242,7 +244,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t2.next)
     assert_equal(Token.new('address', 40, 47), t2.next)
     assert_equal(Token.new('23', 49, 51), t2.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
     assert_equal(Token.new('tnt', 86, 91), t2.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
     assert(! t2.next())
@@ -257,7 +259,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('Address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
     assert_equal(Token.new('TNT', 86, 91), t.next)
     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
     assert(! t.next())
@@ -268,7 +270,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_standard_analyzer()
-    input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    input = 'DBalmán@gmail.com is My e-mail and the Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     a = StandardAnalyzer.new()
     t = a.token_stream("fieldname", input)
     t2 = a.token_stream("fieldname", input)
@@ -278,7 +280,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('mail', 27, 31), t.next)
     assert_equal(Token.new('address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
     assert_equal(Token.new('tnt', 86, 91), t.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -293,7 +295,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('mail', 27, 31), t2.next)
     assert_equal(Token.new('address', 40, 47), t2.next)
     assert_equal(Token.new('23', 49, 51), t2.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
     assert_equal(Token.new('tnt', 86, 91), t2.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
     assert_equal(Token.new('23', 111, 113), t2.next)
@@ -311,7 +313,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('mail', 27, 31), t.next)
     assert_equal(Token.new('Address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
     assert_equal(Token.new('TNT', 86, 91), t.next)
     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -329,7 +331,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('and', 32, 35), t.next)
     assert_equal(Token.new('the', 36, 39), t.next)
     assert_equal(Token.new('address', 40, 47), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
     assert_equal(Token.new('áägç', 117, 124), t.next)
     assert_equal(Token.new('êëì', 126, 132), t.next)
@@ -342,7 +344,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('and', 32, 35), t2.next)
     assert_equal(Token.new('the', 36, 39), t2.next)
     assert_equal(Token.new('address', 40, 47), t2.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
     assert_equal(Token.new('áägç', 117, 124), t2.next)
     assert_equal(Token.new('êëì', 126, 132), t2.next)
@@ -355,7 +357,7 @@ end if (/utf-8/i =~ Ferret.locale)
 class PerFieldAnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_per_field_analyzer()
-    input = 'DBalmain@gmail.com is My e-mail 52   #$ address. 23#@$'
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ address. 23#!$'
     pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
     pfa['white'] = WhiteSpaceAnalyzer.new(false)
     pfa['white_l'] = WhiteSpaceAnalyzer.new(true)
@@ -370,7 +372,7 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('address.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert(! t.next())
     t = pfa.token_stream('white_l', input)
     assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
@@ -380,7 +382,7 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('address.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert(! t.next())
     t = pfa.token_stream('letter_u', input)
     assert_equal(Token.new('DBalmain', 0, 8), t.next)
@@ -418,7 +420,7 @@ class RegExpAnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_reg_exp_analyzer()
-    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
     a = RegExpAnalyzer.new()
     t = a.token_stream('XXX', input)
     t2 = a.token_stream('XXX', "one_Two three")
@@ -510,7 +512,7 @@ class CustomAnalyzerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_custom_filter()
-    input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    input = 'DBalmán@gmail.com is My e-mail and the Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     a = StemmingStandardAnalyzer.new()
     t = a.token_stream("fieldname", input)
     assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
@@ -519,7 +521,7 @@ class CustomAnalyzerTest < Test::Unit::TestCase
     assert_equal(Token.new('mail', 27, 31), t.next)
     assert_equal(Token.new('address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/result', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/result', 55, 85), t.next)
     assert_equal(Token.new('tnt', 86, 91), t.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)

data/test/unit/analysis/tc_token_stream.rb CHANGED

@@ -1,3 +1,5 @@
+# encoding: utf-8
 require File.dirname(__FILE__) + "/../../test_helper"
 puts "Loading once"
@@ -27,7 +29,7 @@ class AsciiLetterTokenizerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_letter_tokenizer()
-    input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23#@$'
+    input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23#!$'
     t = AsciiLetterTokenizer.new(input)
     assert_equal(Token.new("DBalmain", 0, 8), t.next())
     assert_equal(Token.new("gmail", 9, 14), t.next())
@@ -60,7 +62,7 @@ class LetterTokenizerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_letter_tokenizer()
-    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     t = LetterTokenizer.new(input)
     assert_equal(Token.new('DBalmän', 0, 8), t.next)
     assert_equal(Token.new('gmail', 9, 14), t.next)
@@ -115,7 +117,7 @@ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_whitespace_tokenizer()
-    input = 'DBalmain@gmail.com is My e-mail 52   #$ ADDRESS. 23#@$'
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ ADDRESS. 23#!$'
     t = AsciiWhiteSpaceTokenizer.new(input)
     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
     assert_equal(Token.new('is', 19, 21), t.next)
@@ -124,7 +126,7 @@ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert(! t.next())
     t.text = "one_two three"
     assert_equal(Token.new("one_two", 0, 7), t.next())
@@ -138,7 +140,7 @@ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('address.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert(! t.next())
   end
 end
@@ -147,7 +149,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_whitespace_tokenizer()
-    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    input = 'DBalmän@gmail.com is My e-mail 52   #$ address. 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     t = WhiteSpaceTokenizer.new(input)
     assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
     assert_equal(Token.new('is', 19, 21), t.next)
@@ -156,7 +158,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('address.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert_equal(Token.new('ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ', 55, 86), t.next)
     assert(! t.next())
     t.text = "one_two three"
@@ -171,7 +173,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('address.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
     assert(! t.next())
     t = WhiteSpaceTokenizer.new(input, true)
@@ -182,7 +184,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('#$', 37, 39), t.next)
     assert_equal(Token.new('address.', 40, 48), t.next)
-    assert_equal(Token.new('23#@$', 49, 54), t.next)
+    assert_equal(Token.new('23#!$', 49, 54), t.next)
     assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
     assert(! t.next())
   end
@@ -192,7 +194,7 @@ class AsciiStandardTokenizerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_standard_tokenizer()
-    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
     t = AsciiStandardTokenizer.new(input)
     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
     assert_equal(Token.new('is', 19, 21), t.next)
@@ -201,7 +203,7 @@ class AsciiStandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('Address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
     assert_equal(Token.new('TNT', 86, 91), t.next)
     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
     assert(! t.next())
@@ -217,7 +219,7 @@ class AsciiStandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
     assert_equal(Token.new('tnt', 86, 91), t.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
     assert(! t.next())
@@ -228,7 +230,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
   include Ferret::Analysis
   def test_standard_tokenizer()
-    input = 'DBalmán@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    input = 'DBalmán@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     t = StandardTokenizer.new(input)
     assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
     assert_equal(Token.new('is', 19, 21), t.next)
@@ -237,7 +239,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('Address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/res_345', 55, 85), t.next)
     assert_equal(Token.new('TNT', 86, 91), t.next)
     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -258,7 +260,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/res_345', 55, 85), t.next)
     assert_equal(Token.new('tnt', 86, 91), t.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -287,7 +289,7 @@ class RegExpTokenizerTest < Test::Unit::TestCase
   APOSTROPHE_WORD = /^#{APOSTROPHE}$/
   def test_reg_exp_tokenizer()
-    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    input = 'DBalmain@gmail.com is My e-mail 52   #$ Address. 23#!$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
     t = RegExpTokenizer.new(input)
     assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
     assert_equal(Token.new('is', 19, 21), t.next)
@@ -483,6 +485,11 @@ class StemFilterTest < Test::Unit::TestCase
       assert_equal(Token.new("dêbater", 36, 44), t.next)
       assert(! t.next())
     end
+    tz = AsciiLetterTokenizer.new(input)
+    assert_not_nil(StemFilter.new(tz,'HunGarIaN', 'Utf-8'))
+    assert_not_nil(StemFilter.new(tz,'romanIAN', 'iso-8859-2'))
+    assert_raises(ArgumentError) {StemFilter.new(tz, 'Jibberish', 'UTF-8')}
   end
 end