ferret 0.11.6 → 0.11.8.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
@@ -145,12 +145,12 @@ class String
145
145
 
146
146
  def get_lex_format(len)
147
147
  case len
148
- when 0.. 3: ""
149
- when 4.. 5: "%Y"
150
- when 6.. 7: "%Y%m"
151
- when 8.. 9: "%Y%m%d"
152
- when 10..11: "%Y%m%d%H"
153
- when 12..13: "%Y%m%d%H%M"
148
+ when 0.. 3 then ""
149
+ when 4.. 5 then "%Y"
150
+ when 6.. 7 then "%Y%m"
151
+ when 8.. 9 then "%Y%m%d"
152
+ when 10..11 then "%Y%m%d%H"
153
+ when 12..13 then "%Y%m%d%H%M"
154
154
  else "%Y%m%d%H%M%S"
155
155
  end
156
156
  end
@@ -0,0 +1,3 @@
1
+ module Ferret
2
+ VERSION = '0.11.8.4'
3
+ end
@@ -25,7 +25,7 @@ class SampleLargeTest < Test::Unit::TestCase
25
25
  end
26
26
 
27
27
  def test_read_file_after_two_gigs
28
- assert @index.reader[RECORDS - 5].load.is_a?Hash
28
+ assert @index.reader[RECORDS - 5].load.is_a?(Hash)
29
29
  end
30
30
 
31
31
  def create_index!
@@ -1,6 +1,11 @@
1
1
  $:.unshift File.dirname(__FILE__)
2
- $:.unshift File.join(File.dirname(__FILE__), '../lib')
3
- $:.unshift File.join(File.dirname(__FILE__), '../ext')
2
+ if $test_installed_gem
3
+ require 'rubygems'
4
+ require 'ferret'
5
+ else
6
+ $:.unshift File.join(File.dirname(__FILE__), '../lib')
7
+ $:.unshift File.join(File.dirname(__FILE__), '../ext')
8
+ end
4
9
 
5
10
  ENV['LANG'] = "en_US.UTF-8"
6
11
  ENV['LC_CTYPE'] = "en_US.UTF-8"
@@ -0,0 +1 @@
1
+ $test_installed_gem = true
@@ -1,3 +1,5 @@
1
+ $:.unshift('.')
2
+ require 'monitor'
1
3
  require File.dirname(__FILE__) + "/../test_helper"
2
4
  require File.dirname(__FILE__) + "/number_to_spoken.rb"
3
5
  require 'thread'
@@ -21,6 +23,7 @@ class IndexThreadSafetyTest < Test::Unit::TestCase
21
23
  def indexing_thread()
22
24
  index = Index.new(:path => INDEX_DIR,
23
25
  :analyzer => ANALYZER,
26
+ :auto_flush => true,
24
27
  :default_field => :content)
25
28
 
26
29
  ITERATIONS.times do
@@ -37,6 +40,10 @@ class IndexThreadSafetyTest < Test::Unit::TestCase
37
40
  end
38
41
  index.commit
39
42
  end
43
+ rescue Exception => e
44
+ puts e
45
+ puts e.backtrace
46
+ raise 'hell'
40
47
  end
41
48
 
42
49
  def do_optimize(index)
@@ -74,6 +81,8 @@ class IndexThreadSafetyTest < Test::Unit::TestCase
74
81
  threads << Thread.new { indexing_thread }
75
82
  end
76
83
 
77
- threads.each {|t| t.join}
84
+ threads.each {|t|
85
+ t.join
86
+ }
78
87
  end
79
88
  end
@@ -1,20 +1,19 @@
1
1
  require File.dirname(__FILE__) + "/../test_helper"
2
- require File.dirname(__FILE__) + "/../utils/number_to_spoken.rb"
2
+ require File.dirname(__FILE__) + "/number_to_spoken.rb"
3
3
  require 'thread'
4
4
 
5
5
  class IndexThreadSafetyReadWriteTest < Test::Unit::TestCase
6
6
  include Ferret::Index
7
- include Ferret::Document
8
7
 
9
8
  INDEX_DIR = File.expand_path(File.join(File.dirname(__FILE__), "index"))
10
9
  ITERATIONS = 10000
11
10
  ANALYZER = Ferret::Analysis::Analyzer.new()
12
11
 
13
12
  def setup
14
- @index = Index.new(:path => 'index2',
13
+ @index = Index.new(:path => INDEX_DIR,
15
14
  :create => true,
16
15
  :analyzer => ANALYZER,
17
- :default_field => 'contents')
16
+ :default_field => :content)
18
17
  end
19
18
 
20
19
  def search_thread()
@@ -42,10 +41,8 @@ class IndexThreadSafetyReadWriteTest < Test::Unit::TestCase
42
41
  end
43
42
 
44
43
  def do_add_doc
45
- d = Document.new()
46
44
  n = rand(0xFFFFFFFF)
47
- d << Field.new("id", n.to_s, Field::Store::YES, Field::Index::UNTOKENIZED)
48
- d << Field.new("contents", n.to_spoken, Field::Store::NO, Field::Index::TOKENIZED)
45
+ d = {:id => n.to_s, :content => n.to_spoken}
49
46
  puts("Adding #{n}")
50
47
  begin
51
48
  @index << d
File without changes
@@ -1,10 +1,12 @@
1
+ # encoding: utf-8
2
+
1
3
  require File.dirname(__FILE__) + "/../../test_helper"
2
4
 
3
5
  class AnalyzerTest < Test::Unit::TestCase
4
6
  include Ferret::Analysis
5
7
 
6
8
  def test_analyzer()
7
- input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$'
9
+ input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#!$'
8
10
  a = Analyzer.new()
9
11
  t = a.token_stream("fieldname", input)
10
12
  t2 = a.token_stream("fieldname", input)
@@ -44,7 +46,7 @@ class AsciiLetterAnalyzerTest < Test::Unit::TestCase
44
46
  include Ferret::Analysis
45
47
 
46
48
  def test_letter_analyzer()
47
- input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#@$'
49
+ input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#!$'
48
50
  a = AsciiLetterAnalyzer.new()
49
51
  t = a.token_stream("fieldname", input)
50
52
  t2 = a.token_stream("fieldname", input)
@@ -85,7 +87,7 @@ class LetterAnalyzerTest < Test::Unit::TestCase
85
87
 
86
88
  def test_letter_analyzer()
87
89
  Ferret.locale = ""
88
- input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
90
+ input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
89
91
  a = LetterAnalyzer.new(false)
90
92
  t = a.token_stream("fieldname", input)
91
93
  t2 = a.token_stream("fieldname", input)
@@ -137,7 +139,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
137
139
  include Ferret::Analysis
138
140
 
139
141
  def test_white_space_analyzer()
140
- input = 'DBalmain@gmail.com is My E-Mail 52 #$ ADDRESS. 23#@$'
142
+ input = 'DBalmain@gmail.com is My E-Mail 52 #$ ADDRESS. 23#!$'
141
143
  a = AsciiWhiteSpaceAnalyzer.new()
142
144
  t = a.token_stream("fieldname", input)
143
145
  t2 = a.token_stream("fieldname", input)
@@ -148,7 +150,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
148
150
  assert_equal(Token.new('52', 32, 34), t.next)
149
151
  assert_equal(Token.new('#$', 37, 39), t.next)
150
152
  assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
151
- assert_equal(Token.new('23#@$', 49, 54), t.next)
153
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
152
154
  assert(! t.next())
153
155
  assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t2.next)
154
156
  assert_equal(Token.new('is', 19, 21), t2.next)
@@ -157,7 +159,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
157
159
  assert_equal(Token.new('52', 32, 34), t2.next)
158
160
  assert_equal(Token.new('#$', 37, 39), t2.next)
159
161
  assert_equal(Token.new('ADDRESS.', 40, 48), t2.next)
160
- assert_equal(Token.new('23#@$', 49, 54), t2.next)
162
+ assert_equal(Token.new('23#!$', 49, 54), t2.next)
161
163
  assert(! t2.next())
162
164
  a = AsciiWhiteSpaceAnalyzer.new(true)
163
165
  t = a.token_stream("fieldname", input)
@@ -168,7 +170,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
168
170
  assert_equal(Token.new('52', 32, 34), t.next)
169
171
  assert_equal(Token.new('#$', 37, 39), t.next)
170
172
  assert_equal(Token.new('address.', 40, 48), t.next)
171
- assert_equal(Token.new('23#@$', 49, 54), t.next)
173
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
172
174
  assert(! t.next())
173
175
  end
174
176
  end
@@ -177,7 +179,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
177
179
  include Ferret::Analysis
178
180
 
179
181
  def test_white_space_analyzer()
180
- input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
182
+ input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
181
183
  a = WhiteSpaceAnalyzer.new()
182
184
  t = a.token_stream("fieldname", input)
183
185
  t2 = a.token_stream("fieldname", input)
@@ -188,7 +190,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
188
190
  assert_equal(Token.new('52', 32, 34), t.next)
189
191
  assert_equal(Token.new('#$', 37, 39), t.next)
190
192
  assert_equal(Token.new('address.', 40, 48), t.next)
191
- assert_equal(Token.new('23#@$', 49, 54), t.next)
193
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
192
194
  assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØìÖÎÍ', 55, 86), t.next)
193
195
  assert(! t.next())
194
196
  assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t2.next)
@@ -198,7 +200,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
198
200
  assert_equal(Token.new('52', 32, 34), t2.next)
199
201
  assert_equal(Token.new('#$', 37, 39), t2.next)
200
202
  assert_equal(Token.new('address.', 40, 48), t2.next)
201
- assert_equal(Token.new('23#@$', 49, 54), t2.next)
203
+ assert_equal(Token.new('23#!$', 49, 54), t2.next)
202
204
  assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØìÖÎÍ', 55, 86), t2.next)
203
205
  assert(! t2.next())
204
206
  a = WhiteSpaceAnalyzer.new(true)
@@ -210,7 +212,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
210
212
  assert_equal(Token.new('52', 32, 34), t.next)
211
213
  assert_equal(Token.new('#$', 37, 39), t.next)
212
214
  assert_equal(Token.new('address.', 40, 48), t.next)
213
- assert_equal(Token.new('23#@$', 49, 54), t.next)
215
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
214
216
  assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
215
217
  assert(! t.next())
216
218
  end
@@ -220,7 +222,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
220
222
  include Ferret::Analysis
221
223
 
222
224
  def test_standard_analyzer()
223
- input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
225
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
224
226
  a = AsciiStandardAnalyzer.new()
225
227
  t = a.token_stream("fieldname", input)
226
228
  t2 = a.token_stream("fieldname", input)
@@ -231,7 +233,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
231
233
  assert_equal(Token.new('52', 32, 34), t.next)
232
234
  assert_equal(Token.new('address', 40, 47), t.next)
233
235
  assert_equal(Token.new('23', 49, 51), t.next)
234
- assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
236
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
235
237
  assert_equal(Token.new('tnt', 86, 91), t.next)
236
238
  assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
237
239
  assert(! t.next())
@@ -242,7 +244,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
242
244
  assert_equal(Token.new('52', 32, 34), t2.next)
243
245
  assert_equal(Token.new('address', 40, 47), t2.next)
244
246
  assert_equal(Token.new('23', 49, 51), t2.next)
245
- assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
247
+ assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
246
248
  assert_equal(Token.new('tnt', 86, 91), t2.next)
247
249
  assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
248
250
  assert(! t2.next())
@@ -257,7 +259,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
257
259
  assert_equal(Token.new('52', 32, 34), t.next)
258
260
  assert_equal(Token.new('Address', 40, 47), t.next)
259
261
  assert_equal(Token.new('23', 49, 51), t.next)
260
- assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
262
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
261
263
  assert_equal(Token.new('TNT', 86, 91), t.next)
262
264
  assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
263
265
  assert(! t.next())
@@ -268,7 +270,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
268
270
  include Ferret::Analysis
269
271
 
270
272
  def test_standard_analyzer()
271
- input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
273
+ input = 'DBalmán@gmail.com is My e-mail and the Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
272
274
  a = StandardAnalyzer.new()
273
275
  t = a.token_stream("fieldname", input)
274
276
  t2 = a.token_stream("fieldname", input)
@@ -278,7 +280,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
278
280
  assert_equal(Token.new('mail', 27, 31), t.next)
279
281
  assert_equal(Token.new('address', 40, 47), t.next)
280
282
  assert_equal(Token.new('23', 49, 51), t.next)
281
- assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
283
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
282
284
  assert_equal(Token.new('tnt', 86, 91), t.next)
283
285
  assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
284
286
  assert_equal(Token.new('23', 111, 113), t.next)
@@ -293,7 +295,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
293
295
  assert_equal(Token.new('mail', 27, 31), t2.next)
294
296
  assert_equal(Token.new('address', 40, 47), t2.next)
295
297
  assert_equal(Token.new('23', 49, 51), t2.next)
296
- assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
298
+ assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
297
299
  assert_equal(Token.new('tnt', 86, 91), t2.next)
298
300
  assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
299
301
  assert_equal(Token.new('23', 111, 113), t2.next)
@@ -311,7 +313,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
311
313
  assert_equal(Token.new('mail', 27, 31), t.next)
312
314
  assert_equal(Token.new('Address', 40, 47), t.next)
313
315
  assert_equal(Token.new('23', 49, 51), t.next)
314
- assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
316
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
315
317
  assert_equal(Token.new('TNT', 86, 91), t.next)
316
318
  assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
317
319
  assert_equal(Token.new('23', 111, 113), t.next)
@@ -329,7 +331,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
329
331
  assert_equal(Token.new('and', 32, 35), t.next)
330
332
  assert_equal(Token.new('the', 36, 39), t.next)
331
333
  assert_equal(Token.new('address', 40, 47), t.next)
332
- assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
334
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
333
335
  assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
334
336
  assert_equal(Token.new('áägç', 117, 124), t.next)
335
337
  assert_equal(Token.new('êëì', 126, 132), t.next)
@@ -342,7 +344,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
342
344
  assert_equal(Token.new('and', 32, 35), t2.next)
343
345
  assert_equal(Token.new('the', 36, 39), t2.next)
344
346
  assert_equal(Token.new('address', 40, 47), t2.next)
345
- assert_equal(Token.new('www.google.com/results', 55, 84), t2.next)
347
+ assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
346
348
  assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
347
349
  assert_equal(Token.new('áägç', 117, 124), t2.next)
348
350
  assert_equal(Token.new('êëì', 126, 132), t2.next)
@@ -355,7 +357,7 @@ end if (/utf-8/i =~ Ferret.locale)
355
357
  class PerFieldAnalyzerTest < Test::Unit::TestCase
356
358
  include Ferret::Analysis
357
359
  def test_per_field_analyzer()
358
- input = 'DBalmain@gmail.com is My e-mail 52 #$ address. 23#@$'
360
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$'
359
361
  pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
360
362
  pfa['white'] = WhiteSpaceAnalyzer.new(false)
361
363
  pfa['white_l'] = WhiteSpaceAnalyzer.new(true)
@@ -370,7 +372,7 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
370
372
  assert_equal(Token.new('52', 32, 34), t.next)
371
373
  assert_equal(Token.new('#$', 37, 39), t.next)
372
374
  assert_equal(Token.new('address.', 40, 48), t.next)
373
- assert_equal(Token.new('23#@$', 49, 54), t.next)
375
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
374
376
  assert(! t.next())
375
377
  t = pfa.token_stream('white_l', input)
376
378
  assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
@@ -380,7 +382,7 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
380
382
  assert_equal(Token.new('52', 32, 34), t.next)
381
383
  assert_equal(Token.new('#$', 37, 39), t.next)
382
384
  assert_equal(Token.new('address.', 40, 48), t.next)
383
- assert_equal(Token.new('23#@$', 49, 54), t.next)
385
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
384
386
  assert(! t.next())
385
387
  t = pfa.token_stream('letter_u', input)
386
388
  assert_equal(Token.new('DBalmain', 0, 8), t.next)
@@ -418,7 +420,7 @@ class RegExpAnalyzerTest < Test::Unit::TestCase
418
420
  include Ferret::Analysis
419
421
 
420
422
  def test_reg_exp_analyzer()
421
- input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
423
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
422
424
  a = RegExpAnalyzer.new()
423
425
  t = a.token_stream('XXX', input)
424
426
  t2 = a.token_stream('XXX', "one_Two three")
@@ -510,7 +512,7 @@ class CustomAnalyzerTest < Test::Unit::TestCase
510
512
  include Ferret::Analysis
511
513
 
512
514
  def test_custom_filter()
513
- input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
515
+ input = 'DBalmán@gmail.com is My e-mail and the Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
514
516
  a = StemmingStandardAnalyzer.new()
515
517
  t = a.token_stream("fieldname", input)
516
518
  assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
@@ -519,7 +521,7 @@ class CustomAnalyzerTest < Test::Unit::TestCase
519
521
  assert_equal(Token.new('mail', 27, 31), t.next)
520
522
  assert_equal(Token.new('address', 40, 47), t.next)
521
523
  assert_equal(Token.new('23', 49, 51), t.next)
522
- assert_equal(Token.new('www.google.com/result', 55, 84), t.next)
524
+ assert_equal(Token.new('www.google.com/result', 55, 85), t.next)
523
525
  assert_equal(Token.new('tnt', 86, 91), t.next)
524
526
  assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
525
527
  assert_equal(Token.new('23', 111, 113), t.next)
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  require File.dirname(__FILE__) + "/../../test_helper"
2
4
 
3
5
  puts "Loading once"
@@ -27,7 +29,7 @@ class AsciiLetterTokenizerTest < Test::Unit::TestCase
27
29
  include Ferret::Analysis
28
30
 
29
31
  def test_letter_tokenizer()
30
- input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23#@$'
32
+ input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23#!$'
31
33
  t = AsciiLetterTokenizer.new(input)
32
34
  assert_equal(Token.new("DBalmain", 0, 8), t.next())
33
35
  assert_equal(Token.new("gmail", 9, 14), t.next())
@@ -60,7 +62,7 @@ class LetterTokenizerTest < Test::Unit::TestCase
60
62
  include Ferret::Analysis
61
63
 
62
64
  def test_letter_tokenizer()
63
- input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
65
+ input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
64
66
  t = LetterTokenizer.new(input)
65
67
  assert_equal(Token.new('DBalmän', 0, 8), t.next)
66
68
  assert_equal(Token.new('gmail', 9, 14), t.next)
@@ -115,7 +117,7 @@ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
115
117
  include Ferret::Analysis
116
118
 
117
119
  def test_whitespace_tokenizer()
118
- input = 'DBalmain@gmail.com is My e-mail 52 #$ ADDRESS. 23#@$'
120
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ ADDRESS. 23#!$'
119
121
  t = AsciiWhiteSpaceTokenizer.new(input)
120
122
  assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
121
123
  assert_equal(Token.new('is', 19, 21), t.next)
@@ -124,7 +126,7 @@ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
124
126
  assert_equal(Token.new('52', 32, 34), t.next)
125
127
  assert_equal(Token.new('#$', 37, 39), t.next)
126
128
  assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
127
- assert_equal(Token.new('23#@$', 49, 54), t.next)
129
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
128
130
  assert(! t.next())
129
131
  t.text = "one_two three"
130
132
  assert_equal(Token.new("one_two", 0, 7), t.next())
@@ -138,7 +140,7 @@ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
138
140
  assert_equal(Token.new('52', 32, 34), t.next)
139
141
  assert_equal(Token.new('#$', 37, 39), t.next)
140
142
  assert_equal(Token.new('address.', 40, 48), t.next)
141
- assert_equal(Token.new('23#@$', 49, 54), t.next)
143
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
142
144
  assert(! t.next())
143
145
  end
144
146
  end
@@ -147,7 +149,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
147
149
  include Ferret::Analysis
148
150
 
149
151
  def test_whitespace_tokenizer()
150
- input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
152
+ input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
151
153
  t = WhiteSpaceTokenizer.new(input)
152
154
  assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
153
155
  assert_equal(Token.new('is', 19, 21), t.next)
@@ -156,7 +158,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
156
158
  assert_equal(Token.new('52', 32, 34), t.next)
157
159
  assert_equal(Token.new('#$', 37, 39), t.next)
158
160
  assert_equal(Token.new('address.', 40, 48), t.next)
159
- assert_equal(Token.new('23#@$', 49, 54), t.next)
161
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
160
162
  assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØìÖÎÍ', 55, 86), t.next)
161
163
  assert(! t.next())
162
164
  t.text = "one_two three"
@@ -171,7 +173,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
171
173
  assert_equal(Token.new('52', 32, 34), t.next)
172
174
  assert_equal(Token.new('#$', 37, 39), t.next)
173
175
  assert_equal(Token.new('address.', 40, 48), t.next)
174
- assert_equal(Token.new('23#@$', 49, 54), t.next)
176
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
175
177
  assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
176
178
  assert(! t.next())
177
179
  t = WhiteSpaceTokenizer.new(input, true)
@@ -182,7 +184,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
182
184
  assert_equal(Token.new('52', 32, 34), t.next)
183
185
  assert_equal(Token.new('#$', 37, 39), t.next)
184
186
  assert_equal(Token.new('address.', 40, 48), t.next)
185
- assert_equal(Token.new('23#@$', 49, 54), t.next)
187
+ assert_equal(Token.new('23#!$', 49, 54), t.next)
186
188
  assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
187
189
  assert(! t.next())
188
190
  end
@@ -192,7 +194,7 @@ class AsciiStandardTokenizerTest < Test::Unit::TestCase
192
194
  include Ferret::Analysis
193
195
 
194
196
  def test_standard_tokenizer()
195
- input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
197
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
196
198
  t = AsciiStandardTokenizer.new(input)
197
199
  assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
198
200
  assert_equal(Token.new('is', 19, 21), t.next)
@@ -201,7 +203,7 @@ class AsciiStandardTokenizerTest < Test::Unit::TestCase
201
203
  assert_equal(Token.new('52', 32, 34), t.next)
202
204
  assert_equal(Token.new('Address', 40, 47), t.next)
203
205
  assert_equal(Token.new('23', 49, 51), t.next)
204
- assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
206
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
205
207
  assert_equal(Token.new('TNT', 86, 91), t.next)
206
208
  assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
207
209
  assert(! t.next())
@@ -217,7 +219,7 @@ class AsciiStandardTokenizerTest < Test::Unit::TestCase
217
219
  assert_equal(Token.new('52', 32, 34), t.next)
218
220
  assert_equal(Token.new('address', 40, 47), t.next)
219
221
  assert_equal(Token.new('23', 49, 51), t.next)
220
- assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
222
+ assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
221
223
  assert_equal(Token.new('tnt', 86, 91), t.next)
222
224
  assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
223
225
  assert(! t.next())
@@ -228,7 +230,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
228
230
  include Ferret::Analysis
229
231
 
230
232
  def test_standard_tokenizer()
231
- input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
233
+ input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
232
234
  t = StandardTokenizer.new(input)
233
235
  assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
234
236
  assert_equal(Token.new('is', 19, 21), t.next)
@@ -237,7 +239,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
237
239
  assert_equal(Token.new('52', 32, 34), t.next)
238
240
  assert_equal(Token.new('Address', 40, 47), t.next)
239
241
  assert_equal(Token.new('23', 49, 51), t.next)
240
- assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
242
+ assert_equal(Token.new('www.google.com/res_345', 55, 85), t.next)
241
243
  assert_equal(Token.new('TNT', 86, 91), t.next)
242
244
  assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
243
245
  assert_equal(Token.new('23', 111, 113), t.next)
@@ -258,7 +260,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
258
260
  assert_equal(Token.new('52', 32, 34), t.next)
259
261
  assert_equal(Token.new('address', 40, 47), t.next)
260
262
  assert_equal(Token.new('23', 49, 51), t.next)
261
- assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
263
+ assert_equal(Token.new('www.google.com/res_345', 55, 85), t.next)
262
264
  assert_equal(Token.new('tnt', 86, 91), t.next)
263
265
  assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
264
266
  assert_equal(Token.new('23', 111, 113), t.next)
@@ -287,7 +289,7 @@ class RegExpTokenizerTest < Test::Unit::TestCase
287
289
  APOSTROPHE_WORD = /^#{APOSTROPHE}$/
288
290
 
289
291
  def test_reg_exp_tokenizer()
290
- input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
292
+ input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
291
293
  t = RegExpTokenizer.new(input)
292
294
  assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
293
295
  assert_equal(Token.new('is', 19, 21), t.next)
@@ -483,6 +485,11 @@ class StemFilterTest < Test::Unit::TestCase
483
485
  assert_equal(Token.new("dêbater", 36, 44), t.next)
484
486
  assert(! t.next())
485
487
  end
488
+
489
+ tz = AsciiLetterTokenizer.new(input)
490
+ assert_not_nil(StemFilter.new(tz,'HunGarIaN', 'Utf-8'))
491
+ assert_not_nil(StemFilter.new(tz,'romanIAN', 'iso-8859-2'))
492
+ assert_raises(ArgumentError) {StemFilter.new(tz, 'Jibberish', 'UTF-8')}
486
493
  end
487
494
  end
488
495