ferret 0.11.6 → 0.11.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +10 -22
- data/RELEASE_CHANGES +137 -0
- data/RELEASE_NOTES +60 -0
- data/Rakefile +379 -274
- data/TODO +100 -8
- data/bin/ferret-browser +0 -0
- data/ext/BZLIB_blocksort.c +1094 -0
- data/ext/BZLIB_bzlib.c +1578 -0
- data/ext/BZLIB_compress.c +672 -0
- data/ext/BZLIB_crctable.c +104 -0
- data/ext/BZLIB_decompress.c +626 -0
- data/ext/BZLIB_huffman.c +205 -0
- data/ext/BZLIB_randtable.c +84 -0
- data/ext/{api.c → STEMMER_api.c} +7 -10
- data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
- data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
- data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
- data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
- data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
- data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
- data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
- data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
- data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
- data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
- data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
- data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
- data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
- data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
- data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
- data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
- data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
- data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
- data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
- data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
- data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
- data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
- data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
- data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
- data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
- data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
- data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
- data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
- data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
- data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
- data/ext/analysis.c +276 -121
- data/ext/analysis.h +190 -143
- data/ext/api.h +3 -4
- data/ext/array.c +5 -3
- data/ext/array.h +52 -43
- data/ext/bitvector.c +38 -482
- data/ext/bitvector.h +446 -124
- data/ext/bzlib.h +282 -0
- data/ext/bzlib_private.h +503 -0
- data/ext/compound_io.c +23 -22
- data/ext/config.h +21 -11
- data/ext/document.c +43 -40
- data/ext/document.h +31 -21
- data/ext/except.c +20 -38
- data/ext/except.h +89 -76
- data/ext/extconf.rb +3 -2
- data/ext/ferret.c +49 -35
- data/ext/ferret.h +14 -11
- data/ext/field_index.c +262 -0
- data/ext/field_index.h +52 -0
- data/ext/filter.c +11 -10
- data/ext/fs_store.c +65 -47
- data/ext/global.c +245 -165
- data/ext/global.h +252 -54
- data/ext/hash.c +200 -243
- data/ext/hash.h +205 -163
- data/ext/hashset.c +118 -96
- data/ext/hashset.h +110 -82
- data/ext/header.h +19 -19
- data/ext/helper.c +11 -10
- data/ext/helper.h +14 -6
- data/ext/index.c +745 -366
- data/ext/index.h +503 -529
- data/ext/internal.h +1020 -0
- data/ext/lang.c +10 -0
- data/ext/lang.h +35 -15
- data/ext/mempool.c +5 -4
- data/ext/mempool.h +30 -22
- data/ext/modules.h +35 -7
- data/ext/multimapper.c +43 -2
- data/ext/multimapper.h +32 -23
- data/ext/posh.c +0 -0
- data/ext/posh.h +4 -38
- data/ext/priorityqueue.c +10 -12
- data/ext/priorityqueue.h +33 -21
- data/ext/q_boolean.c +22 -9
- data/ext/q_const_score.c +3 -2
- data/ext/q_filtered_query.c +15 -12
- data/ext/q_fuzzy.c +147 -135
- data/ext/q_match_all.c +3 -2
- data/ext/q_multi_term.c +28 -32
- data/ext/q_parser.c +451 -173
- data/ext/q_phrase.c +158 -79
- data/ext/q_prefix.c +16 -18
- data/ext/q_range.c +363 -31
- data/ext/q_span.c +130 -141
- data/ext/q_term.c +21 -21
- data/ext/q_wildcard.c +19 -23
- data/ext/r_analysis.c +369 -242
- data/ext/r_index.c +421 -434
- data/ext/r_qparser.c +142 -92
- data/ext/r_search.c +790 -407
- data/ext/r_store.c +44 -44
- data/ext/r_utils.c +264 -96
- data/ext/ram_store.c +29 -23
- data/ext/scanner.c +895 -0
- data/ext/scanner.h +36 -0
- data/ext/scanner_mb.c +6701 -0
- data/ext/scanner_utf8.c +4415 -0
- data/ext/search.c +210 -87
- data/ext/search.h +556 -488
- data/ext/similarity.c +17 -16
- data/ext/similarity.h +51 -44
- data/ext/sort.c +157 -354
- data/ext/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/stem_UTF_8_hungarian.h +16 -0
- data/ext/stem_UTF_8_romanian.h +16 -0
- data/ext/stem_UTF_8_turkish.h +16 -0
- data/ext/stopwords.c +287 -278
- data/ext/store.c +57 -51
- data/ext/store.h +308 -286
- data/ext/symbol.c +10 -0
- data/ext/symbol.h +23 -0
- data/ext/term_vectors.c +14 -293
- data/ext/threading.h +22 -22
- data/ext/win32.h +12 -4
- data/lib/ferret.rb +2 -1
- data/lib/ferret/browser.rb +1 -1
- data/lib/ferret/field_symbol.rb +94 -0
- data/lib/ferret/index.rb +221 -34
- data/lib/ferret/number_tools.rb +6 -6
- data/lib/ferret/version.rb +3 -0
- data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
- data/test/test_helper.rb +7 -2
- data/test/test_installed.rb +1 -0
- data/test/threading/thread_safety_index_test.rb +10 -1
- data/test/threading/thread_safety_read_write_test.rb +4 -7
- data/test/threading/thread_safety_test.rb +0 -0
- data/test/unit/analysis/tc_analyzer.rb +29 -27
- data/test/unit/analysis/tc_token_stream.rb +23 -16
- data/test/unit/index/tc_index.rb +116 -11
- data/test/unit/index/tc_index_reader.rb +27 -27
- data/test/unit/index/tc_index_writer.rb +10 -0
- data/test/unit/index/th_doc.rb +38 -21
- data/test/unit/search/tc_filter.rb +31 -10
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/search/tm_searcher.rb +53 -1
- data/test/unit/store/tc_fs_store.rb +40 -2
- data/test/unit/store/tc_ram_store.rb +0 -0
- data/test/unit/store/tm_store.rb +0 -0
- data/test/unit/store/tm_store_lock.rb +7 -6
- data/test/unit/tc_field_symbol.rb +26 -0
- data/test/unit/ts_analysis.rb +0 -0
- data/test/unit/ts_index.rb +0 -0
- data/test/unit/ts_store.rb +0 -0
- data/test/unit/ts_utils.rb +0 -0
- data/test/unit/utils/tc_number_tools.rb +0 -0
- data/test/utils/content_generator.rb +226 -0
- metadata +262 -221
- data/ext/inc/lang.h +0 -48
- data/ext/inc/threading.h +0 -31
- data/ext/stem_ISO_8859_1_english.c +0 -1156
- data/ext/stem_ISO_8859_1_french.c +0 -1276
- data/ext/stem_ISO_8859_1_italian.c +0 -1091
- data/ext/stem_ISO_8859_1_norwegian.c +0 -296
- data/ext/stem_ISO_8859_1_spanish.c +0 -1119
- data/ext/stem_ISO_8859_1_swedish.c +0 -307
- data/ext/stem_UTF_8_danish.c +0 -344
- data/ext/stem_UTF_8_english.c +0 -1176
- data/ext/stem_UTF_8_french.c +0 -1296
- data/ext/stem_UTF_8_italian.c +0 -1113
- data/ext/stem_UTF_8_norwegian.c +0 -302
- data/ext/stem_UTF_8_portuguese.c +0 -1055
- data/ext/stem_UTF_8_russian.c +0 -709
- data/ext/stem_UTF_8_spanish.c +0 -1137
- data/ext/stem_UTF_8_swedish.c +0 -313
- data/lib/ferret_version.rb +0 -3
data/lib/ferret/number_tools.rb
CHANGED
@@ -145,12 +145,12 @@ class String
|
|
145
145
|
|
146
146
|
def get_lex_format(len)
|
147
147
|
case len
|
148
|
-
when 0.. 3
|
149
|
-
when 4.. 5
|
150
|
-
when 6.. 7
|
151
|
-
when 8.. 9
|
152
|
-
when 10..11
|
153
|
-
when 12..13
|
148
|
+
when 0.. 3 then ""
|
149
|
+
when 4.. 5 then "%Y"
|
150
|
+
when 6.. 7 then "%Y%m"
|
151
|
+
when 8.. 9 then "%Y%m%d"
|
152
|
+
when 10..11 then "%Y%m%d%H"
|
153
|
+
when 12..13 then "%Y%m%d%H%M"
|
154
154
|
else "%Y%m%d%H%M%S"
|
155
155
|
end
|
156
156
|
end
|
data/test/test_helper.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
$:.unshift File.dirname(__FILE__)
|
2
|
-
|
3
|
-
|
2
|
+
if $test_installed_gem
|
3
|
+
require 'rubygems'
|
4
|
+
require 'ferret'
|
5
|
+
else
|
6
|
+
$:.unshift File.join(File.dirname(__FILE__), '../lib')
|
7
|
+
$:.unshift File.join(File.dirname(__FILE__), '../ext')
|
8
|
+
end
|
4
9
|
|
5
10
|
ENV['LANG'] = "en_US.UTF-8"
|
6
11
|
ENV['LC_CTYPE'] = "en_US.UTF-8"
|
data/test/test_installed.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
$test_installed_gem = true
|
data/test/threading/thread_safety_index_test.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
$:.unshift('.')
|
2
|
+
require 'monitor'
|
1
3
|
require File.dirname(__FILE__) + "/../test_helper"
|
2
4
|
require File.dirname(__FILE__) + "/number_to_spoken.rb"
|
3
5
|
require 'thread'
|
@@ -21,6 +23,7 @@ class IndexThreadSafetyTest < Test::Unit::TestCase
|
|
21
23
|
def indexing_thread()
|
22
24
|
index = Index.new(:path => INDEX_DIR,
|
23
25
|
:analyzer => ANALYZER,
|
26
|
+
:auto_flush => true,
|
24
27
|
:default_field => :content)
|
25
28
|
|
26
29
|
ITERATIONS.times do
|
@@ -37,6 +40,10 @@ class IndexThreadSafetyTest < Test::Unit::TestCase
|
|
37
40
|
end
|
38
41
|
index.commit
|
39
42
|
end
|
43
|
+
rescue Exception => e
|
44
|
+
puts e
|
45
|
+
puts e.backtrace
|
46
|
+
raise 'hell'
|
40
47
|
end
|
41
48
|
|
42
49
|
def do_optimize(index)
|
@@ -74,6 +81,8 @@ class IndexThreadSafetyTest < Test::Unit::TestCase
|
|
74
81
|
threads << Thread.new { indexing_thread }
|
75
82
|
end
|
76
83
|
|
77
|
-
threads.each {|t|
|
84
|
+
threads.each {|t|
|
85
|
+
t.join
|
86
|
+
}
|
78
87
|
end
|
79
88
|
end
|
data/test/threading/thread_safety_read_write_test.rb
CHANGED
@@ -1,20 +1,19 @@
|
|
1
1
|
require File.dirname(__FILE__) + "/../test_helper"
|
2
|
-
require File.dirname(__FILE__) + "
|
2
|
+
require File.dirname(__FILE__) + "/number_to_spoken.rb"
|
3
3
|
require 'thread'
|
4
4
|
|
5
5
|
class IndexThreadSafetyReadWriteTest < Test::Unit::TestCase
|
6
6
|
include Ferret::Index
|
7
|
-
include Ferret::Document
|
8
7
|
|
9
8
|
INDEX_DIR = File.expand_path(File.join(File.dirname(__FILE__), "index"))
|
10
9
|
ITERATIONS = 10000
|
11
10
|
ANALYZER = Ferret::Analysis::Analyzer.new()
|
12
11
|
|
13
12
|
def setup
|
14
|
-
@index = Index.new(:path =>
|
13
|
+
@index = Index.new(:path => INDEX_DIR,
|
15
14
|
:create => true,
|
16
15
|
:analyzer => ANALYZER,
|
17
|
-
:default_field =>
|
16
|
+
:default_field => :content)
|
18
17
|
end
|
19
18
|
|
20
19
|
def search_thread()
|
@@ -42,10 +41,8 @@ class IndexThreadSafetyReadWriteTest < Test::Unit::TestCase
|
|
42
41
|
end
|
43
42
|
|
44
43
|
def do_add_doc
|
45
|
-
d = Document.new()
|
46
44
|
n = rand(0xFFFFFFFF)
|
47
|
-
d
|
48
|
-
d << Field.new("contents", n.to_spoken, Field::Store::NO, Field::Index::TOKENIZED)
|
45
|
+
d = {:id => n.to_s, :content => n.to_spoken}
|
49
46
|
puts("Adding #{n}")
|
50
47
|
begin
|
51
48
|
@index << d
|
data/test/threading/thread_safety_test.rb
File without changes
|
data/test/unit/analysis/tc_analyzer.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require File.dirname(__FILE__) + "/../../test_helper"
|
2
4
|
|
3
5
|
class AnalyzerTest < Test::Unit::TestCase
|
4
6
|
include Ferret::Analysis
|
5
7
|
|
6
8
|
def test_analyzer()
|
7
|
-
input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23
|
9
|
+
input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#!$'
|
8
10
|
a = Analyzer.new()
|
9
11
|
t = a.token_stream("fieldname", input)
|
10
12
|
t2 = a.token_stream("fieldname", input)
|
@@ -44,7 +46,7 @@ class AsciiLetterAnalyzerTest < Test::Unit::TestCase
|
|
44
46
|
include Ferret::Analysis
|
45
47
|
|
46
48
|
def test_letter_analyzer()
|
47
|
-
input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23
|
49
|
+
input = 'DBalmain@gmail.com is My E-Mail 523@#$ ADDRESS. 23#!$'
|
48
50
|
a = AsciiLetterAnalyzer.new()
|
49
51
|
t = a.token_stream("fieldname", input)
|
50
52
|
t2 = a.token_stream("fieldname", input)
|
@@ -85,7 +87,7 @@ class LetterAnalyzerTest < Test::Unit::TestCase
|
|
85
87
|
|
86
88
|
def test_letter_analyzer()
|
87
89
|
Ferret.locale = ""
|
88
|
-
input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23
|
90
|
+
input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
|
89
91
|
a = LetterAnalyzer.new(false)
|
90
92
|
t = a.token_stream("fieldname", input)
|
91
93
|
t2 = a.token_stream("fieldname", input)
|
@@ -137,7 +139,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
|
|
137
139
|
include Ferret::Analysis
|
138
140
|
|
139
141
|
def test_white_space_analyzer()
|
140
|
-
input = 'DBalmain@gmail.com is My E-Mail 52 #$ ADDRESS. 23
|
142
|
+
input = 'DBalmain@gmail.com is My E-Mail 52 #$ ADDRESS. 23#!$'
|
141
143
|
a = AsciiWhiteSpaceAnalyzer.new()
|
142
144
|
t = a.token_stream("fieldname", input)
|
143
145
|
t2 = a.token_stream("fieldname", input)
|
@@ -148,7 +150,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
|
|
148
150
|
assert_equal(Token.new('52', 32, 34), t.next)
|
149
151
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
150
152
|
assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
|
151
|
-
assert_equal(Token.new('23
|
153
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
152
154
|
assert(! t.next())
|
153
155
|
assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t2.next)
|
154
156
|
assert_equal(Token.new('is', 19, 21), t2.next)
|
@@ -157,7 +159,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
|
|
157
159
|
assert_equal(Token.new('52', 32, 34), t2.next)
|
158
160
|
assert_equal(Token.new('#$', 37, 39), t2.next)
|
159
161
|
assert_equal(Token.new('ADDRESS.', 40, 48), t2.next)
|
160
|
-
assert_equal(Token.new('23
|
162
|
+
assert_equal(Token.new('23#!$', 49, 54), t2.next)
|
161
163
|
assert(! t2.next())
|
162
164
|
a = AsciiWhiteSpaceAnalyzer.new(true)
|
163
165
|
t = a.token_stream("fieldname", input)
|
@@ -168,7 +170,7 @@ class AsciiWhiteSpaceAnalyzerTest < Test::Unit::TestCase
|
|
168
170
|
assert_equal(Token.new('52', 32, 34), t.next)
|
169
171
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
170
172
|
assert_equal(Token.new('address.', 40, 48), t.next)
|
171
|
-
assert_equal(Token.new('23
|
173
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
172
174
|
assert(! t.next())
|
173
175
|
end
|
174
176
|
end
|
@@ -177,7 +179,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
|
|
177
179
|
include Ferret::Analysis
|
178
180
|
|
179
181
|
def test_white_space_analyzer()
|
180
|
-
input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23
|
182
|
+
input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
|
181
183
|
a = WhiteSpaceAnalyzer.new()
|
182
184
|
t = a.token_stream("fieldname", input)
|
183
185
|
t2 = a.token_stream("fieldname", input)
|
@@ -188,7 +190,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
|
|
188
190
|
assert_equal(Token.new('52', 32, 34), t.next)
|
189
191
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
190
192
|
assert_equal(Token.new('address.', 40, 48), t.next)
|
191
|
-
assert_equal(Token.new('23
|
193
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
192
194
|
assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØìÖÎÍ', 55, 86), t.next)
|
193
195
|
assert(! t.next())
|
194
196
|
assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t2.next)
|
@@ -198,7 +200,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
|
|
198
200
|
assert_equal(Token.new('52', 32, 34), t2.next)
|
199
201
|
assert_equal(Token.new('#$', 37, 39), t2.next)
|
200
202
|
assert_equal(Token.new('address.', 40, 48), t2.next)
|
201
|
-
assert_equal(Token.new('23
|
203
|
+
assert_equal(Token.new('23#!$', 49, 54), t2.next)
|
202
204
|
assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØìÖÎÍ', 55, 86), t2.next)
|
203
205
|
assert(! t2.next())
|
204
206
|
a = WhiteSpaceAnalyzer.new(true)
|
@@ -210,7 +212,7 @@ class WhiteSpaceAnalyzerTest < Test::Unit::TestCase
|
|
210
212
|
assert_equal(Token.new('52', 32, 34), t.next)
|
211
213
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
212
214
|
assert_equal(Token.new('address.', 40, 48), t.next)
|
213
|
-
assert_equal(Token.new('23
|
215
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
214
216
|
assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
|
215
217
|
assert(! t.next())
|
216
218
|
end
|
@@ -220,7 +222,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
|
|
220
222
|
include Ferret::Analysis
|
221
223
|
|
222
224
|
def test_standard_analyzer()
|
223
|
-
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23
|
225
|
+
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
|
224
226
|
a = AsciiStandardAnalyzer.new()
|
225
227
|
t = a.token_stream("fieldname", input)
|
226
228
|
t2 = a.token_stream("fieldname", input)
|
@@ -231,7 +233,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
|
|
231
233
|
assert_equal(Token.new('52', 32, 34), t.next)
|
232
234
|
assert_equal(Token.new('address', 40, 47), t.next)
|
233
235
|
assert_equal(Token.new('23', 49, 51), t.next)
|
234
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
236
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
|
235
237
|
assert_equal(Token.new('tnt', 86, 91), t.next)
|
236
238
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
|
237
239
|
assert(! t.next())
|
@@ -242,7 +244,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
|
|
242
244
|
assert_equal(Token.new('52', 32, 34), t2.next)
|
243
245
|
assert_equal(Token.new('address', 40, 47), t2.next)
|
244
246
|
assert_equal(Token.new('23', 49, 51), t2.next)
|
245
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
247
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
|
246
248
|
assert_equal(Token.new('tnt', 86, 91), t2.next)
|
247
249
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
|
248
250
|
assert(! t2.next())
|
@@ -257,7 +259,7 @@ class AsciiStandardAnalyzerTest < Test::Unit::TestCase
|
|
257
259
|
assert_equal(Token.new('52', 32, 34), t.next)
|
258
260
|
assert_equal(Token.new('Address', 40, 47), t.next)
|
259
261
|
assert_equal(Token.new('23', 49, 51), t.next)
|
260
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
262
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
|
261
263
|
assert_equal(Token.new('TNT', 86, 91), t.next)
|
262
264
|
assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
|
263
265
|
assert(! t.next())
|
@@ -268,7 +270,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
|
|
268
270
|
include Ferret::Analysis
|
269
271
|
|
270
272
|
def test_standard_analyzer()
|
271
|
-
input = 'DBalmán@gmail.com is My e-mail and the Address. 23
|
273
|
+
input = 'DBalmán@gmail.com is My e-mail and the Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
|
272
274
|
a = StandardAnalyzer.new()
|
273
275
|
t = a.token_stream("fieldname", input)
|
274
276
|
t2 = a.token_stream("fieldname", input)
|
@@ -278,7 +280,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
|
|
278
280
|
assert_equal(Token.new('mail', 27, 31), t.next)
|
279
281
|
assert_equal(Token.new('address', 40, 47), t.next)
|
280
282
|
assert_equal(Token.new('23', 49, 51), t.next)
|
281
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
283
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
|
282
284
|
assert_equal(Token.new('tnt', 86, 91), t.next)
|
283
285
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
|
284
286
|
assert_equal(Token.new('23', 111, 113), t.next)
|
@@ -293,7 +295,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
|
|
293
295
|
assert_equal(Token.new('mail', 27, 31), t2.next)
|
294
296
|
assert_equal(Token.new('address', 40, 47), t2.next)
|
295
297
|
assert_equal(Token.new('23', 49, 51), t2.next)
|
296
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
298
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
|
297
299
|
assert_equal(Token.new('tnt', 86, 91), t2.next)
|
298
300
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
|
299
301
|
assert_equal(Token.new('23', 111, 113), t2.next)
|
@@ -311,7 +313,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
|
|
311
313
|
assert_equal(Token.new('mail', 27, 31), t.next)
|
312
314
|
assert_equal(Token.new('Address', 40, 47), t.next)
|
313
315
|
assert_equal(Token.new('23', 49, 51), t.next)
|
314
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
316
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
|
315
317
|
assert_equal(Token.new('TNT', 86, 91), t.next)
|
316
318
|
assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
|
317
319
|
assert_equal(Token.new('23', 111, 113), t.next)
|
@@ -329,7 +331,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
|
|
329
331
|
assert_equal(Token.new('and', 32, 35), t.next)
|
330
332
|
assert_equal(Token.new('the', 36, 39), t.next)
|
331
333
|
assert_equal(Token.new('address', 40, 47), t.next)
|
332
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
334
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
|
333
335
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
|
334
336
|
assert_equal(Token.new('áägç', 117, 124), t.next)
|
335
337
|
assert_equal(Token.new('êëì', 126, 132), t.next)
|
@@ -342,7 +344,7 @@ class StandardAnalyzerTest < Test::Unit::TestCase
|
|
342
344
|
assert_equal(Token.new('and', 32, 35), t2.next)
|
343
345
|
assert_equal(Token.new('the', 36, 39), t2.next)
|
344
346
|
assert_equal(Token.new('address', 40, 47), t2.next)
|
345
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
347
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t2.next)
|
346
348
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t2.next)
|
347
349
|
assert_equal(Token.new('áägç', 117, 124), t2.next)
|
348
350
|
assert_equal(Token.new('êëì', 126, 132), t2.next)
|
@@ -355,7 +357,7 @@ end if (/utf-8/i =~ Ferret.locale)
|
|
355
357
|
class PerFieldAnalyzerTest < Test::Unit::TestCase
|
356
358
|
include Ferret::Analysis
|
357
359
|
def test_per_field_analyzer()
|
358
|
-
input = 'DBalmain@gmail.com is My e-mail 52 #$ address. 23
|
360
|
+
input = 'DBalmain@gmail.com is My e-mail 52 #$ address. 23#!$'
|
359
361
|
pfa = PerFieldAnalyzer.new(StandardAnalyzer.new())
|
360
362
|
pfa['white'] = WhiteSpaceAnalyzer.new(false)
|
361
363
|
pfa['white_l'] = WhiteSpaceAnalyzer.new(true)
|
@@ -370,7 +372,7 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
|
|
370
372
|
assert_equal(Token.new('52', 32, 34), t.next)
|
371
373
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
372
374
|
assert_equal(Token.new('address.', 40, 48), t.next)
|
373
|
-
assert_equal(Token.new('23
|
375
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
374
376
|
assert(! t.next())
|
375
377
|
t = pfa.token_stream('white_l', input)
|
376
378
|
assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
|
@@ -380,7 +382,7 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
|
|
380
382
|
assert_equal(Token.new('52', 32, 34), t.next)
|
381
383
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
382
384
|
assert_equal(Token.new('address.', 40, 48), t.next)
|
383
|
-
assert_equal(Token.new('23
|
385
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
384
386
|
assert(! t.next())
|
385
387
|
t = pfa.token_stream('letter_u', input)
|
386
388
|
assert_equal(Token.new('DBalmain', 0, 8), t.next)
|
@@ -418,7 +420,7 @@ class RegExpAnalyzerTest < Test::Unit::TestCase
|
|
418
420
|
include Ferret::Analysis
|
419
421
|
|
420
422
|
def test_reg_exp_analyzer()
|
421
|
-
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23
|
423
|
+
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
|
422
424
|
a = RegExpAnalyzer.new()
|
423
425
|
t = a.token_stream('XXX', input)
|
424
426
|
t2 = a.token_stream('XXX', "one_Two three")
|
@@ -510,7 +512,7 @@ class CustomAnalyzerTest < Test::Unit::TestCase
|
|
510
512
|
include Ferret::Analysis
|
511
513
|
|
512
514
|
def test_custom_filter()
|
513
|
-
input = 'DBalmán@gmail.com is My e-mail and the Address. 23
|
515
|
+
input = 'DBalmán@gmail.com is My e-mail and the Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
|
514
516
|
a = StemmingStandardAnalyzer.new()
|
515
517
|
t = a.token_stream("fieldname", input)
|
516
518
|
assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
|
@@ -519,7 +521,7 @@ class CustomAnalyzerTest < Test::Unit::TestCase
|
|
519
521
|
assert_equal(Token.new('mail', 27, 31), t.next)
|
520
522
|
assert_equal(Token.new('address', 40, 47), t.next)
|
521
523
|
assert_equal(Token.new('23', 49, 51), t.next)
|
522
|
-
assert_equal(Token.new('www.google.com/result', 55,
|
524
|
+
assert_equal(Token.new('www.google.com/result', 55, 85), t.next)
|
523
525
|
assert_equal(Token.new('tnt', 86, 91), t.next)
|
524
526
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
|
525
527
|
assert_equal(Token.new('23', 111, 113), t.next)
|
data/test/unit/analysis/tc_token_stream.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require File.dirname(__FILE__) + "/../../test_helper"
|
2
4
|
|
3
5
|
puts "Loading once"
|
@@ -27,7 +29,7 @@ class AsciiLetterTokenizerTest < Test::Unit::TestCase
|
|
27
29
|
include Ferret::Analysis
|
28
30
|
|
29
31
|
def test_letter_tokenizer()
|
30
|
-
input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23
|
32
|
+
input = 'DBalmain@gmail.com is My e-mail 523@#$ ADDRESS. 23#!$'
|
31
33
|
t = AsciiLetterTokenizer.new(input)
|
32
34
|
assert_equal(Token.new("DBalmain", 0, 8), t.next())
|
33
35
|
assert_equal(Token.new("gmail", 9, 14), t.next())
|
@@ -60,7 +62,7 @@ class LetterTokenizerTest < Test::Unit::TestCase
|
|
60
62
|
include Ferret::Analysis
|
61
63
|
|
62
64
|
def test_letter_tokenizer()
|
63
|
-
input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23
|
65
|
+
input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
|
64
66
|
t = LetterTokenizer.new(input)
|
65
67
|
assert_equal(Token.new('DBalmän', 0, 8), t.next)
|
66
68
|
assert_equal(Token.new('gmail', 9, 14), t.next)
|
@@ -115,7 +117,7 @@ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
|
|
115
117
|
include Ferret::Analysis
|
116
118
|
|
117
119
|
def test_whitespace_tokenizer()
|
118
|
-
input = 'DBalmain@gmail.com is My e-mail 52 #$ ADDRESS. 23
|
120
|
+
input = 'DBalmain@gmail.com is My e-mail 52 #$ ADDRESS. 23#!$'
|
119
121
|
t = AsciiWhiteSpaceTokenizer.new(input)
|
120
122
|
assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
|
121
123
|
assert_equal(Token.new('is', 19, 21), t.next)
|
@@ -124,7 +126,7 @@ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
|
|
124
126
|
assert_equal(Token.new('52', 32, 34), t.next)
|
125
127
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
126
128
|
assert_equal(Token.new('ADDRESS.', 40, 48), t.next)
|
127
|
-
assert_equal(Token.new('23
|
129
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
128
130
|
assert(! t.next())
|
129
131
|
t.text = "one_two three"
|
130
132
|
assert_equal(Token.new("one_two", 0, 7), t.next())
|
@@ -138,7 +140,7 @@ class AsciiWhiteSpaceTokenizerTest < Test::Unit::TestCase
|
|
138
140
|
assert_equal(Token.new('52', 32, 34), t.next)
|
139
141
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
140
142
|
assert_equal(Token.new('address.', 40, 48), t.next)
|
141
|
-
assert_equal(Token.new('23
|
143
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
142
144
|
assert(! t.next())
|
143
145
|
end
|
144
146
|
end
|
@@ -147,7 +149,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
|
|
147
149
|
include Ferret::Analysis
|
148
150
|
|
149
151
|
def test_whitespace_tokenizer()
|
150
|
-
input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23
|
152
|
+
input = 'DBalmän@gmail.com is My e-mail 52 #$ address. 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
|
151
153
|
t = WhiteSpaceTokenizer.new(input)
|
152
154
|
assert_equal(Token.new('DBalmän@gmail.com', 0, 18), t.next)
|
153
155
|
assert_equal(Token.new('is', 19, 21), t.next)
|
@@ -156,7 +158,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
|
|
156
158
|
assert_equal(Token.new('52', 32, 34), t.next)
|
157
159
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
158
160
|
assert_equal(Token.new('address.', 40, 48), t.next)
|
159
|
-
assert_equal(Token.new('23
|
161
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
160
162
|
assert_equal(Token.new('ÁÄGÇ®ÊË̯ÚØìÖÎÍ', 55, 86), t.next)
|
161
163
|
assert(! t.next())
|
162
164
|
t.text = "one_two three"
|
@@ -171,7 +173,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
|
|
171
173
|
assert_equal(Token.new('52', 32, 34), t.next)
|
172
174
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
173
175
|
assert_equal(Token.new('address.', 40, 48), t.next)
|
174
|
-
assert_equal(Token.new('23
|
176
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
175
177
|
assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
|
176
178
|
assert(! t.next())
|
177
179
|
t = WhiteSpaceTokenizer.new(input, true)
|
@@ -182,7 +184,7 @@ class WhiteSpaceTokenizerTest < Test::Unit::TestCase
|
|
182
184
|
assert_equal(Token.new('52', 32, 34), t.next)
|
183
185
|
assert_equal(Token.new('#$', 37, 39), t.next)
|
184
186
|
assert_equal(Token.new('address.', 40, 48), t.next)
|
185
|
-
assert_equal(Token.new('23
|
187
|
+
assert_equal(Token.new('23#!$', 49, 54), t.next)
|
186
188
|
assert_equal(Token.new('áägç®êëì¯úøã¬öîí', 55, 86), t.next)
|
187
189
|
assert(! t.next())
|
188
190
|
end
|
@@ -192,7 +194,7 @@ class AsciiStandardTokenizerTest < Test::Unit::TestCase
|
|
192
194
|
include Ferret::Analysis
|
193
195
|
|
194
196
|
def test_standard_tokenizer()
|
195
|
-
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23
|
197
|
+
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234'
|
196
198
|
t = AsciiStandardTokenizer.new(input)
|
197
199
|
assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
|
198
200
|
assert_equal(Token.new('is', 19, 21), t.next)
|
@@ -201,7 +203,7 @@ class AsciiStandardTokenizerTest < Test::Unit::TestCase
|
|
201
203
|
assert_equal(Token.new('52', 32, 34), t.next)
|
202
204
|
assert_equal(Token.new('Address', 40, 47), t.next)
|
203
205
|
assert_equal(Token.new('23', 49, 51), t.next)
|
204
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
206
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
|
205
207
|
assert_equal(Token.new('TNT', 86, 91), t.next)
|
206
208
|
assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
|
207
209
|
assert(! t.next())
|
@@ -217,7 +219,7 @@ class AsciiStandardTokenizerTest < Test::Unit::TestCase
|
|
217
219
|
assert_equal(Token.new('52', 32, 34), t.next)
|
218
220
|
assert_equal(Token.new('address', 40, 47), t.next)
|
219
221
|
assert_equal(Token.new('23', 49, 51), t.next)
|
220
|
-
assert_equal(Token.new('www.google.com/results', 55,
|
222
|
+
assert_equal(Token.new('www.google.com/results', 55, 85), t.next)
|
221
223
|
assert_equal(Token.new('tnt', 86, 91), t.next)
|
222
224
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
|
223
225
|
assert(! t.next())
|
@@ -228,7 +230,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
|
|
228
230
|
include Ferret::Analysis
|
229
231
|
|
230
232
|
def test_standard_tokenizer()
|
231
|
-
input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23
|
233
|
+
input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#!$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
|
232
234
|
t = StandardTokenizer.new(input)
|
233
235
|
assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
|
234
236
|
assert_equal(Token.new('is', 19, 21), t.next)
|
@@ -237,7 +239,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
|
|
237
239
|
assert_equal(Token.new('52', 32, 34), t.next)
|
238
240
|
assert_equal(Token.new('Address', 40, 47), t.next)
|
239
241
|
assert_equal(Token.new('23', 49, 51), t.next)
|
240
|
-
assert_equal(Token.new('www.google.com/res_345', 55,
|
242
|
+
assert_equal(Token.new('www.google.com/res_345', 55, 85), t.next)
|
241
243
|
assert_equal(Token.new('TNT', 86, 91), t.next)
|
242
244
|
assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
|
243
245
|
assert_equal(Token.new('23', 111, 113), t.next)
|
@@ -258,7 +260,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
|
|
258
260
|
assert_equal(Token.new('52', 32, 34), t.next)
|
259
261
|
assert_equal(Token.new('address', 40, 47), t.next)
|
260
262
|
assert_equal(Token.new('23', 49, 51), t.next)
|
261
|
-
assert_equal(Token.new('www.google.com/res_345', 55,
|
263
|
+
assert_equal(Token.new('www.google.com/res_345', 55, 85), t.next)
|
262
264
|
assert_equal(Token.new('tnt', 86, 91), t.next)
|
263
265
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
|
264
266
|
assert_equal(Token.new('23', 111, 113), t.next)
|
@@ -287,7 +289,7 @@ class RegExpTokenizerTest < Test::Unit::TestCase
|
|
287
289
|
APOSTROPHE_WORD = /^#{APOSTROPHE}$/
|
288
290
|
|
289
291
|
def test_reg_exp_tokenizer()
|
290
|
-
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23
|
292
|
+
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#!$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
|
291
293
|
t = RegExpTokenizer.new(input)
|
292
294
|
assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
|
293
295
|
assert_equal(Token.new('is', 19, 21), t.next)
|
@@ -483,6 +485,11 @@ class StemFilterTest < Test::Unit::TestCase
|
|
483
485
|
assert_equal(Token.new("dêbater", 36, 44), t.next)
|
484
486
|
assert(! t.next())
|
485
487
|
end
|
488
|
+
|
489
|
+
tz = AsciiLetterTokenizer.new(input)
|
490
|
+
assert_not_nil(StemFilter.new(tz,'HunGarIaN', 'Utf-8'))
|
491
|
+
assert_not_nil(StemFilter.new(tz,'romanIAN', 'iso-8859-2'))
|
492
|
+
assert_raises(ArgumentError) {StemFilter.new(tz, 'Jibberish', 'UTF-8')}
|
486
493
|
end
|
487
494
|
end
|
488
495
|
|