ferret 0.9.1 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/lib/ferret/search/query.rb
CHANGED
@@ -62,15 +62,44 @@ module Ferret::Search
|
|
62
62
|
|
63
63
|
# Expert: called when re-writing queries under MultiSearcher.
|
64
64
|
#
|
65
|
-
#
|
66
|
-
#
|
65
|
+
# Create a single query suitable for use by all subsearchers (in 1-1
|
66
|
+
# correspondence with queries). This is an optimization of the OR of
|
67
|
+
# all queries. We handle the common optimization cases of equal
|
68
|
+
# queries and overlapping clauses of boolean OR queries (as generated
|
69
|
+
# by MultiTermQuery.rewrite() and RangeQuery.rewrite()).
|
70
|
+
# Be careful overriding this method as queries[0] determines which
|
71
|
+
# method will be called and is not necessarily of the same type as
|
72
|
+
# the other queries.
|
67
73
|
def combine(queries)
|
74
|
+
uniques = Set.new
|
68
75
|
queries.each do |query|
|
69
|
-
|
70
|
-
|
76
|
+
clauses = []
|
77
|
+
# check if we can split the query into clauses
|
78
|
+
splittable = query.respond_to? :clauses
|
79
|
+
if splittable
|
80
|
+
splittable = query.coord_disabled?
|
81
|
+
clauses = query.clauses
|
82
|
+
clauses.each do |clause|
|
83
|
+
splittable = clause.occur == BooleanClause::Occur::SHOULD
|
84
|
+
break unless splittable
|
85
|
+
end
|
86
|
+
end
|
87
|
+
if splittable
|
88
|
+
clauses.each { |clause| uniques << clause.query }
|
89
|
+
else
|
90
|
+
uniques << query
|
71
91
|
end
|
72
92
|
end
|
73
|
-
return
|
93
|
+
# optimization: if we have just one query, just return it
|
94
|
+
if uniques.size == 1
|
95
|
+
uniques.each { |query| return query }
|
96
|
+
end
|
97
|
+
|
98
|
+
result = BooleanQuery.new(true)
|
99
|
+
uniques.each do |query|
|
100
|
+
result.add_query(query, BooleanClause::Occur::SHOULD)
|
101
|
+
end
|
102
|
+
return result
|
74
103
|
end
|
75
104
|
|
76
105
|
# Expert: adds all terms occurring in this query to the terms set
|
data/lib/ferret/search/sort.rb
CHANGED
@@ -84,8 +84,12 @@ module Ferret::Search
|
|
84
84
|
fields = fields.map {|field| field.is_a?(Symbol) ? field.to_s : field}
|
85
85
|
if fields[0].is_a?(String)
|
86
86
|
@fields = fields.map do |field|
|
87
|
-
|
88
|
-
|
87
|
+
if (field.is_a?(String))
|
88
|
+
next SortField.new(field, {:sort_type => SortField::SortType::AUTO,
|
89
|
+
:reverse => reverse})
|
90
|
+
else
|
91
|
+
next field
|
92
|
+
end
|
89
93
|
end
|
90
94
|
end
|
91
95
|
doc_sort_added = false
|
@@ -102,7 +106,7 @@ module Ferret::Search
|
|
102
106
|
INDEX_ORDER = Sort.new(SortField::FIELD_DOC)
|
103
107
|
|
104
108
|
def to_s()
|
105
|
-
return @fields.map {|field| "#{field}"}.join(", ")
|
109
|
+
return "Sort[" + @fields.map {|field| "#{field}"}.join(", ") + "]"
|
106
110
|
end
|
107
111
|
end
|
108
112
|
end
|
@@ -20,11 +20,11 @@ module Ferret::Search
|
|
20
20
|
|
21
21
|
# Sort by document score (relevancy). Sort values are Float and higher
|
22
22
|
# values are at the front.
|
23
|
-
SCORE = SortType.new("
|
23
|
+
SCORE = SortType.new("SCORE")
|
24
24
|
|
25
25
|
# Sort by document number (order). Sort values are Integer and lower
|
26
26
|
# values are at the front.
|
27
|
-
DOC = SortType.new("
|
27
|
+
DOC = SortType.new("DOC")
|
28
28
|
|
29
29
|
# Guess sort type of sort based on field contents. We try parsing the
|
30
30
|
# field as an integer and then as a floating point number. If we are
|
@@ -37,7 +37,7 @@ module Ferret::Search
|
|
37
37
|
|
38
38
|
# Sort using term values as encoded Integers. Sort values are Integer
|
39
39
|
# and lower values are at the front.
|
40
|
-
INTEGER = SortType.new("
|
40
|
+
INTEGER = SortType.new("integer", lambda{|str| str.to_i})
|
41
41
|
|
42
42
|
# Sort using term values as encoded Floats. Sort values are Float and
|
43
43
|
# lower values are at the front.
|
@@ -79,7 +79,11 @@ module Ferret::Search
|
|
79
79
|
FIELD_DOC = SortField.new(nil, {:sort_type => SortType::DOC})
|
80
80
|
|
81
81
|
def to_s()
|
82
|
-
|
82
|
+
if @name
|
83
|
+
buffer = "#@name:<#@sort_type>"
|
84
|
+
else
|
85
|
+
buffer = "<#{@sort_type}>"
|
86
|
+
end
|
83
87
|
buffer << '!' if @reverse
|
84
88
|
return buffer
|
85
89
|
end
|
@@ -102,13 +102,13 @@ module Ferret::Store
|
|
102
102
|
# delete all the files
|
103
103
|
refresh_dir
|
104
104
|
each do |fname|
|
105
|
-
|
105
|
+
FileUtils.rm_rf(dir_path(fname))
|
106
106
|
end
|
107
107
|
# clear all the locks
|
108
108
|
refresh_lock_dir
|
109
109
|
@lock_dir.each do |lock_fname|
|
110
110
|
next if lock_fname == '.' or lock_fname == '..'
|
111
|
-
|
111
|
+
FileUtils.rm_rf(@lock_dir.path + '/' + lock_fname)
|
112
112
|
end
|
113
113
|
end
|
114
114
|
end
|
@@ -159,7 +159,13 @@ module Ferret::Store
|
|
159
159
|
# This replacement should be atomic.
|
160
160
|
def rename(from, to)
|
161
161
|
synchronize do
|
162
|
-
|
162
|
+
begin
|
163
|
+
File.rename(dir_path(from), dir_path(to))
|
164
|
+
rescue
|
165
|
+
# try again, this time forcing the delete
|
166
|
+
FileUtils.rm_rf(dir_path(to))
|
167
|
+
FileUtils.cp(dir_path(from), dir_path(to))
|
168
|
+
end
|
163
169
|
end
|
164
170
|
end
|
165
171
|
|
@@ -208,11 +214,11 @@ module Ferret::Store
|
|
208
214
|
def initialize(lock_file)
|
209
215
|
@lock_file = lock_file
|
210
216
|
#@clean = FSLock.make_finalizer(lock_file)
|
211
|
-
@clean = lambda {
|
217
|
+
@clean = lambda { FileUtils.rm_rf(lock_file)}
|
212
218
|
end
|
213
219
|
|
214
220
|
def FSLock.make_finalizer(lock_file)
|
215
|
-
lambda {
|
221
|
+
lambda { FileUtils.rm_rf(lock_file)}
|
216
222
|
end
|
217
223
|
|
218
224
|
# obtain the lock on the data source
|
@@ -238,7 +244,7 @@ module Ferret::Store
|
|
238
244
|
def release
|
239
245
|
return if FSDirectory.locks_disabled?
|
240
246
|
begin
|
241
|
-
|
247
|
+
FileUtils.rm_rf(@lock_file)
|
242
248
|
ObjectSpace.undefine_finalizer(self)
|
243
249
|
rescue SystemCallError
|
244
250
|
# maybe we tried to release a lock that wasn't locked. This
|
@@ -364,6 +370,7 @@ module Ferret::Store
|
|
364
370
|
# This method is only used by the c extension to free the directory
|
365
371
|
def close_internal
|
366
372
|
end
|
373
|
+
|
367
374
|
#end private
|
368
375
|
end
|
369
376
|
end
|
@@ -213,20 +213,6 @@ module Ferret::Store
|
|
213
213
|
last = start + length
|
214
214
|
(start ... last).each do |i|
|
215
215
|
write_byte(buf[i])
|
216
|
-
# code = buf[i]
|
217
|
-
# if code >= 0x01 and code <= 0x7F
|
218
|
-
# write_byte(code)
|
219
|
-
# else
|
220
|
-
# # We need to write unicode characters. ToDo: test that this works.
|
221
|
-
# if code > 0x80 and code <= 0x7FF or code == 0
|
222
|
-
# write_byte(0xC0 | code >> 6)
|
223
|
-
# write_byte(0x80 | code & 0x3F)
|
224
|
-
# else
|
225
|
-
# write_byte(0xE0 | (code >> 12))
|
226
|
-
# write_byte(0x80 | ((code >> 6) & 0x3F))
|
227
|
-
# write_byte(0x80 | (code & 0x3F))
|
228
|
-
# end
|
229
|
-
# end
|
230
216
|
end
|
231
217
|
end
|
232
218
|
|
@@ -159,9 +159,10 @@ module Ferret::Store
|
|
159
159
|
flush()
|
160
160
|
last_buffer_number = (@file.length / BUFFER_SIZE).to_i
|
161
161
|
last_buffer_offset = @file.length % BUFFER_SIZE
|
162
|
-
|
162
|
+
|
163
|
+
(0..last_buffer_number).each do |i|
|
163
164
|
len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE)
|
164
|
-
output.write_bytes(
|
165
|
+
output.write_bytes(@file.buffers[i], len)
|
165
166
|
end
|
166
167
|
end
|
167
168
|
|
data/lib/rferret.rb
CHANGED
@@ -399,3 +399,134 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
|
|
399
399
|
assert(! t.next())
|
400
400
|
end
|
401
401
|
end
|
402
|
+
|
403
|
+
class RegExpAnalyzerTest < Test::Unit::TestCase
|
404
|
+
include Ferret::Analysis
|
405
|
+
|
406
|
+
def test_reg_exp_analyzer()
|
407
|
+
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
|
408
|
+
a = RegExpAnalyzer.new()
|
409
|
+
t = a.token_stream('XXX', input)
|
410
|
+
t2 = a.token_stream('XXX', "one_Two three")
|
411
|
+
assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
|
412
|
+
assert_equal(Token.new('is', 19, 21), t.next)
|
413
|
+
assert_equal(Token.new('my', 22, 24), t.next)
|
414
|
+
assert_equal(Token.new('e-mail', 25, 31), t.next)
|
415
|
+
assert_equal(Token.new('52', 32, 34), t.next)
|
416
|
+
assert_equal(Token.new('address', 40, 47), t.next)
|
417
|
+
assert_equal(Token.new('23', 49, 51), t.next)
|
418
|
+
assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
|
419
|
+
assert_equal(Token.new('t.n.t.', 91, 97), t.next)
|
420
|
+
assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
|
421
|
+
assert_equal(Token.new('23', 116, 118), t.next)
|
422
|
+
assert_equal(Token.new('rob\'s', 119, 124), t.next)
|
423
|
+
assert(! t.next())
|
424
|
+
t = t2
|
425
|
+
assert_equal(Token.new("one_two", 0, 7), t.next())
|
426
|
+
assert_equal(Token.new("three", 8, 13), t.next())
|
427
|
+
assert(! t.next())
|
428
|
+
a = RegExpAnalyzer.new(/\w{2,}/, false)
|
429
|
+
t = a.token_stream('XXX', input)
|
430
|
+
t2 = a.token_stream('XXX', "one Two three")
|
431
|
+
assert_equal(Token.new('DBalmain', 0, 8), t.next)
|
432
|
+
assert_equal(Token.new('gmail', 9, 14), t.next)
|
433
|
+
assert_equal(Token.new('com', 15, 18), t.next)
|
434
|
+
assert_equal(Token.new('is', 19, 21), t.next)
|
435
|
+
assert_equal(Token.new('My', 22, 24), t.next)
|
436
|
+
assert_equal(Token.new('mail', 27, 31), t.next)
|
437
|
+
assert_equal(Token.new('52', 32, 34), t.next)
|
438
|
+
assert_equal(Token.new('Address', 40, 47), t.next)
|
439
|
+
assert_equal(Token.new('23', 49, 51), t.next)
|
440
|
+
assert_equal(Token.new('http', 55, 59), t.next)
|
441
|
+
assert_equal(Token.new('www', 62, 65), t.next)
|
442
|
+
assert_equal(Token.new('google', 66, 72), t.next)
|
443
|
+
assert_equal(Token.new('com', 73, 76), t.next)
|
444
|
+
assert_equal(Token.new('RESULT_3', 77, 85), t.next)
|
445
|
+
assert_equal(Token.new('html', 86, 90), t.next)
|
446
|
+
assert_equal(Token.new('123', 98, 101), t.next)
|
447
|
+
assert_equal(Token.new('1235', 102, 106), t.next)
|
448
|
+
assert_equal(Token.new('ASD', 107, 110), t.next)
|
449
|
+
assert_equal(Token.new('1234', 111, 115), t.next)
|
450
|
+
assert_equal(Token.new('23', 116, 118), t.next)
|
451
|
+
assert_equal(Token.new('Rob', 119, 122), t.next)
|
452
|
+
assert(! t.next())
|
453
|
+
assert_equal(Token.new("one", 0, 3), t2.next())
|
454
|
+
assert_equal(Token.new("Two", 4, 7), t2.next())
|
455
|
+
assert_equal(Token.new("three", 8, 13), t2.next())
|
456
|
+
assert(! t2.next())
|
457
|
+
a = RegExpAnalyzer.new() do |str|
|
458
|
+
if str =~ /^[[:alpha:]]\.([[:alpha:]]\.)+$/
|
459
|
+
str.gsub!(/\./, '')
|
460
|
+
elsif str =~ /'[sS]$/
|
461
|
+
str.gsub!(/'[sS]$/, '')
|
462
|
+
end
|
463
|
+
str
|
464
|
+
end
|
465
|
+
t = a.token_stream('XXX', input)
|
466
|
+
t2 = a.token_stream('XXX', "one's don't T.N.T.")
|
467
|
+
assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
|
468
|
+
assert_equal(Token.new('is', 19, 21), t.next)
|
469
|
+
assert_equal(Token.new('my', 22, 24), t.next)
|
470
|
+
assert_equal(Token.new('e-mail', 25, 31), t.next)
|
471
|
+
assert_equal(Token.new('52', 32, 34), t.next)
|
472
|
+
assert_equal(Token.new('address', 40, 47), t.next)
|
473
|
+
assert_equal(Token.new('23', 49, 51), t.next)
|
474
|
+
assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
|
475
|
+
assert_equal(Token.new('tnt', 91, 97), t.next)
|
476
|
+
assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
|
477
|
+
assert_equal(Token.new('23', 116, 118), t.next)
|
478
|
+
assert_equal(Token.new('rob', 119, 124), t.next)
|
479
|
+
assert(! t.next())
|
480
|
+
assert_equal(Token.new("one", 0, 5), t2.next())
|
481
|
+
assert_equal(Token.new("don't", 6, 11), t2.next())
|
482
|
+
assert_equal(Token.new("tnt", 12, 18), t2.next())
|
483
|
+
assert(! t2.next())
|
484
|
+
end
|
485
|
+
end
|
486
|
+
|
487
|
+
module Ferret::Analysis
|
488
|
+
class StemmingStandardAnalyzer < StandardAnalyzer
|
489
|
+
def token_stream(field, text)
|
490
|
+
StemFilter.new(super)
|
491
|
+
end
|
492
|
+
end
|
493
|
+
end
|
494
|
+
|
495
|
+
class CustomAnalyzerTest < Test::Unit::TestCase
|
496
|
+
include Ferret::Analysis
|
497
|
+
|
498
|
+
def test_custom_filter()
|
499
|
+
input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
|
500
|
+
a = StemmingStandardAnalyzer.new()
|
501
|
+
t = a.token_stream("fieldname", input)
|
502
|
+
assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
|
503
|
+
assert_equal(Token.new('e-mail', 25, 31), t.next)
|
504
|
+
assert_equal(Token.new('address', 40, 47), t.next)
|
505
|
+
assert_equal(Token.new('23', 49, 51), t.next)
|
506
|
+
assert_equal(Token.new('www.google.com/result', 55, 84), t.next)
|
507
|
+
assert_equal(Token.new('tnt', 86, 91), t.next)
|
508
|
+
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
|
509
|
+
assert_equal(Token.new('23', 111, 113), t.next)
|
510
|
+
assert_equal(Token.new('áägç', 117, 124), t.next)
|
511
|
+
assert_equal(Token.new('êëì', 126, 132), t.next)
|
512
|
+
assert_equal(Token.new('úøã', 134, 140), t.next)
|
513
|
+
assert_equal(Token.new('öîí', 142, 148), t.next)
|
514
|
+
assert(! t.next())
|
515
|
+
input = "Debate Debates DEBATED DEBating Debater";
|
516
|
+
t = a.token_stream("fieldname", input)
|
517
|
+
assert_equal(Token.new("debat", 0, 6), t.next)
|
518
|
+
assert_equal(Token.new("debat", 7, 14), t.next)
|
519
|
+
assert_equal(Token.new("debat", 15, 22), t.next)
|
520
|
+
assert_equal(Token.new("debat", 23, 31), t.next)
|
521
|
+
assert_equal(Token.new("debat", 32, 39), t.next)
|
522
|
+
assert(! t.next())
|
523
|
+
input = "Dêbate dêbates DÊBATED DÊBATing dêbater";
|
524
|
+
t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
|
525
|
+
assert_equal(Token.new("dêbate", 0, 7), t.next)
|
526
|
+
assert_equal(Token.new("dêbate", 8, 16), t.next)
|
527
|
+
assert_equal(Token.new("dêbate", 17, 25), t.next)
|
528
|
+
assert_equal(Token.new("dêbate", 26, 35), t.next)
|
529
|
+
assert_equal(Token.new("dêbater", 36, 44), t.next)
|
530
|
+
assert(! t.next())
|
531
|
+
end
|
532
|
+
end
|
@@ -205,7 +205,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
|
|
205
205
|
include Ferret::Analysis
|
206
206
|
|
207
207
|
def test_standard_tokenizer()
|
208
|
-
input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/
|
208
|
+
input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
|
209
209
|
t = StandardTokenizer.new(input)
|
210
210
|
assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
|
211
211
|
assert_equal(Token.new('is', 19, 21), t.next)
|
@@ -214,7 +214,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
|
|
214
214
|
assert_equal(Token.new('52', 32, 34), t.next)
|
215
215
|
assert_equal(Token.new('Address', 40, 47), t.next)
|
216
216
|
assert_equal(Token.new('23', 49, 51), t.next)
|
217
|
-
assert_equal(Token.new('www.google.com/
|
217
|
+
assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
|
218
218
|
assert_equal(Token.new('TNT', 86, 91), t.next)
|
219
219
|
assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
|
220
220
|
assert_equal(Token.new('23', 111, 113), t.next)
|
@@ -235,7 +235,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
|
|
235
235
|
assert_equal(Token.new('52', 32, 34), t.next)
|
236
236
|
assert_equal(Token.new('address', 40, 47), t.next)
|
237
237
|
assert_equal(Token.new('23', 49, 51), t.next)
|
238
|
-
assert_equal(Token.new('www.google.com/
|
238
|
+
assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
|
239
239
|
assert_equal(Token.new('tnt', 86, 91), t.next)
|
240
240
|
assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
|
241
241
|
assert_equal(Token.new('23', 111, 113), t.next)
|
@@ -247,6 +247,97 @@ class StandardTokenizerTest < Test::Unit::TestCase
|
|
247
247
|
end
|
248
248
|
end
|
249
249
|
|
250
|
+
class RegExpTokenizerTest < Test::Unit::TestCase
|
251
|
+
include Ferret::Analysis
|
252
|
+
|
253
|
+
ALPHA = /[[:alpha:]_-]+/
|
254
|
+
APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
|
255
|
+
ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
|
256
|
+
ACRONYM_WORD = /^#{ACRONYM}$/
|
257
|
+
APOSTROPHE_WORD = /^#{APOSTROPHE}$/
|
258
|
+
|
259
|
+
def test_reg_exp_tokenizer()
|
260
|
+
input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
|
261
|
+
t = RegExpTokenizer.new(input)
|
262
|
+
assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
|
263
|
+
assert_equal(Token.new('is', 19, 21), t.next)
|
264
|
+
assert_equal(Token.new('My', 22, 24), t.next)
|
265
|
+
assert_equal(Token.new('e-mail', 25, 31), t.next)
|
266
|
+
assert_equal(Token.new('52', 32, 34), t.next)
|
267
|
+
assert_equal(Token.new('Address', 40, 47), t.next)
|
268
|
+
assert_equal(Token.new('23', 49, 51), t.next)
|
269
|
+
assert_equal(Token.new('http://www.google.com/RESULT_3.html', 55, 90), t.next)
|
270
|
+
assert_equal(Token.new('T.N.T.', 91, 97), t.next)
|
271
|
+
assert_equal(Token.new('123-1235-ASD-1234', 98, 115), t.next)
|
272
|
+
assert_equal(Token.new('23', 116, 118), t.next)
|
273
|
+
assert_equal(Token.new('Rob\'s', 119, 124), t.next)
|
274
|
+
assert(! t.next())
|
275
|
+
t.text = "one_two three"
|
276
|
+
assert_equal(Token.new("one_two", 0, 7), t.next())
|
277
|
+
assert_equal(Token.new("three", 8, 13), t.next())
|
278
|
+
assert(! t.next())
|
279
|
+
t = LowerCaseFilter.new(RegExpTokenizer.new(input))
|
280
|
+
t2 = LowerCaseFilter.new(RegExpTokenizer.new(input, /\w{2,}/))
|
281
|
+
assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
|
282
|
+
assert_equal(Token.new('is', 19, 21), t.next)
|
283
|
+
assert_equal(Token.new('my', 22, 24), t.next)
|
284
|
+
assert_equal(Token.new('e-mail', 25, 31), t.next)
|
285
|
+
assert_equal(Token.new('52', 32, 34), t.next)
|
286
|
+
assert_equal(Token.new('address', 40, 47), t.next)
|
287
|
+
assert_equal(Token.new('23', 49, 51), t.next)
|
288
|
+
assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
|
289
|
+
assert_equal(Token.new('t.n.t.', 91, 97), t.next)
|
290
|
+
assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
|
291
|
+
assert_equal(Token.new('23', 116, 118), t.next)
|
292
|
+
assert_equal(Token.new('rob\'s', 119, 124), t.next)
|
293
|
+
assert(! t.next())
|
294
|
+
assert_equal(Token.new('dbalmain', 0, 8), t2.next)
|
295
|
+
assert_equal(Token.new('gmail', 9, 14), t2.next)
|
296
|
+
assert_equal(Token.new('com', 15, 18), t2.next)
|
297
|
+
assert_equal(Token.new('is', 19, 21), t2.next)
|
298
|
+
assert_equal(Token.new('my', 22, 24), t2.next)
|
299
|
+
assert_equal(Token.new('mail', 27, 31), t2.next)
|
300
|
+
assert_equal(Token.new('52', 32, 34), t2.next)
|
301
|
+
assert_equal(Token.new('address', 40, 47), t2.next)
|
302
|
+
assert_equal(Token.new('23', 49, 51), t2.next)
|
303
|
+
assert_equal(Token.new('http', 55, 59), t2.next)
|
304
|
+
assert_equal(Token.new('www', 62, 65), t2.next)
|
305
|
+
assert_equal(Token.new('google', 66, 72), t2.next)
|
306
|
+
assert_equal(Token.new('com', 73, 76), t2.next)
|
307
|
+
assert_equal(Token.new('result_3', 77, 85), t2.next)
|
308
|
+
assert_equal(Token.new('html', 86, 90), t2.next)
|
309
|
+
assert_equal(Token.new('123', 98, 101), t2.next)
|
310
|
+
assert_equal(Token.new('1235', 102, 106), t2.next)
|
311
|
+
assert_equal(Token.new('asd', 107, 110), t2.next)
|
312
|
+
assert_equal(Token.new('1234', 111, 115), t2.next)
|
313
|
+
assert_equal(Token.new('23', 116, 118), t2.next)
|
314
|
+
assert_equal(Token.new('rob', 119, 122), t2.next)
|
315
|
+
assert(! t2.next())
|
316
|
+
t = RegExpTokenizer.new(input) do |str|
|
317
|
+
if str =~ ACRONYM_WORD
|
318
|
+
str.gsub!(/\./, '')
|
319
|
+
elsif str =~ APOSTROPHE_WORD
|
320
|
+
str.gsub!(/'[sS]$/, '')
|
321
|
+
end
|
322
|
+
str
|
323
|
+
end
|
324
|
+
t = LowerCaseFilter.new(t)
|
325
|
+
assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
|
326
|
+
assert_equal(Token.new('is', 19, 21), t.next)
|
327
|
+
assert_equal(Token.new('my', 22, 24), t.next)
|
328
|
+
assert_equal(Token.new('e-mail', 25, 31), t.next)
|
329
|
+
assert_equal(Token.new('52', 32, 34), t.next)
|
330
|
+
assert_equal(Token.new('address', 40, 47), t.next)
|
331
|
+
assert_equal(Token.new('23', 49, 51), t.next)
|
332
|
+
assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
|
333
|
+
assert_equal(Token.new('tnt', 91, 97), t.next)
|
334
|
+
assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
|
335
|
+
assert_equal(Token.new('23', 116, 118), t.next)
|
336
|
+
assert_equal(Token.new('rob', 119, 124), t.next)
|
337
|
+
assert(! t.next())
|
338
|
+
end
|
339
|
+
end
|
340
|
+
|
250
341
|
class StopFilterTest < Test::Unit::TestCase
|
251
342
|
include Ferret::Analysis
|
252
343
|
|
@@ -383,11 +474,9 @@ module Ferret::Analysis
|
|
383
474
|
def next()
|
384
475
|
t = @input.next()
|
385
476
|
|
386
|
-
if (t
|
387
|
-
return nil
|
388
|
-
end
|
477
|
+
return nil if (t.nil?)
|
389
478
|
|
390
|
-
t.text = t.text
|
479
|
+
t.text = t.text.capitalize
|
391
480
|
|
392
481
|
return t
|
393
482
|
end
|
@@ -402,7 +491,7 @@ class CustomFilterTest < Test::Unit::TestCase
|
|
402
491
|
t = CapitalizeFilter.new(AsciiLetterTokenizer.new(input))
|
403
492
|
assert_equal(Token.new("This", 0, 4), t.next)
|
404
493
|
assert_equal(Token.new("Text", 5, 9), t.next)
|
405
|
-
assert_equal(Token.new("
|
494
|
+
assert_equal(Token.new("Should", 10, 16), t.next)
|
406
495
|
assert_equal(Token.new("Be", 17, 19), t.next)
|
407
496
|
assert_equal(Token.new("Capitalized", 20, 31), t.next)
|
408
497
|
assert_equal(Token.new("I", 36, 37), t.next)
|
@@ -412,7 +501,7 @@ class CustomFilterTest < Test::Unit::TestCase
|
|
412
501
|
t = StemFilter.new(CapitalizeFilter.new(AsciiLetterTokenizer.new(input)))
|
413
502
|
assert_equal(Token.new("This", 0, 4), t.next)
|
414
503
|
assert_equal(Token.new("Text", 5, 9), t.next)
|
415
|
-
assert_equal(Token.new("
|
504
|
+
assert_equal(Token.new("Should", 10, 16), t.next)
|
416
505
|
assert_equal(Token.new("Be", 17, 19), t.next)
|
417
506
|
assert_equal(Token.new("Capit", 20, 31), t.next)
|
418
507
|
assert_equal(Token.new("I", 36, 37), t.next)
|