ferret 0.9.1 → 0.9.2
This diff shows the changes between two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/lib/ferret/search/query.rb
CHANGED
@@ -62,15 +62,44 @@ module Ferret::Search
 
     # Expert: called when re-writing queries under MultiSearcher.
     #
-    #
-    #
+    # Create a single query suitable for use by all subsearchers (in 1-1
+    # correspondence with queries). This is an optimization of the OR of
+    # all queries. We handle the common optimization cases of equal
+    # queries and overlapping clauses of boolean OR queries (as generated
+    # by MultiTermQuery.rewrite() and RangeQuery.rewrite()).
+    # Be careful overriding this method as queries[0] determines which
+    # method will be called and is not necessarily of the same type as
+    # the other queries.
    def combine(queries)
+      uniques = Set.new
      queries.each do |query|
-
-
+        clauses = []
+        # check if we can split the query into clauses
+        splittable = query.respond_to? :clauses
+        if splittable
+          splittable = query.coord_disabled?
+          clauses = query.clauses
+          clauses.each do |clause|
+            splittable = clause.occur == BooleanClause::Occur::SHOULD
+            break unless splittable
+          end
+        end
+        if splittable
+          clauses.each { |clause| uniques << clause.query }
+        else
+          uniques << query
+        end
      end
-      return
+      # optimization: if we have just one query, just return it
+      if uniques.size == 1
+        uniques.each { |query| return query }
+      end
+
+      result = BooleanQuery.new(true)
+      uniques.each do |query|
+        result.add_query(query, BooleanClause::Occur::SHOULD)
+      end
+      return result
    end
 
    # Expert: adds all terms occuring in this query to the terms set
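
The rewritten combine is the piece MultiSearcher relies on. A minimal usage sketch, illustrative only: it assumes Ferret::Search::TermQuery and Ferret::Index::Term from this release, and that equal term queries compare by value so the Set-based dedup above can fold them.

    require 'ferret'
    require 'set'
    include Ferret::Index
    include Ferret::Search

    # Hypothetical rewrites handed back by three subsearchers for one
    # user query; q1 and q2 are equal, q3 is not.
    q1 = TermQuery.new(Term.new("content", "ferret"))
    q2 = TermQuery.new(Term.new("content", "ferret"))
    q3 = TermQuery.new(Term.new("content", "search"))

    combined = q1.combine([q1, q2, q3])
    # combined is a BooleanQuery (constructed with coord disabled, per the
    # rewrite above) OR-ing the two unique queries: ferret OR search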
data/lib/ferret/search/sort.rb
CHANGED
@@ -84,8 +84,12 @@ module Ferret::Search
       fields = fields.map {|field| field.is_a?(Symbol) ? field.to_s : field}
       if fields[0].is_a?(String)
         @fields = fields.map do |field|
-
-
+          if (field.is_a?(String))
+            next SortField.new(field, {:sort_type => SortField::SortType::AUTO,
+                                       :reverse => reverse})
+          else
+            next field
+          end
         end
       end
       doc_sort_added = false
@@ -102,7 +106,7 @@ module Ferret::Search
     INDEX_ORDER = Sort.new(SortField::FIELD_DOC)
 
     def to_s()
-      return @fields.map {|field| "#{field}"}.join(", ")
+      return "Sort[" + @fields.map {|field| "#{field}"}.join(", ") + "]"
     end
   end
 end
data/lib/ferret/search/sort_field.rb
CHANGED
@@ -20,11 +20,11 @@ module Ferret::Search
 
     # Sort by document score (relevancy). Sort values are Float and higher
    # values are at the front.
-    SCORE = SortType.new("
+    SCORE = SortType.new("SCORE")
 
    # Sort by document number (order). Sort values are Integer and lower
    # values are at the front.
-    DOC = SortType.new("
+    DOC = SortType.new("DOC")
 
    # Guess sort type of sort based on field contents. We try parsing the
    # field as an integer and then as a floating point number. If we are
@@ -37,7 +37,7 @@ module Ferret::Search
 
    # Sort using term values as encoded Integers. Sort values are Integer
    # and lower values are at the front.
-    INTEGER = SortType.new("
+    INTEGER = SortType.new("integer", lambda{|str| str.to_i})
 
    # Sort using term values as encoded Floats. Sort values are Float and
    # lower values are at the front.
@@ -79,7 +79,11 @@ module Ferret::Search
    FIELD_DOC = SortField.new(nil, {:sort_type => SortType::DOC})
 
    def to_s()
-
+      if @name
+        buffer = "#@name:<#@sort_type>"
+      else
+        buffer = "<#{@sort_type}>"
+      end
      buffer << '!' if @reverse
      return buffer
    end
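
Taken together, the sort.rb and sort_field.rb hunks change how sorts print. A small sketch of the intent; the constructor call mirrors the SortField usage visible in the hunks, but the exact SortType labels in the output are assumptions:

    require 'ferret'
    include Ferret::Search

    title = SortField.new("title", {:sort_type => SortField::SortType::AUTO,
                                    :reverse => true})
    puts title.to_s   # => roughly "title:<auto>!" ('!' marks :reverse)

    sort = Sort.new([title, SortField::FIELD_DOC])
    puts sort.to_s    # => roughly "Sort[title:<auto>!, <DOC>]" with the new wrapper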
data/lib/ferret/store/fs_store.rb
CHANGED
@@ -102,13 +102,13 @@ module Ferret::Store
       # delete all the files
       refresh_dir
       each do |fname|
-
+        FileUtils.rm_rf(dir_path(fname))
       end
       # clear all the locks
       refresh_lock_dir
       @lock_dir.each do |lock_fname|
         next if lock_fname == '.' or lock_fname == '..'
-
+        FileUtils.rm_rf(@lock_dir.path + '/' + lock_fname)
       end
     end
   end
@@ -159,7 +159,13 @@ module Ferret::Store
     # This replacement should be atomic.
     def rename(from, to)
       synchronize do
-
+        begin
+          File.rename(dir_path(from), dir_path(to))
+        rescue
+          # try again, this time forcing the delete
+          FileUtils.rm_rf(dir_path(to))
+          FileUtils.cp(dir_path(from), dir_path(to))
+        end
       end
     end
 
@@ -208,11 +214,11 @@ module Ferret::Store
     def initialize(lock_file)
       @lock_file = lock_file
       #@clean = FSLock.make_finalizer(lock_file)
-      @clean = lambda {
+      @clean = lambda { FileUtils.rm_rf(lock_file)}
     end
 
     def FSLock.make_finalizer(lock_file)
-      lambda {
+      lambda { FileUtils.rm_rf(lock_file)}
     end
 
     # obtain the lock on the data source
@@ -238,7 +244,7 @@ module Ferret::Store
     def release
       return if FSDirectory.locks_disabled?
       begin
-
+        FileUtils.rm_rf(@lock_file)
         ObjectSpace.undefine_finalizer(self)
       rescue SystemCallError
         # maybe we tried to release a lock that wasn't locked. This
@@ -364,6 +370,7 @@ module Ferret::Store
     # This method is only used by the c extension to free the directory
     def close_internal
     end
+
     #end private
   end
 end
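
The rename fallback is worth seeing in isolation: File.rename is atomic on POSIX but fails on some platforms when the target already exists, so the code deletes the target and copies instead. The same pattern as a standalone plain-Ruby sketch, independent of Ferret (the helper name is hypothetical):

    require 'fileutils'

    # Replace `to` with `from`, preferring an atomic rename but falling
    # back to delete-and-copy when the OS refuses the rename.
    def replace_file(from, to)
      File.rename(from, to)
    rescue SystemCallError
      FileUtils.rm_rf(to)     # force the stale destination out of the way
      FileUtils.cp(from, to)  # non-atomic fallback, as in the diff above
    end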
data/lib/ferret/store/index_io.rb
CHANGED
@@ -213,20 +213,6 @@ module Ferret::Store
       last = start + length
       (start ... last).each do |i|
         write_byte(buf[i])
-        # code = buf[i]
-        # if code >= 0x01 and code <= 0x7F
-        #   write_byte(code)
-        # else
-        #   # We need to write unicode characters. ToDo: test that this works.
-        #   if code > 0x80 and code <= 0x7FF or code == 0
-        #     write_byte(0xC0 | code >> 6)
-        #     write_byte(0x80 | code & 0x3F)
-        #   else
-        #     write_byte(0xE0 | (code >> 12))
-        #     write_byte(0x80 | ((code >> 6) & 0x3F))
-        #     write_byte(0x80 | (code & 0x3F))
-        #   end
-        # end
       end
     end
 
data/lib/ferret/store/ram_store.rb
CHANGED
@@ -159,9 +159,10 @@ module Ferret::Store
       flush()
       last_buffer_number = (@file.length / BUFFER_SIZE).to_i
       last_buffer_offset = @file.length % BUFFER_SIZE
-
+
+      (0..last_buffer_number).each do |i|
         len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE)
-        output.write_bytes(
+        output.write_bytes(@file.buffers[i], len)
       end
     end
 
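
The flush loop above walks every full buffer of a RAM file and then the final, partially filled one. A standalone sketch of the arithmetic; the BUFFER_SIZE value and file length here are assumptions, since neither appears in this diff:

    BUFFER_SIZE = 1024   # assumed value; not shown in this diff
    length = 2500        # hypothetical RAM file length in bytes

    last_buffer_number = (length / BUFFER_SIZE).to_i   # => 2
    last_buffer_offset = length % BUFFER_SIZE          # => 452

    (0..last_buffer_number).each do |i|
      len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE)
      # write_bytes(buffers[i], len) here writes 1024 + 1024 + 452 bytes,
      # i.e. exactly `length` bytes in total
    end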
data/test/unit/analysis/ctc_analyzer.rb
CHANGED
@@ -399,3 +399,134 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
     assert(! t.next())
   end
 end
+
+class RegExpAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_reg_exp_analyzer()
+    input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    a = RegExpAnalyzer.new()
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one_Two three")
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    t = t2
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    a = RegExpAnalyzer.new(/\w{2,}/, false)
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one Two three")
+    assert_equal(Token.new('DBalmain', 0, 8), t.next)
+    assert_equal(Token.new('gmail', 9, 14), t.next)
+    assert_equal(Token.new('com', 15, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('mail', 27, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http', 55, 59), t.next)
+    assert_equal(Token.new('www', 62, 65), t.next)
+    assert_equal(Token.new('google', 66, 72), t.next)
+    assert_equal(Token.new('com', 73, 76), t.next)
+    assert_equal(Token.new('RESULT_3', 77, 85), t.next)
+    assert_equal(Token.new('html', 86, 90), t.next)
+    assert_equal(Token.new('123', 98, 101), t.next)
+    assert_equal(Token.new('1235', 102, 106), t.next)
+    assert_equal(Token.new('ASD', 107, 110), t.next)
+    assert_equal(Token.new('1234', 111, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('Rob', 119, 122), t.next)
+    assert(! t.next())
+    assert_equal(Token.new("one", 0, 3), t2.next())
+    assert_equal(Token.new("Two", 4, 7), t2.next())
+    assert_equal(Token.new("three", 8, 13), t2.next())
+    assert(! t2.next())
+    a = RegExpAnalyzer.new() do |str|
+      if str =~ /^[[:alpha:]]\.([[:alpha:]]\.)+$/
+        str.gsub!(/\./, '')
+      elsif str =~ /'[sS]$/
+        str.gsub!(/'[sS]$/, '')
+      end
+      str
+    end
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one's don't T.N.T.")
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('tnt', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob', 119, 124), t.next)
+    assert(! t.next())
+    assert_equal(Token.new("one", 0, 5), t2.next())
+    assert_equal(Token.new("don't", 6, 11), t2.next())
+    assert_equal(Token.new("tnt", 12, 18), t2.next())
+    assert(! t2.next())
+  end
+end
+
+module Ferret::Analysis
+  class StemmingStandardAnalyzer < StandardAnalyzer
+    def token_stream(field, text)
+      StemFilter.new(super)
+    end
+  end
+end
+
+class CustomAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_custom_filter()
+    input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    a = StemmingStandardAnalyzer.new()
+    t = a.token_stream("fieldname", input)
+    assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('www.google.com/result', 55, 84), t.next)
+    assert_equal(Token.new('tnt', 86, 91), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+    assert_equal(Token.new('23', 111, 113), t.next)
+    assert_equal(Token.new('áägç', 117, 124), t.next)
+    assert_equal(Token.new('êëì', 126, 132), t.next)
+    assert_equal(Token.new('úøã', 134, 140), t.next)
+    assert_equal(Token.new('öîí', 142, 148), t.next)
+    assert(! t.next())
+    input = "Debate Debates DEBATED DEBating Debater";
+    t = a.token_stream("fieldname", input)
+    assert_equal(Token.new("debat", 0, 6), t.next)
+    assert_equal(Token.new("debat", 7, 14), t.next)
+    assert_equal(Token.new("debat", 15, 22), t.next)
+    assert_equal(Token.new("debat", 23, 31), t.next)
+    assert_equal(Token.new("debat", 32, 39), t.next)
+    assert(! t.next())
+    input = "Dêbate dêbates DÊBATED DÊBATing dêbater";
+    t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
+    assert_equal(Token.new("dêbate", 0, 7), t.next)
+    assert_equal(Token.new("dêbate", 8, 16), t.next)
+    assert_equal(Token.new("dêbate", 17, 25), t.next)
+    assert_equal(Token.new("dêbate", 26, 35), t.next)
+    assert_equal(Token.new("dêbater", 36, 44), t.next)
+    assert(! t.next())
+  end
+end
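
These new tests double as API documentation: an analyzer is customized by subclassing and wrapping its token stream in a filter. Driving the StemmingStandardAnalyzer defined above by hand, a sketch that assumes the class definition from the test file is loaded and uses only the token_stream and Token#text API the tests exercise:

    require 'ferret'

    a = Ferret::Analysis::StemmingStandardAnalyzer.new
    t = a.token_stream("content", "Debate Debates DEBATED DEBating Debater")
    while (token = t.next)
      puts token.text   # prints "debat" five times, per the assertions above
    end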
data/test/unit/analysis/ctc_tokenstream.rb
CHANGED
@@ -205,7 +205,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
   include Ferret::Analysis
 
   def test_standard_tokenizer()
-    input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/
+    input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     t = StandardTokenizer.new(input)
     assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
     assert_equal(Token.new('is', 19, 21), t.next)
@@ -214,7 +214,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('Address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/
+    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
     assert_equal(Token.new('TNT', 86, 91), t.next)
     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -235,7 +235,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/
+    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
     assert_equal(Token.new('tnt', 86, 91), t.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -247,6 +247,97 @@ class StandardTokenizerTest < Test::Unit::TestCase
   end
 end
 
+class RegExpTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  ALPHA = /[[:alpha:]_-]+/
+  APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
+  ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
+  ACRONYM_WORD = /^#{ACRONYM}$/
+  APOSTROPHE_WORD = /^#{APOSTROPHE}$/
+
+  def test_reg_exp_tokenizer()
+    input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    t = RegExpTokenizer.new(input)
+    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/RESULT_3.html', 55, 90), t.next)
+    assert_equal(Token.new('T.N.T.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-ASD-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('Rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = LowerCaseFilter.new(RegExpTokenizer.new(input))
+    t2 = LowerCaseFilter.new(RegExpTokenizer.new(input, /\w{2,}/))
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    assert_equal(Token.new('dbalmain', 0, 8), t2.next)
+    assert_equal(Token.new('gmail', 9, 14), t2.next)
+    assert_equal(Token.new('com', 15, 18), t2.next)
+    assert_equal(Token.new('is', 19, 21), t2.next)
+    assert_equal(Token.new('my', 22, 24), t2.next)
+    assert_equal(Token.new('mail', 27, 31), t2.next)
+    assert_equal(Token.new('52', 32, 34), t2.next)
+    assert_equal(Token.new('address', 40, 47), t2.next)
+    assert_equal(Token.new('23', 49, 51), t2.next)
+    assert_equal(Token.new('http', 55, 59), t2.next)
+    assert_equal(Token.new('www', 62, 65), t2.next)
+    assert_equal(Token.new('google', 66, 72), t2.next)
+    assert_equal(Token.new('com', 73, 76), t2.next)
+    assert_equal(Token.new('result_3', 77, 85), t2.next)
+    assert_equal(Token.new('html', 86, 90), t2.next)
+    assert_equal(Token.new('123', 98, 101), t2.next)
+    assert_equal(Token.new('1235', 102, 106), t2.next)
+    assert_equal(Token.new('asd', 107, 110), t2.next)
+    assert_equal(Token.new('1234', 111, 115), t2.next)
+    assert_equal(Token.new('23', 116, 118), t2.next)
+    assert_equal(Token.new('rob', 119, 122), t2.next)
+    assert(! t2.next())
+    t = RegExpTokenizer.new(input) do |str|
+      if str =~ ACRONYM_WORD
+        str.gsub!(/\./, '')
+      elsif str =~ APOSTROPHE_WORD
+        str.gsub!(/'[sS]$/, '')
+      end
+      str
+    end
+    t = LowerCaseFilter.new(t)
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('tnt', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob', 119, 124), t.next)
+    assert(! t.next())
+  end
+end
+
 class StopFilterTest < Test::Unit::TestCase
   include Ferret::Analysis
 
@@ -383,11 +474,9 @@ module Ferret::Analysis
     def next()
       t = @input.next()
 
-      if (t
-        return nil
-      end
+      return nil if (t.nil?)
 
-      t.text = t.text
+      t.text = t.text.capitalize
 
       return t
     end
@@ -402,7 +491,7 @@ class CustomFilterTest < Test::Unit::TestCase
     t = CapitalizeFilter.new(AsciiLetterTokenizer.new(input))
     assert_equal(Token.new("This", 0, 4), t.next)
     assert_equal(Token.new("Text", 5, 9), t.next)
-    assert_equal(Token.new("
+    assert_equal(Token.new("Should", 10, 16), t.next)
     assert_equal(Token.new("Be", 17, 19), t.next)
    assert_equal(Token.new("Capitalized", 20, 31), t.next)
    assert_equal(Token.new("I", 36, 37), t.next)
@@ -412,7 +501,7 @@ class CustomFilterTest < Test::Unit::TestCase
    t = StemFilter.new(CapitalizeFilter.new(AsciiLetterTokenizer.new(input)))
    assert_equal(Token.new("This", 0, 4), t.next)
    assert_equal(Token.new("Text", 5, 9), t.next)
-    assert_equal(Token.new("
+    assert_equal(Token.new("Should", 10, 16), t.next)
    assert_equal(Token.new("Be", 17, 19), t.next)
    assert_equal(Token.new("Capit", 20, 31), t.next)
    assert_equal(Token.new("I", 36, 37), t.next)
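
For reference, the custom filter these last two hunks exercise is small enough to reproduce whole. This sketch wraps the next() method from the @@ -383 hunk in its surrounding class, assuming Ferret::Analysis::TokenFilter is the base class that stores the wrapped stream in @input, as the hunk's use of @input suggests:

    module Ferret::Analysis
      class CapitalizeFilter < TokenFilter
        def next()
          t = @input.next()
          return nil if (t.nil?)
          t.text = t.text.capitalize   # e.g. "text" -> "Text"
          return t
        end
      end
    end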