ferret 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/lib/ferret/search/phrase_query.rb CHANGED
@@ -173,7 +173,7 @@ module Ferret::Search

     # See Query#extract_terms()
     def extract_terms(query_terms)
-      query_terms.add_all(@terms)
+      query_terms.merge(@terms)
     end

     # Prints a user-readable version of this query.
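A side note on the one-liner above: Ruby's Set has no add_all method (that name is Java Lucene's idiom); Set#merge is the Ruby equivalent, adding every element of an enumerable and skipping duplicates. A minimal illustration with placeholder elements:

    require 'set'
    query_terms = Set.new([:quick, :brown])
    query_terms.merge([:brown, :fox])  # duplicates are dropped
    query_terms.size                   # => 3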
data/lib/ferret/search/query.rb CHANGED
@@ -62,15 +62,44 @@ module Ferret::Search

     # Expert: called when re-writing queries under MultiSearcher.
     #
-    # Only implemented by derived queries, with no #create_weight()
-    # implementatation.
+    # Create a single query suitable for use by all subsearchers (in 1-1
+    # correspondence with queries). This is an optimization of the OR of
+    # all queries. We handle the common optimization cases of equal
+    # queries and overlapping clauses of boolean OR queries (as generated
+    # by MultiTermQuery.rewrite() and RangeQuery.rewrite()).
+    # Be careful overriding this method as queries[0] determines which
+    # method will be called and is not necessarily of the same type as
+    # the other queries.
     def combine(queries)
+      uniques = Set.new
       queries.each do |query|
-        if self != query
-          raise ArgumentError
+        clauses = []
+        # check if we can split the query into clauses
+        splittable = query.respond_to? :clauses
+        if splittable
+          splittable = query.coord_disabled?
+          clauses = query.clauses
+          clauses.each do |clause|
+            splittable = clause.occur == BooleanClause::Occur::SHOULD
+            break unless splittable
+          end
+        end
+        if splittable
+          clauses.each { |clause| uniques << clause.query }
+        else
+          uniques << query
         end
       end
-      return self
+      # optimization: if we have just one query, just return it
+      if uniques.size == 1
+        uniques.each { |query| return query }
+      end
+
+      result = BooleanQuery.new(true)
+      uniques.each do |query|
+        result.add_query(query, BooleanClause::Occur::SHOULD)
+      end
+      return result
     end

     # Expert: adds all terms occuring in this query to the terms set
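A hedged sketch of the new combine() in action, built only from calls this hunk itself shows (BooleanQuery.new(coord_disabled), #add_query, BooleanClause::Occur::SHOULD). The Term/TermQuery constructors follow Ferret 0.9's Ruby API, the field and term values are invented, and the Set-based de-duplication assumes the query classes implement the hash/eql? it relies on:

    include Ferret::Search
    include Ferret::Index

    # Two subsearchers rewrote the same prefix query into coord-disabled
    # boolean ORs that share one clause.
    q1 = BooleanQuery.new(true)
    q1.add_query(TermQuery.new(Term.new("name", "ferret")),
                 BooleanClause::Occur::SHOULD)
    q1.add_query(TermQuery.new(Term.new("name", "ferrets")),
                 BooleanClause::Occur::SHOULD)
    q2 = BooleanQuery.new(true)
    q2.add_query(TermQuery.new(Term.new("name", "ferret")),
                 BooleanClause::Occur::SHOULD)

    # Both queries are splittable (coord disabled, all clauses SHOULD), so
    # combine() pools their clause queries, deduplicates them, and returns
    # a single OR query over the two unique term queries.
    merged = q1.combine([q1, q2])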
data/lib/ferret/search/sort.rb CHANGED
@@ -84,8 +84,12 @@ module Ferret::Search
       fields = fields.map {|field| field.is_a?(Symbol) ? field.to_s : field}
       if fields[0].is_a?(String)
         @fields = fields.map do |field|
-          SortField.new(field, {:sort_type => SortField::SortType::AUTO,
-                                :reverse => reverse})
+          if (field.is_a?(String))
+            next SortField.new(field, {:sort_type => SortField::SortType::AUTO,
+                                       :reverse => reverse})
+          else
+            next field
+          end
         end
       end
       doc_sort_added = false
@@ -102,7 +106,7 @@ module Ferret::Search
     INDEX_ORDER = Sort.new(SortField::FIELD_DOC)

     def to_s()
-      return @fields.map {|field| "#{field}"}.join(", ")
+      return "Sort[" + @fields.map {|field| "#{field}"}.join(", ") + "]"
     end
   end
 end
data/lib/ferret/search/sort_field.rb CHANGED
@@ -20,11 +20,11 @@ module Ferret::Search

     # Sort by document score (relevancy). Sort values are Float and higher
     # values are at the front.
-    SCORE = SortType.new("score")
+    SCORE = SortType.new("SCORE")

     # Sort by document number (order). Sort values are Integer and lower
     # values are at the front.
-    DOC = SortType.new("doc")
+    DOC = SortType.new("DOC")

     # Guess sort type of sort based on field contents. We try parsing the
     # field as an integer and then as a floating point number. If we are
@@ -37,7 +37,7 @@ module Ferret::Search

     # Sort using term values as encoded Integers. Sort values are Integer
     # and lower values are at the front.
-    INTEGER = SortType.new("int", lambda{|str| str.to_i})
+    INTEGER = SortType.new("integer", lambda{|str| str.to_i})

     # Sort using term values as encoded Floats. Sort values are Float and
     # lower values are at the front.
@@ -79,7 +79,11 @@ module Ferret::Search
     FIELD_DOC = SortField.new(nil, {:sort_type => SortType::DOC})

     def to_s()
-      buffer = '"' + (@name||"<#{@sort_type}>") + '"'
+      if @name
+        buffer = "#@name:<#@sort_type>"
+      else
+        buffer = "<#{@sort_type}>"
+      end
       buffer << '!' if @reverse
       return buffer
     end
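Assuming SortType#to_s returns the name passed to SortType.new (which the renames above suggest, since those names feed straight into the new interpolation), the reworked SortField#to_s would print, for a hypothetical "price" field:

    SortField.new("price", {:sort_type => SortField::SortType::INTEGER,
                            :reverse => true}).to_s  # => "price:<integer>!"
    SortField::FIELD_DOC.to_s                        # => "<DOC>"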
data/lib/ferret/store/fs_store.rb CHANGED
@@ -102,13 +102,13 @@ module Ferret::Store
         # delete all the files
         refresh_dir
         each do |fname|
-          File.delete(dir_path(fname))
+          FileUtils.rm_rf(dir_path(fname))
         end
         # clear all the locks
         refresh_lock_dir
         @lock_dir.each do |lock_fname|
           next if lock_fname == '.' or lock_fname == '..'
-          File.delete(@lock_dir.path + '/' + lock_fname)
+          FileUtils.rm_rf(@lock_dir.path + '/' + lock_fname)
         end
       end
     end
@@ -159,7 +159,13 @@ module Ferret::Store
       # This replacement should be atomic.
       def rename(from, to)
         synchronize do
-          File.rename(dir_path(from), dir_path(to))
+          begin
+            File.rename(dir_path(from), dir_path(to))
+          rescue
+            # try again, this time forcing the delete
+            FileUtils.rm_rf(dir_path(to))
+            FileUtils.cp(dir_path(from), dir_path(to))
+          end
         end
       end

@@ -208,11 +214,11 @@ module Ferret::Store
       def initialize(lock_file)
         @lock_file = lock_file
         #@clean = FSLock.make_finalizer(lock_file)
-        @clean = lambda { File.delete(lock_file) rescue nil}
+        @clean = lambda { FileUtils.rm_rf(lock_file)}
       end

       def FSLock.make_finalizer(lock_file)
-        lambda { File.delete(lock_file) rescue nil}
+        lambda { FileUtils.rm_rf(lock_file)}
       end

       # obtain the lock on the data source
@@ -238,7 +244,7 @@ module Ferret::Store
       def release
         return if FSDirectory.locks_disabled?
         begin
-          File.delete(@lock_file)
+          FileUtils.rm_rf(@lock_file)
           ObjectSpace.undefine_finalizer(self)
         rescue SystemCallError
           # maybe we tried to release a lock that wasn't locked. This
@@ -364,6 +370,7 @@ module Ferret::Store
       # This method is only used by the c extension to free the directory
       def close_internal
       end
+
       #end private
   end
 end
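Two patterns recur in the fs_store changes above: File.delete is swapped for FileUtils.rm_rf, which does not raise when the target is already gone (making the old `rescue nil` guards unnecessary), and rename gains a fallback for platforms where File.rename fails on an existing or still-open target, as on Windows, which this release's new w32_io.c also targets. A standalone sketch of the fallback pattern (the method name is hypothetical, and the bare rescue from the diff is narrowed to SystemCallError here):

    require 'fileutils'

    def replace_file(from, to)
      File.rename(from, to)    # atomic replacement when the OS allows it
    rescue SystemCallError
      FileUtils.rm_rf(to)      # force out the stale target; no error if absent
      FileUtils.cp(from, to)   # non-atomic, but succeeds where rename cannot
    end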
data/lib/ferret/store/index_io.rb CHANGED
@@ -213,20 +213,6 @@ module Ferret::Store
       last = start + length
       (start ... last).each do |i|
         write_byte(buf[i])
-        # code = buf[i]
-        # if code >= 0x01 and code <= 0x7F
-        #   write_byte(code)
-        # else
-        #   # We need to write unicode characters. ToDo: test that this works.
-        #   if code > 0x80 and code <= 0x7FF or code == 0
-        #     write_byte(0xC0 | code >> 6)
-        #     write_byte(0x80 | code & 0x3F)
-        #   else
-        #     write_byte(0xE0 | (code >> 12))
-        #     write_byte(0x80 | ((code >> 6) & 0x3F))
-        #     write_byte(0x80 | (code & 0x3F))
-        #   end
-        # end
       end
     end

data/lib/ferret/store/ram_store.rb CHANGED
@@ -159,9 +159,10 @@ module Ferret::Store
       flush()
       last_buffer_number = (@file.length / BUFFER_SIZE).to_i
       last_buffer_offset = @file.length % BUFFER_SIZE
-      @file.buffers.each_with_index do |buffer, i|
+
+      (0..last_buffer_number).each do |i|
         len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE)
-        output.write_bytes(buffer, len)
+        output.write_bytes(@file.buffers[i], len)
       end
     end

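A worked example of the loop-bound fix above, with an assumed BUFFER_SIZE of 1024 and a hypothetical file length: the fix ties the iteration to the file's logical length rather than to however many buffers happen to be allocated, since @file.buffers may hold extra preallocated buffers past the end of the file.

    BUFFER_SIZE = 1024   # assumed for illustration
    file_length = 2560   # hypothetical @file.length
    last_buffer_number = (file_length / BUFFER_SIZE).to_i  # => 2
    last_buffer_offset = file_length % BUFFER_SIZE         # => 512
    # write_to emits 1024 + 1024 + 512 bytes: buffers 0 and 1 in full,
    # buffer 2 only up to the 512-byte offset.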
data/lib/rferret.rb CHANGED
@@ -23,7 +23,7 @@ $: << File.dirname(__FILE__)
 #++
 # :include: ../TUTORIAL
 module Ferret
-  VERSION = '0.9.1'
+  VERSION = '0.9.2'
 end

 $ferret_pure_ruby = true
data/test/unit/analysis/ctc_analyzer.rb CHANGED
@@ -399,3 +399,134 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
     assert(! t.next())
   end
 end
+
+class RegExpAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_reg_exp_analyzer()
+    input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    a = RegExpAnalyzer.new()
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one_Two three")
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    t = t2
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    a = RegExpAnalyzer.new(/\w{2,}/, false)
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one Two three")
+    assert_equal(Token.new('DBalmain', 0, 8), t.next)
+    assert_equal(Token.new('gmail', 9, 14), t.next)
+    assert_equal(Token.new('com', 15, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('mail', 27, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http', 55, 59), t.next)
+    assert_equal(Token.new('www', 62, 65), t.next)
+    assert_equal(Token.new('google', 66, 72), t.next)
+    assert_equal(Token.new('com', 73, 76), t.next)
+    assert_equal(Token.new('RESULT_3', 77, 85), t.next)
+    assert_equal(Token.new('html', 86, 90), t.next)
+    assert_equal(Token.new('123', 98, 101), t.next)
+    assert_equal(Token.new('1235', 102, 106), t.next)
+    assert_equal(Token.new('ASD', 107, 110), t.next)
+    assert_equal(Token.new('1234', 111, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('Rob', 119, 122), t.next)
+    assert(! t.next())
+    assert_equal(Token.new("one", 0, 3), t2.next())
+    assert_equal(Token.new("Two", 4, 7), t2.next())
+    assert_equal(Token.new("three", 8, 13), t2.next())
+    assert(! t2.next())
+    a = RegExpAnalyzer.new() do |str|
+      if str =~ /^[[:alpha:]]\.([[:alpha:]]\.)+$/
+        str.gsub!(/\./, '')
+      elsif str =~ /'[sS]$/
+        str.gsub!(/'[sS]$/, '')
+      end
+      str
+    end
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one's don't T.N.T.")
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('tnt', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob', 119, 124), t.next)
+    assert(! t.next())
+    assert_equal(Token.new("one", 0, 5), t2.next())
+    assert_equal(Token.new("don't", 6, 11), t2.next())
+    assert_equal(Token.new("tnt", 12, 18), t2.next())
+    assert(! t2.next())
+  end
+end
+
+module Ferret::Analysis
+  class StemmingStandardAnalyzer < StandardAnalyzer
+    def token_stream(field, text)
+      StemFilter.new(super)
+    end
+  end
+end
+
+class CustomAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_custom_filter()
+    input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    a = StemmingStandardAnalyzer.new()
+    t = a.token_stream("fieldname", input)
+    assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('www.google.com/result', 55, 84), t.next)
+    assert_equal(Token.new('tnt', 86, 91), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+    assert_equal(Token.new('23', 111, 113), t.next)
+    assert_equal(Token.new('áägç', 117, 124), t.next)
+    assert_equal(Token.new('êëì', 126, 132), t.next)
+    assert_equal(Token.new('úøã', 134, 140), t.next)
+    assert_equal(Token.new('öîí', 142, 148), t.next)
+    assert(! t.next())
+    input = "Debate Debates DEBATED DEBating Debater";
+    t = a.token_stream("fieldname", input)
+    assert_equal(Token.new("debat", 0, 6), t.next)
+    assert_equal(Token.new("debat", 7, 14), t.next)
+    assert_equal(Token.new("debat", 15, 22), t.next)
+    assert_equal(Token.new("debat", 23, 31), t.next)
+    assert_equal(Token.new("debat", 32, 39), t.next)
+    assert(! t.next())
+    input = "Dêbate dêbates DÊBATED DÊBATing dêbater";
+    t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
+    assert_equal(Token.new("dêbate", 0, 7), t.next)
+    assert_equal(Token.new("dêbate", 8, 16), t.next)
+    assert_equal(Token.new("dêbate", 17, 25), t.next)
+    assert_equal(Token.new("dêbate", 26, 35), t.next)
+    assert_equal(Token.new("dêbater", 36, 44), t.next)
+    assert(! t.next())
+  end
+end
data/test/unit/analysis/ctc_tokenstream.rb CHANGED
@@ -205,7 +205,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
   include Ferret::Analysis

   def test_standard_tokenizer()
-    input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
+    input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊËÌ¯ÚØÃ¬ÖÎÍ'
     t = StandardTokenizer.new(input)
     assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
     assert_equal(Token.new('is', 19, 21), t.next)
@@ -214,7 +214,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('Address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
     assert_equal(Token.new('TNT', 86, 91), t.next)
     assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -235,7 +235,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
     assert_equal(Token.new('52', 32, 34), t.next)
     assert_equal(Token.new('address', 40, 47), t.next)
     assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
     assert_equal(Token.new('tnt', 86, 91), t.next)
     assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
     assert_equal(Token.new('23', 111, 113), t.next)
@@ -247,6 +247,97 @@ class StandardTokenizerTest < Test::Unit::TestCase
   end
 end

+class RegExpTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  ALPHA = /[[:alpha:]_-]+/
+  APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
+  ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
+  ACRONYM_WORD = /^#{ACRONYM}$/
+  APOSTROPHE_WORD = /^#{APOSTROPHE}$/
+
+  def test_reg_exp_tokenizer()
+    input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    t = RegExpTokenizer.new(input)
+    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/RESULT_3.html', 55, 90), t.next)
+    assert_equal(Token.new('T.N.T.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-ASD-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('Rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = LowerCaseFilter.new(RegExpTokenizer.new(input))
+    t2 = LowerCaseFilter.new(RegExpTokenizer.new(input, /\w{2,}/))
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    assert_equal(Token.new('dbalmain', 0, 8), t2.next)
+    assert_equal(Token.new('gmail', 9, 14), t2.next)
+    assert_equal(Token.new('com', 15, 18), t2.next)
+    assert_equal(Token.new('is', 19, 21), t2.next)
+    assert_equal(Token.new('my', 22, 24), t2.next)
+    assert_equal(Token.new('mail', 27, 31), t2.next)
+    assert_equal(Token.new('52', 32, 34), t2.next)
+    assert_equal(Token.new('address', 40, 47), t2.next)
+    assert_equal(Token.new('23', 49, 51), t2.next)
+    assert_equal(Token.new('http', 55, 59), t2.next)
+    assert_equal(Token.new('www', 62, 65), t2.next)
+    assert_equal(Token.new('google', 66, 72), t2.next)
+    assert_equal(Token.new('com', 73, 76), t2.next)
+    assert_equal(Token.new('result_3', 77, 85), t2.next)
+    assert_equal(Token.new('html', 86, 90), t2.next)
+    assert_equal(Token.new('123', 98, 101), t2.next)
+    assert_equal(Token.new('1235', 102, 106), t2.next)
+    assert_equal(Token.new('asd', 107, 110), t2.next)
+    assert_equal(Token.new('1234', 111, 115), t2.next)
+    assert_equal(Token.new('23', 116, 118), t2.next)
+    assert_equal(Token.new('rob', 119, 122), t2.next)
+    assert(! t2.next())
+    t = RegExpTokenizer.new(input) do |str|
+      if str =~ ACRONYM_WORD
+        str.gsub!(/\./, '')
+      elsif str =~ APOSTROPHE_WORD
+        str.gsub!(/'[sS]$/, '')
+      end
+      str
+    end
+    t = LowerCaseFilter.new(t)
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('tnt', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob', 119, 124), t.next)
+    assert(! t.next())
+  end
+end
+
 class StopFilterTest < Test::Unit::TestCase
   include Ferret::Analysis

@@ -383,11 +474,9 @@ module Ferret::Analysis
     def next()
       t = @input.next()

-      if (t == nil)
-        return nil
-      end
+      return nil if (t.nil?)

-      t.text = t.text[0,1].upcase + t.text[1..-1]
+      t.text = t.text.capitalize

       return t
     end
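The test changes just below (expecting "Should" where "SHOULD" used to pass) follow directly from this rewrite: the old expression only upcased the first character and left the rest untouched, while String#capitalize also downcases the remainder.

    str = "SHOULD"
    str[0,1].upcase + str[1..-1]  # => "SHOULD" (old behavior)
    str.capitalize                # => "Should" (new behavior)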
@@ -402,7 +491,7 @@ class CustomFilterTest < Test::Unit::TestCase
     t = CapitalizeFilter.new(AsciiLetterTokenizer.new(input))
     assert_equal(Token.new("This", 0, 4), t.next)
     assert_equal(Token.new("Text", 5, 9), t.next)
-    assert_equal(Token.new("SHOULD", 10, 16), t.next)
+    assert_equal(Token.new("Should", 10, 16), t.next)
     assert_equal(Token.new("Be", 17, 19), t.next)
     assert_equal(Token.new("Capitalized", 20, 31), t.next)
     assert_equal(Token.new("I", 36, 37), t.next)
@@ -412,7 +501,7 @@ class CustomFilterTest < Test::Unit::TestCase
     t = StemFilter.new(CapitalizeFilter.new(AsciiLetterTokenizer.new(input)))
     assert_equal(Token.new("This", 0, 4), t.next)
     assert_equal(Token.new("Text", 5, 9), t.next)
-    assert_equal(Token.new("SHOULD", 10, 16), t.next)
+    assert_equal(Token.new("Should", 10, 16), t.next)
     assert_equal(Token.new("Be", 17, 19), t.next)
     assert_equal(Token.new("Capit", 20, 31), t.next)
     assert_equal(Token.new("I", 36, 37), t.next)