ferret 0.9.1 → 0.9.2

Files changed (105)
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
@@ -173,7 +173,7 @@ module Ferret::Search
 
   # See Query#extract_terms()
   def extract_terms(query_terms)
-    query_terms.add_all(@terms)
+    query_terms.merge(@terms)
  end
 
  # Prints a user-readable version of this query.
@@ -62,15 +62,44 @@ module Ferret::Search
 
   # Expert: called when re-writing queries under MultiSearcher.
   #
-  # Only implemented by derived queries, with no #create_weight()
-  # implementatation.
+  # Create a single query suitable for use by all subsearchers (in 1-1
+  # correspondence with queries). This is an optimization of the OR of
+  # all queries. We handle the common optimization cases of equal
+  # queries and overlapping clauses of boolean OR queries (as generated
+  # by MultiTermQuery.rewrite() and RangeQuery.rewrite()).
+  # Be careful overriding this method as queries[0] determines which
+  # method will be called and is not necessarily of the same type as
+  # the other queries.
  def combine(queries)
+    uniques = Set.new
    queries.each do |query|
-      if self != query
-        raise ArgumentError
+      clauses = []
+      # check if we can split the query into clauses
+      splittable = query.respond_to? :clauses
+      if splittable
+        splittable = query.coord_disabled?
+        clauses = query.clauses
+        clauses.each do |clause|
+          splittable = clause.occur == BooleanClause::Occur::SHOULD
+          break unless splittable
+        end
+      end
+      if splittable
+        clauses.each { |clause| uniques << clause.query }
+      else
+        uniques << query
      end
    end
-    return self
+    # optimization: if we have just one query, just return it
+    if uniques.size == 1
+      uniques.each { |query| return query }
+    end
+
+    result = BooleanQuery.new(true)
+    uniques.each do |query|
+      result.add_query(query, BooleanClause::Occur::SHOULD)
+    end
+    return result
  end
 
  # Expert: adds all terms occuring in this query to the terms set
@@ -84,8 +84,12 @@ module Ferret::Search
      fields = fields.map {|field| field.is_a?(Symbol) ? field.to_s : field}
      if fields[0].is_a?(String)
        @fields = fields.map do |field|
-          SortField.new(field, {:sort_type => SortField::SortType::AUTO,
-                                :reverse => reverse})
+          if (field.is_a?(String))
+            next SortField.new(field, {:sort_type => SortField::SortType::AUTO,
+                                       :reverse => reverse})
+          else
+            next field
+          end
        end
      end
      doc_sort_added = false
@@ -102,7 +106,7 @@ module Ferret::Search
    INDEX_ORDER = Sort.new(SortField::FIELD_DOC)
 
    def to_s()
-      return @fields.map {|field| "#{field}"}.join(", ")
+      return "Sort[" + @fields.map {|field| "#{field}"}.join(", ") + "]"
    end
  end
end
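
A quick sketch of what the mixed-field handling above permits. The field names are invented, and the printed form is only a guess from the new to_s implementations ("Sort[...]" wrapping, "name:<type>" per field, '!' marking reverse):

  include Ferret::Search

  # Plain strings become AUTO-typed SortFields; ready-made SortFields
  # now pass through the same array untouched.
  sort = Sort.new(["title",
                   SortField.new("age", :sort_type => SortField::SortType::INTEGER,
                                        :reverse => true)])
  puts sort   # => something like Sort[title:<auto>, age:<integer>!, <DOC>]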
@@ -20,11 +20,11 @@ module Ferret::Search
 
    # Sort by document score (relevancy). Sort values are Float and higher
    # values are at the front.
-    SCORE = SortType.new("score")
+    SCORE = SortType.new("SCORE")
 
    # Sort by document number (order). Sort values are Integer and lower
    # values are at the front.
-    DOC = SortType.new("doc")
+    DOC = SortType.new("DOC")
 
    # Guess sort type of sort based on field contents. We try parsing the
    # field as an integer and then as a floating point number. If we are
@@ -37,7 +37,7 @@ module Ferret::Search
 
    # Sort using term values as encoded Integers. Sort values are Integer
    # and lower values are at the front.
-    INTEGER = SortType.new("int", lambda{|str| str.to_i})
+    INTEGER = SortType.new("integer", lambda{|str| str.to_i})
 
    # Sort using term values as encoded Floats. Sort values are Float and
    # lower values are at the front.
@@ -79,7 +79,11 @@ module Ferret::Search
    FIELD_DOC = SortField.new(nil, {:sort_type => SortType::DOC})
 
    def to_s()
-      buffer = '"' + (@name||"<#{@sort_type}>") + '"'
+      if @name
+        buffer = "#@name:<#@sort_type>"
+      else
+        buffer = "<#{@sort_type}>"
+      end
      buffer << '!' if @reverse
      return buffer
    end
@@ -102,13 +102,13 @@ module Ferret::Store
      # delete all the files
      refresh_dir
      each do |fname|
-        File.delete(dir_path(fname))
+        FileUtils.rm_rf(dir_path(fname))
      end
      # clear all the locks
      refresh_lock_dir
      @lock_dir.each do |lock_fname|
        next if lock_fname == '.' or lock_fname == '..'
-        File.delete(@lock_dir.path + '/' + lock_fname)
+        FileUtils.rm_rf(@lock_dir.path + '/' + lock_fname)
      end
    end
  end
@@ -159,7 +159,13 @@ module Ferret::Store
    # This replacement should be atomic.
    def rename(from, to)
      synchronize do
-        File.rename(dir_path(from), dir_path(to))
+        begin
+          File.rename(dir_path(from), dir_path(to))
+        rescue
+          # try again, this time forcing the delete
+          FileUtils.rm_rf(dir_path(to))
+          FileUtils.cp(dir_path(from), dir_path(to))
+        end
      end
    end
 
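
The begin/rescue added above guards against platforms (notably Windows) where File.rename raises when the destination file already exists. A standalone sketch of the same pattern, not Ferret code:

  require 'fileutils'

  def replace_file(from, to)
    File.rename(from, to)   # atomic where the OS supports it
  rescue SystemCallError
    # Force-delete the target, then fall back to a copy. Note that the
    # copy, unlike a rename, leaves the source file behind.
    FileUtils.rm_rf(to)
    FileUtils.cp(from, to)
  end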
@@ -208,11 +214,11 @@ module Ferret::Store
      def initialize(lock_file)
        @lock_file = lock_file
        #@clean = FSLock.make_finalizer(lock_file)
-        @clean = lambda { File.delete(lock_file) rescue nil}
+        @clean = lambda { FileUtils.rm_rf(lock_file)}
      end
 
      def FSLock.make_finalizer(lock_file)
-        lambda { File.delete(lock_file) rescue nil}
+        lambda { FileUtils.rm_rf(lock_file)}
      end
 
      # obtain the lock on the data source
@@ -238,7 +244,7 @@ module Ferret::Store
      def release
        return if FSDirectory.locks_disabled?
        begin
-          File.delete(@lock_file)
+          FileUtils.rm_rf(@lock_file)
          ObjectSpace.undefine_finalizer(self)
        rescue SystemCallError
          # maybe we tried to release a lock that wasn't locked. This
@@ -364,6 +370,7 @@ module Ferret::Store
      # This method is only used by the c extension to free the directory
      def close_internal
      end
+
      #end private
    end
  end
@@ -213,20 +213,6 @@ module Ferret::Store
      last = start + length
      (start ... last).each do |i|
        write_byte(buf[i])
-        # code = buf[i]
-        # if code >= 0x01 and code <= 0x7F
-        #   write_byte(code)
-        # else
-        #   # We need to write unicode characters. ToDo: test that this works.
-        #   if code > 0x80 and code <= 0x7FF or code == 0
-        #     write_byte(0xC0 | code >> 6)
-        #     write_byte(0x80 | code & 0x3F)
-        #   else
-        #     write_byte(0xE0 | (code >> 12))
-        #     write_byte(0x80 | ((code >> 6) & 0x3F))
-        #     write_byte(0x80 | (code & 0x3F))
-        #   end
-        # end
      end
    end
 
@@ -159,9 +159,10 @@ module Ferret::Store
      flush()
      last_buffer_number = (@file.length / BUFFER_SIZE).to_i
      last_buffer_offset = @file.length % BUFFER_SIZE
-      @file.buffers.each_with_index do |buffer, i|
+
+      (0..last_buffer_number).each do |i|
        len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE)
-        output.write_bytes(buffer, len)
+        output.write_bytes(@file.buffers[i], len)
      end
    end
 
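
The fix above stops the writer from flushing every allocated buffer and instead walks only the buffers that hold live data, trimming the last one to the logical file length. A toy illustration of the arithmetic (the BUFFER_SIZE value here is invented for the example):

  BUFFER_SIZE = 1024
  file_length = 2500    # logical length of the RAM file

  last_buffer_number = (file_length / BUFFER_SIZE).to_i   # => 2
  last_buffer_offset = file_length % BUFFER_SIZE          # => 452

  (0..last_buffer_number).each do |i|
    len = (i == last_buffer_number ? last_buffer_offset : BUFFER_SIZE)
    puts "buffer #{i}: write #{len} bytes"
  end
  # buffers 0 and 1 are written in full; buffer 2 only up to offset 452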
data/lib/rferret.rb CHANGED
@@ -23,7 +23,7 @@ $: << File.dirname(__FILE__)
#++
# :include: ../TUTORIAL
module Ferret
-  VERSION = '0.9.1'
+  VERSION = '0.9.2'
end
 
$ferret_pure_ruby = true
@@ -399,3 +399,134 @@ class PerFieldAnalyzerTest < Test::Unit::TestCase
     assert(! t.next())
   end
 end
+
+class RegExpAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_reg_exp_analyzer()
+    input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    a = RegExpAnalyzer.new()
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one_Two three")
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    t = t2
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    a = RegExpAnalyzer.new(/\w{2,}/, false)
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one Two three")
+    assert_equal(Token.new('DBalmain', 0, 8), t.next)
+    assert_equal(Token.new('gmail', 9, 14), t.next)
+    assert_equal(Token.new('com', 15, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('mail', 27, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http', 55, 59), t.next)
+    assert_equal(Token.new('www', 62, 65), t.next)
+    assert_equal(Token.new('google', 66, 72), t.next)
+    assert_equal(Token.new('com', 73, 76), t.next)
+    assert_equal(Token.new('RESULT_3', 77, 85), t.next)
+    assert_equal(Token.new('html', 86, 90), t.next)
+    assert_equal(Token.new('123', 98, 101), t.next)
+    assert_equal(Token.new('1235', 102, 106), t.next)
+    assert_equal(Token.new('ASD', 107, 110), t.next)
+    assert_equal(Token.new('1234', 111, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('Rob', 119, 122), t.next)
+    assert(! t.next())
+    assert_equal(Token.new("one", 0, 3), t2.next())
+    assert_equal(Token.new("Two", 4, 7), t2.next())
+    assert_equal(Token.new("three", 8, 13), t2.next())
+    assert(! t2.next())
+    a = RegExpAnalyzer.new() do |str|
+      if str =~ /^[[:alpha:]]\.([[:alpha:]]\.)+$/
+        str.gsub!(/\./, '')
+      elsif str =~ /'[sS]$/
+        str.gsub!(/'[sS]$/, '')
+      end
+      str
+    end
+    t = a.token_stream('XXX', input)
+    t2 = a.token_stream('XXX', "one's don't T.N.T.")
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('tnt', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob', 119, 124), t.next)
+    assert(! t.next())
+    assert_equal(Token.new("one", 0, 5), t2.next())
+    assert_equal(Token.new("don't", 6, 11), t2.next())
+    assert_equal(Token.new("tnt", 12, 18), t2.next())
+    assert(! t2.next())
+  end
+end
+
+module Ferret::Analysis
+  class StemmingStandardAnalyzer < StandardAnalyzer
+    def token_stream(field, text)
+      StemFilter.new(super)
+    end
+  end
+end
+
+class CustomAnalyzerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  def test_custom_filter()
+    input = 'DBalmán@gmail.com is My e-mail and the Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
+    a = StemmingStandardAnalyzer.new()
+    t = a.token_stream("fieldname", input)
+    assert_equal(Token.new('dbalmán@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('www.google.com/result', 55, 84), t.next)
+    assert_equal(Token.new('tnt', 86, 91), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
+    assert_equal(Token.new('23', 111, 113), t.next)
+    assert_equal(Token.new('áägç', 117, 124), t.next)
+    assert_equal(Token.new('êëì', 126, 132), t.next)
+    assert_equal(Token.new('úøã', 134, 140), t.next)
+    assert_equal(Token.new('öîí', 142, 148), t.next)
+    assert(! t.next())
+    input = "Debate Debates DEBATED DEBating Debater";
+    t = a.token_stream("fieldname", input)
+    assert_equal(Token.new("debat", 0, 6), t.next)
+    assert_equal(Token.new("debat", 7, 14), t.next)
+    assert_equal(Token.new("debat", 15, 22), t.next)
+    assert_equal(Token.new("debat", 23, 31), t.next)
+    assert_equal(Token.new("debat", 32, 39), t.next)
+    assert(! t.next())
+    input = "Dêbate dêbates DÊBATED DÊBATing dêbater";
+    t = StemFilter.new(LowerCaseFilter.new(LetterTokenizer.new(input)), :english)
+    assert_equal(Token.new("dêbate", 0, 7), t.next)
+    assert_equal(Token.new("dêbate", 8, 16), t.next)
+    assert_equal(Token.new("dêbate", 17, 25), t.next)
+    assert_equal(Token.new("dêbate", 26, 35), t.next)
+    assert_equal(Token.new("dêbater", 36, 44), t.next)
+    assert(! t.next())
+  end
+end
@@ -205,7 +205,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
  include Ferret::Analysis
 
  def test_standard_tokenizer()
-    input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/results/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
+    input = 'DBalmán@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/res_345/ T.N.T. 123-1235-ASD-1234 23#@$ ÁÄGÇ®ÊË̯ÚØìÖÎÍ'
    t = StandardTokenizer.new(input)
    assert_equal(Token.new('DBalmán@gmail.com', 0, 18), t.next)
    assert_equal(Token.new('is', 19, 21), t.next)
@@ -214,7 +214,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('Address', 40, 47), t.next)
    assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
    assert_equal(Token.new('TNT', 86, 91), t.next)
    assert_equal(Token.new('123-1235-ASD-1234', 93, 110), t.next)
    assert_equal(Token.new('23', 111, 113), t.next)
@@ -235,7 +235,7 @@ class StandardTokenizerTest < Test::Unit::TestCase
    assert_equal(Token.new('52', 32, 34), t.next)
    assert_equal(Token.new('address', 40, 47), t.next)
    assert_equal(Token.new('23', 49, 51), t.next)
-    assert_equal(Token.new('www.google.com/results', 55, 84), t.next)
+    assert_equal(Token.new('www.google.com/res_345', 55, 84), t.next)
    assert_equal(Token.new('tnt', 86, 91), t.next)
    assert_equal(Token.new('123-1235-asd-1234', 93, 110), t.next)
    assert_equal(Token.new('23', 111, 113), t.next)
@@ -247,6 +247,97 @@ class StandardTokenizerTest < Test::Unit::TestCase
   end
 end
 
+class RegExpTokenizerTest < Test::Unit::TestCase
+  include Ferret::Analysis
+
+  ALPHA = /[[:alpha:]_-]+/
+  APOSTROPHE = /#{ALPHA}('#{ALPHA})+/
+  ACRONYM = /#{ALPHA}\.(#{ALPHA}\.)+/
+  ACRONYM_WORD = /^#{ACRONYM}$/
+  APOSTROPHE_WORD = /^#{APOSTROPHE}$/
+
+  def test_reg_exp_tokenizer()
+    input = 'DBalmain@gmail.com is My e-mail 52 #$ Address. 23#@$ http://www.google.com/RESULT_3.html T.N.T. 123-1235-ASD-1234 23 Rob\'s'
+    t = RegExpTokenizer.new(input)
+    assert_equal(Token.new('DBalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('My', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('Address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/RESULT_3.html', 55, 90), t.next)
+    assert_equal(Token.new('T.N.T.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-ASD-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('Rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    t.text = "one_two three"
+    assert_equal(Token.new("one_two", 0, 7), t.next())
+    assert_equal(Token.new("three", 8, 13), t.next())
+    assert(! t.next())
+    t = LowerCaseFilter.new(RegExpTokenizer.new(input))
+    t2 = LowerCaseFilter.new(RegExpTokenizer.new(input, /\w{2,}/))
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('t.n.t.', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob\'s', 119, 124), t.next)
+    assert(! t.next())
+    assert_equal(Token.new('dbalmain', 0, 8), t2.next)
+    assert_equal(Token.new('gmail', 9, 14), t2.next)
+    assert_equal(Token.new('com', 15, 18), t2.next)
+    assert_equal(Token.new('is', 19, 21), t2.next)
+    assert_equal(Token.new('my', 22, 24), t2.next)
+    assert_equal(Token.new('mail', 27, 31), t2.next)
+    assert_equal(Token.new('52', 32, 34), t2.next)
+    assert_equal(Token.new('address', 40, 47), t2.next)
+    assert_equal(Token.new('23', 49, 51), t2.next)
+    assert_equal(Token.new('http', 55, 59), t2.next)
+    assert_equal(Token.new('www', 62, 65), t2.next)
+    assert_equal(Token.new('google', 66, 72), t2.next)
+    assert_equal(Token.new('com', 73, 76), t2.next)
+    assert_equal(Token.new('result_3', 77, 85), t2.next)
+    assert_equal(Token.new('html', 86, 90), t2.next)
+    assert_equal(Token.new('123', 98, 101), t2.next)
+    assert_equal(Token.new('1235', 102, 106), t2.next)
+    assert_equal(Token.new('asd', 107, 110), t2.next)
+    assert_equal(Token.new('1234', 111, 115), t2.next)
+    assert_equal(Token.new('23', 116, 118), t2.next)
+    assert_equal(Token.new('rob', 119, 122), t2.next)
+    assert(! t2.next())
+    t = RegExpTokenizer.new(input) do |str|
+      if str =~ ACRONYM_WORD
+        str.gsub!(/\./, '')
+      elsif str =~ APOSTROPHE_WORD
+        str.gsub!(/'[sS]$/, '')
+      end
+      str
+    end
+    t = LowerCaseFilter.new(t)
+    assert_equal(Token.new('dbalmain@gmail.com', 0, 18), t.next)
+    assert_equal(Token.new('is', 19, 21), t.next)
+    assert_equal(Token.new('my', 22, 24), t.next)
+    assert_equal(Token.new('e-mail', 25, 31), t.next)
+    assert_equal(Token.new('52', 32, 34), t.next)
+    assert_equal(Token.new('address', 40, 47), t.next)
+    assert_equal(Token.new('23', 49, 51), t.next)
+    assert_equal(Token.new('http://www.google.com/result_3.html', 55, 90), t.next)
+    assert_equal(Token.new('tnt', 91, 97), t.next)
+    assert_equal(Token.new('123-1235-asd-1234', 98, 115), t.next)
+    assert_equal(Token.new('23', 116, 118), t.next)
+    assert_equal(Token.new('rob', 119, 124), t.next)
+    assert(! t.next())
+  end
+end
+
 class StopFilterTest < Test::Unit::TestCase
   include Ferret::Analysis
 
@@ -383,11 +474,9 @@ module Ferret::Analysis
    def next()
      t = @input.next()
 
-      if (t == nil)
-        return nil
-      end
+      return nil if (t.nil?)
 
-      t.text = t.text[0,1].upcase + t.text[1..-1]
+      t.text = t.text.capitalize
 
      return t
    end
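
The switch to String#capitalize also explains the changed expectations in the tests below: capitalize downcases the rest of the string, whereas the old expression only upcased the first character and left the remainder untouched:

  "SHOULD"[0,1].upcase + "SHOULD"[1..-1]   # => "SHOULD"
  "SHOULD".capitalize                      # => "Should"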
@@ -402,7 +491,7 @@ class CustomFilterTest < Test::Unit::TestCase
    t = CapitalizeFilter.new(AsciiLetterTokenizer.new(input))
    assert_equal(Token.new("This", 0, 4), t.next)
    assert_equal(Token.new("Text", 5, 9), t.next)
-    assert_equal(Token.new("SHOULD", 10, 16), t.next)
+    assert_equal(Token.new("Should", 10, 16), t.next)
    assert_equal(Token.new("Be", 17, 19), t.next)
    assert_equal(Token.new("Capitalized", 20, 31), t.next)
    assert_equal(Token.new("I", 36, 37), t.next)
@@ -412,7 +501,7 @@ class CustomFilterTest < Test::Unit::TestCase
    t = StemFilter.new(CapitalizeFilter.new(AsciiLetterTokenizer.new(input)))
    assert_equal(Token.new("This", 0, 4), t.next)
    assert_equal(Token.new("Text", 5, 9), t.next)
-    assert_equal(Token.new("SHOULD", 10, 16), t.next)
+    assert_equal(Token.new("Should", 10, 16), t.next)
    assert_equal(Token.new("Be", 17, 19), t.next)
    assert_equal(Token.new("Capit", 20, 31), t.next)
    assert_equal(Token.new("I", 36, 37), t.next)