ferret 0.10.1 → 0.10.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/Rakefile +7 -1
  2. data/ext/analysis.c +21 -13
  3. data/ext/array.c +1 -1
  4. data/ext/bitvector.c +2 -2
  5. data/ext/defines.h +0 -6
  6. data/ext/except.c +6 -6
  7. data/ext/except.h +12 -8
  8. data/ext/extconf.rb +1 -0
  9. data/ext/ferret.c +4 -0
  10. data/ext/ferret.h +1 -0
  11. data/ext/fs_store.c +18 -4
  12. data/ext/global.c +18 -16
  13. data/ext/global.h +7 -2
  14. data/ext/hash.c +1 -1
  15. data/ext/helper.c +1 -1
  16. data/ext/helper.h +1 -1
  17. data/ext/inc/lang.h +7 -1
  18. data/ext/ind.c +4 -4
  19. data/ext/ind.h +3 -3
  20. data/ext/index.c +33 -26
  21. data/ext/index.h +1 -1
  22. data/ext/lang.h +7 -1
  23. data/ext/mem_pool.c +1 -1
  24. data/ext/mem_pool.h +1 -1
  25. data/ext/q_fuzzy.c +2 -2
  26. data/ext/q_match_all.c +2 -2
  27. data/ext/q_multi_term.c +1 -1
  28. data/ext/q_parser.c +60 -52
  29. data/ext/r_analysis.c +6 -4
  30. data/ext/r_index.c +57 -4
  31. data/ext/r_search.c +1 -1
  32. data/ext/r_utils.c +1 -1
  33. data/ext/ram_store.c +1 -1
  34. data/ext/search.c +4 -4
  35. data/ext/sort.c +3 -3
  36. data/ext/store.c +9 -9
  37. data/ext/store.h +4 -4
  38. data/ext/tags +7841 -0
  39. data/ext/term_vectors.c +3 -3
  40. data/lib/ferret/index.rb +69 -7
  41. data/test/test_helper.rb +3 -2
  42. data/test/unit/analysis/tc_token_stream.rb +1 -0
  43. data/test/unit/index/tc_index.rb +157 -2
  44. data/test/unit/index/tc_index_reader.rb +108 -5
  45. data/test/unit/query_parser/tc_query_parser.rb +2 -1
  46. data/test/unit/search/tc_index_searcher.rb +1 -1
  47. data/test/unit/search/tc_multi_searcher.rb +2 -1
  48. data/test/unit/search/tc_spans.rb +1 -1
  49. data/test/unit/store/tc_fs_store.rb +6 -3
  50. data/test/unit/ts_analysis.rb +1 -1
  51. data/test/unit/ts_utils.rb +1 -1
  52. data/test/unit/utils/tc_number_tools.rb +1 -1
  53. metadata +138 -137
data/ext/term_vectors.c CHANGED
@@ -250,10 +250,10 @@ TermVectorsWriter *tvw_open(Store *store, const char *segment, FieldInfos *fis)
250
250
  tvw->fis = fis;
251
251
  tvw->fields = ary_new_type_capa(TVField, TV_FIELD_INIT_CAPA);
252
252
 
253
- sprintf(file_name, "%s.tvx", segment);
253
+ snprintf(file_name, SEGMENT_NAME_MAX_LENGTH, "%s.tvx", segment);
254
254
  tvw->tvx_out = store->new_output(store, file_name);
255
255
 
256
- sprintf(file_name, "%s.tvd", segment);
256
+ snprintf(file_name, SEGMENT_NAME_MAX_LENGTH, "%s.tvd", segment);
257
257
  tvw->tvd_out = store->new_output(store, file_name);
258
258
 
259
259
  return tvw;
@@ -335,7 +335,7 @@ void tvw_add_postings(TermVectorsWriter *tvw,
335
335
 
336
336
  if (fi_store_offsets(fi)) {
337
337
  /* use delta encoding for offsets */
338
- int last_end = 0;
338
+ int last_end = 0;
339
339
  os_write_vint(tvd_out, offset_count); /* write shared prefix length */
340
340
  for (i = 0; i < offset_count; i++) {
341
341
  int start = offsets[i].start;
data/lib/ferret/index.rb CHANGED
@@ -10,6 +10,7 @@ module Ferret::Index
10
10
  include Ferret::Search
11
11
 
12
12
  attr_reader :options
13
+
13
14
  # If you create an Index without any options, it'll simply create an index
14
15
  # in memory. But this class is highly configurable and every option that
15
16
  # you can supply to IndexWriter and QueryParser, you can also set here.
@@ -52,6 +53,10 @@ module Ferret::Index
52
53
  # concerned about performance. In that case you
53
54
  # should think about setting up a DRb indexing
54
55
  # service.
56
+ # lock_retry_time:: Default: 2 seconds. This parameter specifies how
57
+ # long to wait before retrying to obtain the
58
+ # commit lock when detecting if the IndexReader is
59
+ # at the latest version.
55
60
  #
56
61
  # Some examples;
57
62
  #
@@ -64,8 +69,14 @@ module Ferret::Index
64
69
  # index = Index::Index.new(:dir => directory,
65
70
  # :default_slop => 2,
66
71
  # :handle_parse_errors => false)
67
- #
68
- def initialize(options = {})
72
+ #
73
+ # You can also pass a block if you like. The index will be yielded and
74
+ # closed at the end of the block. For example;
75
+ #
76
+ # Ferret::I.new() do |index|
77
+ # # do stuff with index. Most of your actions will be cached.
78
+ # end
79
+ def initialize(options = {}, &block)
69
80
  super()
70
81
 
71
82
  if options[:key]
@@ -92,14 +103,19 @@ module Ferret::Index
92
103
  end
93
104
 
94
105
  options[:dir] = @dir
106
+ options[:lock_retry_time]||= 2
95
107
  @dir.extend(MonitorMixin)
96
108
  @dir.synchronize do
97
109
  @options = options
98
- @writer = IndexWriter.new(options) # create the index if need be
99
- options[:analyzer] = @analyzer = @writer.analyzer
100
- @writer.close
110
+ if (!@dir.exists?("segments")) || options[:create]
111
+ IndexWriter.new(options).close
112
+ end
113
+ options[:analyzer]||= Ferret::Analysis::StandardAnalyzer.new
114
+
115
+ @searcher = nil
101
116
  @writer = nil
102
117
  @reader = nil
118
+
103
119
  @options.delete(:create) # only want to create the first time if at all
104
120
  @auto_flush = @options[:auto_flush] || false
105
121
  if (@options[:id_field].nil? and
@@ -117,13 +133,51 @@ module Ferret::Index
117
133
  @open = true
118
134
  @qp = nil
119
135
  end
136
+ if block
137
+ yield self
138
+ self.close
139
+ end
140
+ end
141
+
142
+ # Returns an array of strings with the matches highlighted. The +query+ can
143
+ # either be a query String or a Ferret::Search::Query object. The doc_id is
144
+ # the id of the document you want to highlight (usually returned by the
145
+ # search methods). There are also a number of options you can pass;
146
+ #
147
+ # === Options
148
+ #
149
+ # :field:: Default: @options[:default_field]. The default_field
150
+ # is the field that is usually highlighted but you can
151
+ # specify which field you want to highlight here. If
152
+ # you want to highlight multiple fields then you will
153
+ # need to call this method multiple times.
154
+ # :excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
155
+ # terms will be in the centre of the excerpt.
156
+ # :num_excerpts:: Default: 2. Number of excerpts to return.
157
+ # :pre_tag:: Default: "<b>". Tag to place to the left of the
158
+ # match. You'll probably want to change this to a
159
+ # "<span>" tag with a class, or to "\033[7m" for use in a
160
+ # terminal.
161
+ # :post_tag:: Default: "</b>". This tag should close the
162
+ # +:pre_tag+. Try tag "\033[m" in the terminal.
163
+ # :ellipsis:: Default: "...". This is the string that is appended
164
+ # at the beginning and end of excerpts (unless the
165
+ # excerpt hits the start or end of the field). You'll
166
+ # probably want to change this to a Unicode ellipsis
167
+ # character.
168
+ def highlight(query, doc_id, options = {})
169
+ ensure_searcher_open()
170
+ @searcher.highlight(process_query(query),
171
+ doc_id,
172
+ options[:field]||@options[:default_field],
173
+ options)
120
174
  end
121
175
 
122
176
  # Closes this index by closing its associated reader and writer objects.
123
177
  def close
124
178
  @dir.synchronize do
125
179
  if not @open
126
- raise "tried to close an already closed directory"
180
+ raise(StandardError, "tried to close an already closed directory")
127
181
  end
128
182
  @searcher.close() if @searcher
129
183
  @reader.close() if @reader
@@ -534,7 +588,15 @@ module Ferret::Index
534
588
  def ensure_reader_open()
535
589
  raise "tried to use a closed index" if not @open
536
590
  if @reader
537
- if not @reader.latest?
591
+ latest = false
592
+ begin
593
+ latest = @reader.latest?
594
+ rescue LockException => le
595
+ sleep(@options[:lock_retry_time]) # wait lock_retry_time seconds (default 2), then try once more
596
+ latest = @reader.latest?
597
+ end
598
+ if not latest
599
+ @reader.close
538
600
  return @reader = IndexReader.new(@dir)
539
601
  end
540
602
  else
data/test/test_helper.rb CHANGED
@@ -3,9 +3,10 @@ $:.unshift File.join(File.dirname(__FILE__), '../lib')
3
3
  $:.unshift File.join(File.dirname(__FILE__), '../ext')
4
4
 
5
5
  class Float
6
- def =~(o)
7
- return (1 - self/o).abs < 0.00001
6
+ def approx_eql?(o)
7
+ return (1 - self/o).abs < 0.0001
8
8
  end
9
+ alias :=~ :approx_eql?
9
10
  end
10
11
 
11
12
  require 'test/unit'
@@ -1,5 +1,6 @@
1
1
  require File.dirname(__FILE__) + "/../../test_helper"
2
2
 
3
+ puts "Loading once"
3
4
  class TokenTest < Test::Unit::TestCase
4
5
  include Ferret::Analysis
5
6
  def test_token
@@ -9,7 +9,7 @@ class IndexTest < Test::Unit::TestCase
9
9
  def setup()
10
10
  end
11
11
 
12
- def tear_down()
12
+ def teardown()
13
13
  end
14
14
 
15
15
  def check_results(index, query, expected)
@@ -345,9 +345,10 @@ class IndexTest < Test::Unit::TestCase
345
345
  assert_equal(2, index2.size)
346
346
  assert_equal(2, index.size)
347
347
  top_docs = index.search("content3")
348
+
348
349
  assert_equal(0, top_docs.hits.size)
349
350
 
350
- iw = IndexWriter.new(:path => fs_path, :analyzer => WhiteSpaceAnalyzer.new())
351
+ iw = IndexWriter.new(:path => fs_path, :analyzer => WhiteSpaceAnalyzer.new)
351
352
  iw << {:f, "content3"}
352
353
  iw.close()
353
354
 
@@ -355,6 +356,7 @@ class IndexTest < Test::Unit::TestCase
355
356
  assert_equal(1, top_docs.hits.size)
356
357
  assert_equal(3, index.size)
357
358
  assert_equal("content3", index[2][:f])
359
+ index2.close
358
360
  index.close
359
361
  end
360
362
 
@@ -556,6 +558,7 @@ class IndexTest < Test::Unit::TestCase
556
558
 
557
559
  data = %q(one two three four five six seven eight nine ten eleven twelve)
558
560
  index1 = Index.new(:path => fs_path, :auto_flush => true, :key => :id)
561
+ index1 << "zero"
559
562
  index2 = Index.new(:path => fs_path, :auto_flush => true)
560
563
  begin
561
564
  data.each do |datum|
@@ -611,4 +614,156 @@ class IndexTest < Test::Unit::TestCase
611
614
  hits = i.search 'move or shake'
612
615
  assert_equal 1, hits.total_hits # fails when id field is present
613
616
  end
617
+
618
+ def test_threading
619
+ path = File.expand_path(File.join(File.dirname(__FILE__), '../../temp/fsdir'))
620
+ index = Ferret::Index::Index.new(:path => path, :create => true)
621
+
622
+ 100.times do |i|
623
+ buf = ''
624
+ doc = {}
625
+ doc[:id] = i
626
+ doc[:foo] = "foo #{i}"
627
+ index << doc
628
+ end
629
+
630
+ threads = []
631
+
632
+ 4.times do
633
+ threads << Thread.new(index) do |index|
634
+ result = index.search('id:42')
635
+ assert_equal(1, result.total_hits)
636
+ end
637
+ end
638
+
639
+ threads.each{|t| t.join }
640
+ end
641
+
642
+ def test_wildcard
643
+ i = nil
644
+ Ferret::I.new do |i|
645
+ i << "one"
646
+ assert_equal(1, i.search("*").total_hits)
647
+ i << "two"
648
+ assert_equal(2, i.search("*").total_hits)
649
+ i << {:content => "three"}
650
+ assert_equal(3, i.search("*").total_hits)
651
+ assert_equal(3, i.search("id:*").total_hits)
652
+ assert_equal(2, i.search('id:?*').total_hits)
653
+ end
654
+ assert_raise(StandardError) {i.close}
655
+ end
656
+
657
+ def test_highlighter()
658
+ index = Ferret::I.new(:default_field => :field,
659
+ :default_input_field => :field,
660
+ :analyzer => Ferret::Analysis::WhiteSpaceAnalyzer.new)
661
+ [
662
+ "the words we are searching for are one and two also " +
663
+ "sometimes looking for them as a phrase like this; one " +
664
+ "two lets see how it goes"
665
+ ].each {|doc| index << doc }
666
+
667
+ highlights = index.highlight("one", 0,
668
+ :excerpt_length => 10,
669
+ :num_excerpts => 1)
670
+
671
+ assert_equal(1, highlights.size)
672
+ assert_equal("...are <b>one</b>...", highlights[0])
673
+
674
+ highlights = index.highlight("one", 0,
675
+ :excerpt_length => 10,
676
+ :num_excerpts => 2)
677
+ assert_equal(2, highlights.size)
678
+ assert_equal("...are <b>one</b>...", highlights[0])
679
+ assert_equal("...this; <b>one</b>...", highlights[1])
680
+
681
+ highlights = index.highlight("one", 0,
682
+ :excerpt_length => 10,
683
+ :num_excerpts => 3)
684
+ assert_equal(3, highlights.size)
685
+ assert_equal("the words...", highlights[0])
686
+ assert_equal("...are <b>one</b>...", highlights[1])
687
+ assert_equal("...this; <b>one</b>...", highlights[2])
688
+
689
+ highlights = index.highlight("one", 0,
690
+ :excerpt_length => 10,
691
+ :num_excerpts => 4)
692
+ assert_equal(3, highlights.size)
693
+ assert_equal("the words we are...", highlights[0])
694
+ assert_equal("...are <b>one</b>...", highlights[1])
695
+ assert_equal("...this; <b>one</b>...", highlights[2])
696
+
697
+ highlights = index.highlight("one", 0,
698
+ :excerpt_length => 10,
699
+ :num_excerpts => 5)
700
+ assert_equal(2, highlights.size)
701
+ assert_equal("the words we are searching for are <b>one</b>...", highlights[0])
702
+ assert_equal("...this; <b>one</b>...", highlights[1])
703
+
704
+ highlights = index.highlight("one", 0,
705
+ :excerpt_length => 10,
706
+ :num_excerpts => 20)
707
+ assert_equal(1, highlights.size)
708
+ assert_equal("the words we are searching for are <b>one</b> and two also " +
709
+ "sometimes looking for them as a phrase like this; <b>one</b> " +
710
+ "two lets see how it goes", highlights[0])
711
+
712
+ highlights = index.highlight("one", 0,
713
+ :excerpt_length => 1000,
714
+ :num_excerpts => 1)
715
+ assert_equal(1, highlights.size)
716
+ assert_equal("the words we are searching for are <b>one</b> and two also " +
717
+ "sometimes looking for them as a phrase like this; <b>one</b> " +
718
+ "two lets see how it goes", highlights[0])
719
+
720
+ highlights = index.highlight("(one two)", 0,
721
+ :excerpt_length => 15,
722
+ :num_excerpts => 2)
723
+ assert_equal(2, highlights.size)
724
+ assert_equal("...<b>one</b> and <b>two</b>...", highlights[0])
725
+ assert_equal("...this; <b>one</b> <b>two</b>...", highlights[1])
726
+
727
+ highlights = index.highlight('one two "one two"', 0,
728
+ :excerpt_length => 15,
729
+ :num_excerpts => 2)
730
+ assert_equal(2, highlights.size)
731
+ assert_equal("...<b>one</b> and <b>two</b>...", highlights[0])
732
+ assert_equal("...this; <b>one two</b>...", highlights[1])
733
+
734
+ highlights = index.highlight('"one two"', 0,
735
+ :excerpt_length => 15,
736
+ :num_excerpts => 1)
737
+ assert_equal(1, highlights.size)
738
+ # should have a higher priority since it is the merger of three matches
739
+ assert_equal("...this; <b>one two</b>...", highlights[0])
740
+
741
+ highlights = index.highlight('"one two"', 0, :field => :not_a_field,
742
+ :excerpt_length => 15,
743
+ :num_excerpts => 1)
744
+ assert_nil(highlights)
745
+
746
+ highlights = index.highlight("wrong_field:one", 0, :field => :wrong_field,
747
+ :excerpt_length => 15,
748
+ :num_excerpts => 1)
749
+ assert_nil(highlights)
750
+
751
+ highlights = index.highlight('"the words" "for are one and two" ' +
752
+ 'words one two', 0,
753
+ :excerpt_length => 10,
754
+ :num_excerpts => 1)
755
+ assert_equal(1, highlights.size)
756
+ assert_equal("...<b>for are one and two</b>...", highlights[0])
757
+
758
+ highlights = index.highlight('"the words" "for are one and two" ' +
759
+ 'words one two', 0,
760
+ :excerpt_length => 10,
761
+ :num_excerpts => 2)
762
+ assert_equal(2, highlights.size)
763
+ assert_equal("<b>the words</b>...", highlights[0])
764
+ assert_equal("...<b>for are one and two</b>...", highlights[1])
765
+
766
+
767
+ index.close
768
+ end
614
769
  end
@@ -63,6 +63,13 @@ module IndexReaderCommon
63
63
  assert_equal(1, te.doc_freq)
64
64
  assert(!te.next?)
65
65
 
66
+ expected = %w{is 1 more 1 not 1 skip 42 stored 1 text 1 which 1}
67
+ te = @ir.terms(:text)
68
+ te.each do |term, doc_freq|
69
+ assert_equal(expected.shift, term)
70
+ assert_equal(expected.shift.to_i, doc_freq)
71
+ end
72
+
66
73
  te = @ir.terms_from(:body, "Not")
67
74
  assert_equal("Not", te.term)
68
75
  assert_equal(1, te.doc_freq)
@@ -177,7 +184,7 @@ module IndexReaderCommon
177
184
 
178
185
  def do_test_get_doc()
179
186
  doc = @ir.get_document(3)
180
- assert_equal([:year, :body, :title, :author], doc.fields)
187
+ [:author, :body, :title, :year].each {|fn| assert(doc.fields.include?(fn))}
181
188
  assert_equal(4, doc.fields.size)
182
189
  assert_equal(0, doc.size)
183
190
  assert_equal([], doc.keys)
@@ -296,6 +303,7 @@ module IndexReaderCommon
296
303
  assert_equal(doc_count, ir2.max_doc())
297
304
  assert_equal(doc_count, ir2.num_docs())
298
305
 
306
+ ir2.close
299
307
  ir2 = ir_new()
300
308
  assert(ir2.has_deletions?())
301
309
  assert_equal(doc_count, ir2.max_doc())
@@ -325,6 +333,7 @@ module IndexReaderCommon
325
333
  assert_equal(doc_count - 6, ir3.max_doc())
326
334
  assert_equal(doc_count - 6, ir3.num_docs())
327
335
 
336
+ ir2.close()
328
337
  ir3.close()
329
338
  end
330
339
  end
@@ -358,7 +367,7 @@ class MultiReaderTest < Test::Unit::TestCase
358
367
  @ir = ir_new()
359
368
  end
360
369
 
361
- def tear_down()
370
+ def teardown()
362
371
  @ir.close()
363
372
  @dir.close()
364
373
  end
@@ -406,9 +415,102 @@ class MultiExternalReaderTest < Test::Unit::TestCase
406
415
  @ir = ir_new
407
416
  end
408
417
 
409
- def tear_down()
418
+ def teardown()
419
+ @ir.close()
420
+ @dirs.each {|dir| dir.close}
421
+ end
422
+ end
423
+
424
+ class MultiExternalReaderDirTest < Test::Unit::TestCase
425
+ include IndexReaderCommon
426
+
427
+ def ir_new
428
+ IndexReader.new(@dirs)
429
+ end
430
+
431
+ def iw_optimize
432
+ @dirs.each do |dir|
433
+ iw = IndexWriter.new(:dir => dir, :analyzer => WhiteSpaceAnalyzer.new())
434
+ iw.optimize()
435
+ iw.close()
436
+ end
437
+ end
438
+
439
+ def setup()
440
+ @dirs = []
441
+
442
+ [
443
+ [0, 10],
444
+ [10, 30],
445
+ [30, IndexTestHelper::INDEX_TEST_DOCS.size]
446
+ ].each do |start, finish|
447
+ dir = Ferret::Store::RAMDirectory.new()
448
+ @dirs << dir
449
+
450
+ iw = IndexWriter.new(:dir => dir,
451
+ :analyzer => WhiteSpaceAnalyzer.new(),
452
+ :create => true,
453
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS)
454
+ (start...finish).each do |doc_id|
455
+ iw << IndexTestHelper::INDEX_TEST_DOCS[doc_id]
456
+ end
457
+ iw.close()
458
+ end
459
+ @ir = ir_new
460
+ end
461
+
462
+ def teardown()
463
+ @ir.close()
464
+ @dirs.each {|dir| dir.close}
465
+ end
466
+ end
467
+
468
+ class MultiExternalReaderPathTest < Test::Unit::TestCase
469
+ include IndexReaderCommon
470
+
471
+ def ir_new
472
+ IndexReader.new(@paths)
473
+ end
474
+
475
+ def iw_optimize
476
+ @paths.each do |path|
477
+ iw = IndexWriter.new(:path => path, :analyzer => WhiteSpaceAnalyzer.new())
478
+ iw.optimize()
479
+ iw.close()
480
+ end
481
+ end
482
+
483
+ def setup()
484
+ base_dir = File.expand_path(File.join(File.dirname(__FILE__),
485
+ '../../temp/multidir'))
486
+ FileUtils.mkdir_p(base_dir)
487
+ @paths = [
488
+ File.join(base_dir, "i1"),
489
+ File.join(base_dir, "i2"),
490
+ File.join(base_dir, "i3")
491
+ ]
492
+
493
+ [
494
+ [0, 10],
495
+ [10, 30],
496
+ [30, IndexTestHelper::INDEX_TEST_DOCS.size]
497
+ ].each_with_index do |(start, finish), i|
498
+ path = @paths[i]
499
+
500
+ iw = IndexWriter.new(:path => path,
501
+ :analyzer => WhiteSpaceAnalyzer.new(),
502
+ :create => true,
503
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS)
504
+ (start...finish).each do |doc_id|
505
+ iw << IndexTestHelper::INDEX_TEST_DOCS[doc_id]
506
+ end
507
+ iw.close()
508
+ end
509
+ @ir = ir_new
510
+ end
511
+
512
+ def teardown()
410
513
  @ir.close()
411
- @dir.close()
412
514
  end
413
515
  end
414
516
 
@@ -420,7 +522,7 @@ class IndexReaderTest < Test::Unit::TestCase
420
522
  @dir = Ferret::Store::RAMDirectory.new()
421
523
  end
422
524
 
423
- def tear_down()
525
+ def teardown()
424
526
  @dir.close()
425
527
  end
426
528
 
@@ -445,6 +547,7 @@ class IndexReaderTest < Test::Unit::TestCase
445
547
  @dir = Ferret::Store::RAMDirectory.new(@fs_dir)
446
548
  ir = IndexReader.new(@dir)
447
549
  assert_equal(doc, ir.get_document(0).load)
550
+ ir.close
448
551
  end
449
552
 
450
553
  def do_test_term_vectors(ir)