ferret 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. data/Rakefile +7 -1
  2. data/ext/analysis.c +21 -13
  3. data/ext/array.c +1 -1
  4. data/ext/bitvector.c +2 -2
  5. data/ext/defines.h +0 -6
  6. data/ext/except.c +6 -6
  7. data/ext/except.h +12 -8
  8. data/ext/extconf.rb +1 -0
  9. data/ext/ferret.c +4 -0
  10. data/ext/ferret.h +1 -0
  11. data/ext/fs_store.c +18 -4
  12. data/ext/global.c +18 -16
  13. data/ext/global.h +7 -2
  14. data/ext/hash.c +1 -1
  15. data/ext/helper.c +1 -1
  16. data/ext/helper.h +1 -1
  17. data/ext/inc/lang.h +7 -1
  18. data/ext/ind.c +4 -4
  19. data/ext/ind.h +3 -3
  20. data/ext/index.c +33 -26
  21. data/ext/index.h +1 -1
  22. data/ext/lang.h +7 -1
  23. data/ext/mem_pool.c +1 -1
  24. data/ext/mem_pool.h +1 -1
  25. data/ext/q_fuzzy.c +2 -2
  26. data/ext/q_match_all.c +2 -2
  27. data/ext/q_multi_term.c +1 -1
  28. data/ext/q_parser.c +60 -52
  29. data/ext/r_analysis.c +6 -4
  30. data/ext/r_index.c +57 -4
  31. data/ext/r_search.c +1 -1
  32. data/ext/r_utils.c +1 -1
  33. data/ext/ram_store.c +1 -1
  34. data/ext/search.c +4 -4
  35. data/ext/sort.c +3 -3
  36. data/ext/store.c +9 -9
  37. data/ext/store.h +4 -4
  38. data/ext/tags +7841 -0
  39. data/ext/term_vectors.c +3 -3
  40. data/lib/ferret/index.rb +69 -7
  41. data/test/test_helper.rb +3 -2
  42. data/test/unit/analysis/tc_token_stream.rb +1 -0
  43. data/test/unit/index/tc_index.rb +157 -2
  44. data/test/unit/index/tc_index_reader.rb +108 -5
  45. data/test/unit/query_parser/tc_query_parser.rb +2 -1
  46. data/test/unit/search/tc_index_searcher.rb +1 -1
  47. data/test/unit/search/tc_multi_searcher.rb +2 -1
  48. data/test/unit/search/tc_spans.rb +1 -1
  49. data/test/unit/store/tc_fs_store.rb +6 -3
  50. data/test/unit/ts_analysis.rb +1 -1
  51. data/test/unit/ts_utils.rb +1 -1
  52. data/test/unit/utils/tc_number_tools.rb +1 -1
  53. metadata +138 -137
data/ext/term_vectors.c CHANGED
@@ -250,10 +250,10 @@ TermVectorsWriter *tvw_open(Store *store, const char *segment, FieldInfos *fis)
250
250
  tvw->fis = fis;
251
251
  tvw->fields = ary_new_type_capa(TVField, TV_FIELD_INIT_CAPA);
252
252
 
253
- sprintf(file_name, "%s.tvx", segment);
253
+ snprintf(file_name, SEGMENT_NAME_MAX_LENGTH, "%s.tvx", segment);
254
254
  tvw->tvx_out = store->new_output(store, file_name);
255
255
 
256
- sprintf(file_name, "%s.tvd", segment);
256
+ snprintf(file_name, SEGMENT_NAME_MAX_LENGTH, "%s.tvd", segment);
257
257
  tvw->tvd_out = store->new_output(store, file_name);
258
258
 
259
259
  return tvw;
@@ -335,7 +335,7 @@ void tvw_add_postings(TermVectorsWriter *tvw,
335
335
 
336
336
  if (fi_store_offsets(fi)) {
337
337
  /* use delta encoding for offsets */
338
- int last_end = 0;
338
+ int last_end = 0;
339
339
  os_write_vint(tvd_out, offset_count); /* write shared prefix length */
340
340
  for (i = 0; i < offset_count; i++) {
341
341
  int start = offsets[i].start;
data/lib/ferret/index.rb CHANGED
@@ -10,6 +10,7 @@ module Ferret::Index
10
10
  include Ferret::Search
11
11
 
12
12
  attr_reader :options
13
+
13
14
  # If you create an Index without any options, it'll simply create an index
14
15
  # in memory. But this class is highly configurable and every option that
15
16
  # you can supply to IndexWriter and QueryParser, you can also set here.
@@ -52,6 +53,10 @@ module Ferret::Index
52
53
  # concerned about performance. In that case you
53
54
  # should think about setting up a DRb indexing
54
55
  # service.
56
+ # lock_retry_time:: Default: 2 seconds. This parameter specifies how
57
+ # long to wait before retrying to obtain the
58
+ # commit lock when detecting if the IndexReader is
59
+ # at the latest version.
55
60
  #
56
61
  # Some examples;
57
62
  #
@@ -64,8 +69,14 @@ module Ferret::Index
64
69
  # index = Index::Index.new(:dir => directory,
65
70
  # :default_slop => 2,
66
71
  # :handle_parse_errors => false)
67
- #
68
- def initialize(options = {})
72
+ #
73
+ # You can also pass a block if you like. The index will be yielded and
74
+ # closed at the end of the block. For example;
75
+ #
76
+ # Ferret::I.new() do |index|
77
+ # # do stuff with index. Most of your actions will be cached.
78
+ # end
79
+ def initialize(options = {}, &block)
69
80
  super()
70
81
 
71
82
  if options[:key]
@@ -92,14 +103,19 @@ module Ferret::Index
92
103
  end
93
104
 
94
105
  options[:dir] = @dir
106
+ options[:lock_retry_time]||= 2
95
107
  @dir.extend(MonitorMixin)
96
108
  @dir.synchronize do
97
109
  @options = options
98
- @writer = IndexWriter.new(options) # create the index if need be
99
- options[:analyzer] = @analyzer = @writer.analyzer
100
- @writer.close
110
+ if (!@dir.exists?("segments")) || options[:create]
111
+ IndexWriter.new(options).close
112
+ end
113
+ options[:analyzer]||= Ferret::Analysis::StandardAnalyzer.new
114
+
115
+ @searcher = nil
101
116
  @writer = nil
102
117
  @reader = nil
118
+
103
119
  @options.delete(:create) # only want to create the first time if at all
104
120
  @auto_flush = @options[:auto_flush] || false
105
121
  if (@options[:id_field].nil? and
@@ -117,13 +133,51 @@ module Ferret::Index
117
133
  @open = true
118
134
  @qp = nil
119
135
  end
136
+ if block
137
+ yield self
138
+ self.close
139
+ end
140
+ end
141
+
142
+ # Returns an array of strings with the matches highlighted. The +query+ can
143
+ # be either a query String or a Ferret::Search::Query object. The doc_id is
144
+ # the id of the document you want to highlight (usually returned by the
145
+ # search methods). There are also a number of options you can pass;
146
+ #
147
+ # === Options
148
+ #
149
+ # :field:: Default: @options[:default_field]. The default_field
150
+ # is the field that is usually highlighted but you can
151
+ # specify which field you want to highlight here. If
152
+ # you want to highlight multiple fields then you will
153
+ # need to call this method multiple times.
154
+ # :excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
155
+ # terms will be in the centre of the excerpt.
156
+ # :num_excerpts:: Default: 2. Number of excerpts to return.
157
+ # :pre_tag:: Default: "<b>". Tag to place to the left of the
158
+ # match. You'll probably want to change this to a
159
+ # "<span>" tag with a class. Try "\033[7m" for use in a
160
+ # terminal.
161
+ # :post_tag:: Default: "</b>". This tag should close the
162
+ # +:pre_tag+. Try tag "\033[m" in the terminal.
163
+ # :ellipsis:: Default: "...". This is the string that is appended
164
+ # at the beginning and end of excerpts (unless the
165
+ # excerpt hits the start or end of the field). You'll
166
+ # probably want to change this to a Unicode ellipsis
167
+ # character.
168
+ def highlight(query, doc_id, options = {})
169
+ ensure_searcher_open()
170
+ @searcher.highlight(process_query(query),
171
+ doc_id,
172
+ options[:field]||@options[:default_field],
173
+ options)
120
174
  end
121
175
 
122
176
  # Closes this index by closing its associated reader and writer objects.
123
177
  def close
124
178
  @dir.synchronize do
125
179
  if not @open
126
- raise "tried to close an already closed directory"
180
+ raise(StandardError, "tried to close an already closed directory")
127
181
  end
128
182
  @searcher.close() if @searcher
129
183
  @reader.close() if @reader
@@ -534,7 +588,15 @@ module Ferret::Index
534
588
  def ensure_reader_open()
535
589
  raise "tried to use a closed index" if not @open
536
590
  if @reader
537
- if not @reader.latest?
591
+ latest = false
592
+ begin
593
+ latest = @reader.latest?
594
+ rescue LockException => le
595
+ sleep(@options[:lock_retry_time]) # sleep for 2 seconds and try again
596
+ latest = @reader.latest?
597
+ end
598
+ if not latest
599
+ @reader.close
538
600
  return @reader = IndexReader.new(@dir)
539
601
  end
540
602
  else
data/test/test_helper.rb CHANGED
@@ -3,9 +3,10 @@ $:.unshift File.join(File.dirname(__FILE__), '../lib')
3
3
  $:.unshift File.join(File.dirname(__FILE__), '../ext')
4
4
 
5
5
  class Float
6
- def =~(o)
7
- return (1 - self/o).abs < 0.00001
6
+ def approx_eql?(o)
7
+ return (1 - self/o).abs < 0.0001
8
8
  end
9
+ alias :=~ :approx_eql?
9
10
  end
10
11
 
11
12
  require 'test/unit'
@@ -1,5 +1,6 @@
1
1
  require File.dirname(__FILE__) + "/../../test_helper"
2
2
 
3
+ puts "Loading once"
3
4
  class TokenTest < Test::Unit::TestCase
4
5
  include Ferret::Analysis
5
6
  def test_token
@@ -9,7 +9,7 @@ class IndexTest < Test::Unit::TestCase
9
9
  def setup()
10
10
  end
11
11
 
12
- def tear_down()
12
+ def teardown()
13
13
  end
14
14
 
15
15
  def check_results(index, query, expected)
@@ -345,9 +345,10 @@ class IndexTest < Test::Unit::TestCase
345
345
  assert_equal(2, index2.size)
346
346
  assert_equal(2, index.size)
347
347
  top_docs = index.search("content3")
348
+
348
349
  assert_equal(0, top_docs.hits.size)
349
350
 
350
- iw = IndexWriter.new(:path => fs_path, :analyzer => WhiteSpaceAnalyzer.new())
351
+ iw = IndexWriter.new(:path => fs_path, :analyzer => WhiteSpaceAnalyzer.new)
351
352
  iw << {:f, "content3"}
352
353
  iw.close()
353
354
 
@@ -355,6 +356,7 @@ class IndexTest < Test::Unit::TestCase
355
356
  assert_equal(1, top_docs.hits.size)
356
357
  assert_equal(3, index.size)
357
358
  assert_equal("content3", index[2][:f])
359
+ index2.close
358
360
  index.close
359
361
  end
360
362
 
@@ -556,6 +558,7 @@ class IndexTest < Test::Unit::TestCase
556
558
 
557
559
  data = %q(one two three four five six seven eight nine ten eleven twelve)
558
560
  index1 = Index.new(:path => fs_path, :auto_flush => true, :key => :id)
561
+ index1 << "zero"
559
562
  index2 = Index.new(:path => fs_path, :auto_flush => true)
560
563
  begin
561
564
  data.each do |datum|
@@ -611,4 +614,156 @@ class IndexTest < Test::Unit::TestCase
611
614
  hits = i.search 'move or shake'
612
615
  assert_equal 1, hits.total_hits # fails when id field is present
613
616
  end
617
+
618
+ def test_threading
619
+ path = File.expand_path(File.join(File.dirname(__FILE__), '../../temp/fsdir'))
620
+ index = Ferret::Index::Index.new(:path => path, :create => true)
621
+
622
+ 100.times do |i|
623
+ buf = ''
624
+ doc = {}
625
+ doc[:id] = i
626
+ doc[:foo] = "foo #{i}"
627
+ index << doc
628
+ end
629
+
630
+ threads = []
631
+
632
+ 4.times do
633
+ threads << Thread.new(index) do |index|
634
+ result = index.search('id:42')
635
+ assert_equal(1, result.total_hits)
636
+ end
637
+ end
638
+
639
+ threads.each{|t| t.join }
640
+ end
641
+
642
+ def test_wildcard
643
+ i = nil
644
+ Ferret::I.new do |i|
645
+ i << "one"
646
+ assert_equal(1, i.search("*").total_hits)
647
+ i << "two"
648
+ assert_equal(2, i.search("*").total_hits)
649
+ i << {:content => "three"}
650
+ assert_equal(3, i.search("*").total_hits)
651
+ assert_equal(3, i.search("id:*").total_hits)
652
+ assert_equal(2, i.search('id:?*').total_hits)
653
+ end
654
+ assert_raise(StandardError) {i.close}
655
+ end
656
+
657
+ def test_highlighter()
658
+ index = Ferret::I.new(:default_field => :field,
659
+ :default_input_field => :field,
660
+ :analyzer => Ferret::Analysis::WhiteSpaceAnalyzer.new)
661
+ [
662
+ "the words we are searching for are one and two also " +
663
+ "sometimes looking for them as a phrase like this; one " +
664
+ "two lets see how it goes"
665
+ ].each {|doc| index << doc }
666
+
667
+ highlights = index.highlight("one", 0,
668
+ :excerpt_length => 10,
669
+ :num_excerpts => 1)
670
+
671
+ assert_equal(1, highlights.size)
672
+ assert_equal("...are <b>one</b>...", highlights[0])
673
+
674
+ highlights = index.highlight("one", 0,
675
+ :excerpt_length => 10,
676
+ :num_excerpts => 2)
677
+ assert_equal(2, highlights.size)
678
+ assert_equal("...are <b>one</b>...", highlights[0])
679
+ assert_equal("...this; <b>one</b>...", highlights[1])
680
+
681
+ highlights = index.highlight("one", 0,
682
+ :excerpt_length => 10,
683
+ :num_excerpts => 3)
684
+ assert_equal(3, highlights.size)
685
+ assert_equal("the words...", highlights[0])
686
+ assert_equal("...are <b>one</b>...", highlights[1])
687
+ assert_equal("...this; <b>one</b>...", highlights[2])
688
+
689
+ highlights = index.highlight("one", 0,
690
+ :excerpt_length => 10,
691
+ :num_excerpts => 4)
692
+ assert_equal(3, highlights.size)
693
+ assert_equal("the words we are...", highlights[0])
694
+ assert_equal("...are <b>one</b>...", highlights[1])
695
+ assert_equal("...this; <b>one</b>...", highlights[2])
696
+
697
+ highlights = index.highlight("one", 0,
698
+ :excerpt_length => 10,
699
+ :num_excerpts => 5)
700
+ assert_equal(2, highlights.size)
701
+ assert_equal("the words we are searching for are <b>one</b>...", highlights[0])
702
+ assert_equal("...this; <b>one</b>...", highlights[1])
703
+
704
+ highlights = index.highlight("one", 0,
705
+ :excerpt_length => 10,
706
+ :num_excerpts => 20)
707
+ assert_equal(1, highlights.size)
708
+ assert_equal("the words we are searching for are <b>one</b> and two also " +
709
+ "sometimes looking for them as a phrase like this; <b>one</b> " +
710
+ "two lets see how it goes", highlights[0])
711
+
712
+ highlights = index.highlight("one", 0,
713
+ :excerpt_length => 1000,
714
+ :num_excerpts => 1)
715
+ assert_equal(1, highlights.size)
716
+ assert_equal("the words we are searching for are <b>one</b> and two also " +
717
+ "sometimes looking for them as a phrase like this; <b>one</b> " +
718
+ "two lets see how it goes", highlights[0])
719
+
720
+ highlights = index.highlight("(one two)", 0,
721
+ :excerpt_length => 15,
722
+ :num_excerpts => 2)
723
+ assert_equal(2, highlights.size)
724
+ assert_equal("...<b>one</b> and <b>two</b>...", highlights[0])
725
+ assert_equal("...this; <b>one</b> <b>two</b>...", highlights[1])
726
+
727
+ highlights = index.highlight('one two "one two"', 0,
728
+ :excerpt_length => 15,
729
+ :num_excerpts => 2)
730
+ assert_equal(2, highlights.size)
731
+ assert_equal("...<b>one</b> and <b>two</b>...", highlights[0])
732
+ assert_equal("...this; <b>one two</b>...", highlights[1])
733
+
734
+ highlights = index.highlight('"one two"', 0,
735
+ :excerpt_length => 15,
736
+ :num_excerpts => 1)
737
+ assert_equal(1, highlights.size)
738
+ # should have a higher priority since it is the merger of three matches
739
+ assert_equal("...this; <b>one two</b>...", highlights[0])
740
+
741
+ highlights = index.highlight('"one two"', 0, :field => :not_a_field,
742
+ :excerpt_length => 15,
743
+ :num_excerpts => 1)
744
+ assert_nil(highlights)
745
+
746
+ highlights = index.highlight("wrong_field:one", 0, :field => :wrong_field,
747
+ :excerpt_length => 15,
748
+ :num_excerpts => 1)
749
+ assert_nil(highlights)
750
+
751
+ highlights = index.highlight('"the words" "for are one and two" ' +
752
+ 'words one two', 0,
753
+ :excerpt_length => 10,
754
+ :num_excerpts => 1)
755
+ assert_equal(1, highlights.size)
756
+ assert_equal("...<b>for are one and two</b>...", highlights[0])
757
+
758
+ highlights = index.highlight('"the words" "for are one and two" ' +
759
+ 'words one two', 0,
760
+ :excerpt_length => 10,
761
+ :num_excerpts => 2)
762
+ assert_equal(2, highlights.size)
763
+ assert_equal("<b>the words</b>...", highlights[0])
764
+ assert_equal("...<b>for are one and two</b>...", highlights[1])
765
+
766
+
767
+ index.close
768
+ end
614
769
  end
@@ -63,6 +63,13 @@ module IndexReaderCommon
63
63
  assert_equal(1, te.doc_freq)
64
64
  assert(!te.next?)
65
65
 
66
+ expected = %w{is 1 more 1 not 1 skip 42 stored 1 text 1 which 1}
67
+ te = @ir.terms(:text)
68
+ te.each do |term, doc_freq|
69
+ assert_equal(expected.shift, term)
70
+ assert_equal(expected.shift.to_i, doc_freq)
71
+ end
72
+
66
73
  te = @ir.terms_from(:body, "Not")
67
74
  assert_equal("Not", te.term)
68
75
  assert_equal(1, te.doc_freq)
@@ -177,7 +184,7 @@ module IndexReaderCommon
177
184
 
178
185
  def do_test_get_doc()
179
186
  doc = @ir.get_document(3)
180
- assert_equal([:year, :body, :title, :author], doc.fields)
187
+ [:author, :body, :title, :year].each {|fn| assert(doc.fields.include?(fn))}
181
188
  assert_equal(4, doc.fields.size)
182
189
  assert_equal(0, doc.size)
183
190
  assert_equal([], doc.keys)
@@ -296,6 +303,7 @@ module IndexReaderCommon
296
303
  assert_equal(doc_count, ir2.max_doc())
297
304
  assert_equal(doc_count, ir2.num_docs())
298
305
 
306
+ ir2.close
299
307
  ir2 = ir_new()
300
308
  assert(ir2.has_deletions?())
301
309
  assert_equal(doc_count, ir2.max_doc())
@@ -325,6 +333,7 @@ module IndexReaderCommon
325
333
  assert_equal(doc_count - 6, ir3.max_doc())
326
334
  assert_equal(doc_count - 6, ir3.num_docs())
327
335
 
336
+ ir2.close()
328
337
  ir3.close()
329
338
  end
330
339
  end
@@ -358,7 +367,7 @@ class MultiReaderTest < Test::Unit::TestCase
358
367
  @ir = ir_new()
359
368
  end
360
369
 
361
- def tear_down()
370
+ def teardown()
362
371
  @ir.close()
363
372
  @dir.close()
364
373
  end
@@ -406,9 +415,102 @@ class MultiExternalReaderTest < Test::Unit::TestCase
406
415
  @ir = ir_new
407
416
  end
408
417
 
409
- def tear_down()
418
+ def teardown()
419
+ @ir.close()
420
+ @dirs.each {|dir| dir.close}
421
+ end
422
+ end
423
+
424
+ class MultiExternalReaderDirTest < Test::Unit::TestCase
425
+ include IndexReaderCommon
426
+
427
+ def ir_new
428
+ IndexReader.new(@dirs)
429
+ end
430
+
431
+ def iw_optimize
432
+ @dirs.each do |dir|
433
+ iw = IndexWriter.new(:dir => dir, :analyzer => WhiteSpaceAnalyzer.new())
434
+ iw.optimize()
435
+ iw.close()
436
+ end
437
+ end
438
+
439
+ def setup()
440
+ @dirs = []
441
+
442
+ [
443
+ [0, 10],
444
+ [10, 30],
445
+ [30, IndexTestHelper::INDEX_TEST_DOCS.size]
446
+ ].each do |start, finish|
447
+ dir = Ferret::Store::RAMDirectory.new()
448
+ @dirs << dir
449
+
450
+ iw = IndexWriter.new(:dir => dir,
451
+ :analyzer => WhiteSpaceAnalyzer.new(),
452
+ :create => true,
453
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS)
454
+ (start...finish).each do |doc_id|
455
+ iw << IndexTestHelper::INDEX_TEST_DOCS[doc_id]
456
+ end
457
+ iw.close()
458
+ end
459
+ @ir = ir_new
460
+ end
461
+
462
+ def teardown()
463
+ @ir.close()
464
+ @dirs.each {|dir| dir.close}
465
+ end
466
+ end
467
+
468
+ class MultiExternalReaderPathTest < Test::Unit::TestCase
469
+ include IndexReaderCommon
470
+
471
+ def ir_new
472
+ IndexReader.new(@paths)
473
+ end
474
+
475
+ def iw_optimize
476
+ @paths.each do |path|
477
+ iw = IndexWriter.new(:path => path, :analyzer => WhiteSpaceAnalyzer.new())
478
+ iw.optimize()
479
+ iw.close()
480
+ end
481
+ end
482
+
483
+ def setup()
484
+ base_dir = File.expand_path(File.join(File.dirname(__FILE__),
485
+ '../../temp/multidir'))
486
+ FileUtils.mkdir_p(base_dir)
487
+ @paths = [
488
+ File.join(base_dir, "i1"),
489
+ File.join(base_dir, "i2"),
490
+ File.join(base_dir, "i3")
491
+ ]
492
+
493
+ [
494
+ [0, 10],
495
+ [10, 30],
496
+ [30, IndexTestHelper::INDEX_TEST_DOCS.size]
497
+ ].each_with_index do |(start, finish), i|
498
+ path = @paths[i]
499
+
500
+ iw = IndexWriter.new(:path => path,
501
+ :analyzer => WhiteSpaceAnalyzer.new(),
502
+ :create => true,
503
+ :field_infos => IndexTestHelper::INDEX_TEST_FIS)
504
+ (start...finish).each do |doc_id|
505
+ iw << IndexTestHelper::INDEX_TEST_DOCS[doc_id]
506
+ end
507
+ iw.close()
508
+ end
509
+ @ir = ir_new
510
+ end
511
+
512
+ def teardown()
410
513
  @ir.close()
411
- @dir.close()
412
514
  end
413
515
  end
414
516
 
@@ -420,7 +522,7 @@ class IndexReaderTest < Test::Unit::TestCase
420
522
  @dir = Ferret::Store::RAMDirectory.new()
421
523
  end
422
524
 
423
- def tear_down()
525
+ def teardown()
424
526
  @dir.close()
425
527
  end
426
528
 
@@ -445,6 +547,7 @@ class IndexReaderTest < Test::Unit::TestCase
445
547
  @dir = Ferret::Store::RAMDirectory.new(@fs_dir)
446
548
  ir = IndexReader.new(@dir)
447
549
  assert_equal(doc, ir.get_document(0).load)
550
+ ir.close
448
551
  end
449
552
 
450
553
  def do_test_term_vectors(ir)