ferret 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/TUTORIAL CHANGED
@@ -81,16 +81,17 @@ phrase "quick brown fox" in the content field. We'd write;
81
81
  end
82
82
 
83
83
  But "fast" has a pretty similar meaning to "quick" and we don't mind if the
84
- fox is a little red. So we could expand our search like this;
84
+ fox is a little red. Also, the phrase could be in the title so we'll search
85
+ there as well. So we could expand our search like this;
85
86
 
86
- index.search_each('content:"quick|fast brown|red fox"') do |doc, score|
87
+ index.search_each('title|content:"quick|fast brown|red fox"') do |doc, score|
87
88
  puts "Document #{doc} found with a score of #{score}"
88
89
  end
89
90
 
90
91
  What if we want to find all documents entered on or after 5th of September,
91
- 2005 with the words "ruby" or "rails" in it. We could type something like;
92
+ 2005 with the words "ruby" or "rails" in any field? We could type something like;
92
93
 
93
- index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |doc, score|
94
+ index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |doc, score|
94
95
  puts "Document #{doc} found with a score of #{score}"
95
96
  end
96
97
 
data/ext/Makefile ADDED
@@ -0,0 +1,140 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ #### Start of system configuration section. ####
5
+
6
+ srcdir = .
7
+ topdir = /usr/lib/ruby/1.8/i486-linux
8
+ hdrdir = $(topdir)
9
+ VPATH = $(srcdir):$(topdir):$(hdrdir)
10
+ prefix = $(DESTDIR)/usr
11
+ exec_prefix = $(prefix)
12
+ sitedir = $(DESTDIR)/usr/local/lib/site_ruby
13
+ rubylibdir = $(libdir)/ruby/$(ruby_version)
14
+ archdir = $(rubylibdir)/$(arch)
15
+ sbindir = $(exec_prefix)/sbin
16
+ datadir = $(prefix)/share
17
+ includedir = $(prefix)/include
18
+ infodir = $(prefix)/info
19
+ sysconfdir = $(DESTDIR)/etc
20
+ mandir = $(datadir)/man
21
+ libdir = $(exec_prefix)/lib
22
+ sharedstatedir = $(prefix)/com
23
+ oldincludedir = $(DESTDIR)/usr/include
24
+ sitearchdir = $(sitelibdir)/$(sitearch)
25
+ bindir = $(exec_prefix)/bin
26
+ localstatedir = $(DESTDIR)/var
27
+ sitelibdir = $(sitedir)/$(ruby_version)
28
+ libexecdir = $(exec_prefix)/libexec
29
+
30
+ CC = gcc
31
+ LIBRUBY = $(LIBRUBY_SO)
32
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
33
+ LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
34
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
35
+
36
+ CFLAGS = -fPIC -Wall -g -O2 -fPIC
37
+ CPPFLAGS = -I. -I$(topdir) -I$(hdrdir) -I$(srcdir)
38
+ CXXFLAGS = $(CFLAGS)
39
+ DLDFLAGS =
40
+ LDSHARED = $(CC) -shared
41
+ AR = ar
42
+ EXEEXT =
43
+
44
+ RUBY_INSTALL_NAME = ruby1.8
45
+ RUBY_SO_NAME = ruby1.8
46
+ arch = i486-linux
47
+ sitearch = i486-linux
48
+ ruby_version = 1.8
49
+ ruby = /usr/bin/ruby1.8
50
+ RUBY = $(ruby)
51
+ RM = rm -f
52
+ MAKEDIRS = mkdir -p
53
+ INSTALL = /usr/bin/install -c
54
+ INSTALL_PROG = $(INSTALL) -m 0755
55
+ INSTALL_DATA = $(INSTALL) -m 644
56
+ COPY = cp
57
+
58
+ #### End of system configuration section. ####
59
+
60
+ preload =
61
+
62
+ libpath = $(libdir)
63
+ LIBPATH = -L"$(libdir)"
64
+ DEFFILE =
65
+
66
+ CLEANFILES =
67
+ DISTCLEANFILES =
68
+
69
+ extout =
70
+ extout_prefix =
71
+ target_prefix =
72
+ LOCAL_LIBS =
73
+ LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lcrypt -lm -lc
74
+ SRCS = index_io.c term_buffer.c ram_directory.c priority_queue.c string_helper.c segment_merge_queue.c ferret.c term.c util.c
75
+ OBJS = index_io.o term_buffer.o ram_directory.o priority_queue.o string_helper.o segment_merge_queue.o ferret.o term.o util.o
76
+ TARGET = ferret_ext
77
+ DLLIB = $(TARGET).so
78
+ STATIC_LIB =
79
+
80
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
81
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
82
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
83
+
84
+ TARGET_SO = $(DLLIB)
85
+ CLEANLIBS = $(TARGET).so $(TARGET).il? $(TARGET).tds $(TARGET).map
86
+ CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak
87
+
88
+ all: $(DLLIB)
89
+ static: $(STATIC_LIB)
90
+
91
+ clean:
92
+ @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
93
+
94
+ distclean: clean
95
+ @-$(RM) Makefile extconf.h conftest.* mkmf.log
96
+ @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
97
+
98
+ realclean: distclean
99
+ install: install-so install-rb
100
+
101
+ install-so: $(RUBYARCHDIR)
102
+ install-so: $(RUBYARCHDIR)/$(DLLIB)
103
+ $(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
104
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
105
+ install-rb: pre-install-rb install-rb-default
106
+ install-rb-default: pre-install-rb-default
107
+ pre-install-rb pre-install-rb-default: $(RUBYLIBDIR)
108
+ $(RUBYARCHDIR):
109
+ $(MAKEDIRS) $@
110
+ $(RUBYLIBDIR):
111
+ $(MAKEDIRS) $@
112
+
113
+ site-install: site-install-so site-install-rb
114
+ site-install-so: install-so
115
+ site-install-rb: install-rb
116
+
117
+ .SUFFIXES: .c .m .cc .cxx .cpp .C .o
118
+
119
+ .cc.o:
120
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
121
+
122
+ .cxx.o:
123
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
124
+
125
+ .cpp.o:
126
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
127
+
128
+ .C.o:
129
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
130
+
131
+ .c.o:
132
+ $(CC) $(CFLAGS) $(CPPFLAGS) -c $<
133
+
134
+ $(DLLIB): $(OBJS)
135
+ @-$(RM) $@
136
+ $(LDSHARED) $(DLDFLAGS) $(LIBPATH) -o $@ $(OBJS) $(LOCAL_LIBS) $(LIBS)
137
+
138
+
139
+
140
+ $(OBJS): ruby.h defines.h
data/ext/ferret_ext.so ADDED
Binary file
data/lib/ferret.rb CHANGED
@@ -22,7 +22,7 @@
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
24
  module Ferret
25
- VERSION = '0.1.4'
25
+ VERSION = '0.2.0'
26
26
  end
27
27
 
28
28
  require 'ferret/utils'
@@ -123,11 +123,14 @@ module Ferret::Document
123
123
 
124
124
  # Sets the data in field +field_name+ to +data+. If there is more than one
125
125
  # field of that name then it will set the data in the first field of that
126
- # name.
126
+ # name. If there is no field of that name, then a new one will be created.
127
127
  def []=(field_name, data)
128
128
  field = field(field_name.to_s)
129
- raise ArgumentError, "Field does not exist" unless field
130
- field.data = data
129
+ if field
130
+ field.data = data
131
+ else
132
+ add_field(Field.new(field_name.to_s, data))
133
+ end
131
134
  end
132
135
 
133
136
  # Returns an array of binaries of the field specified as the method
@@ -82,7 +82,11 @@ module Ferret::Index
82
82
  #
83
83
  def initialize(options = {})
84
84
  super()
85
+
86
+ options[:default_search_field] &&= options[:default_search_field].to_s
87
+ options[:default_field] &&= options[:default_field].to_s
85
88
  options[:create_if_missing] = true if options[:create_if_missing].nil?
89
+
86
90
  if options[:path]
87
91
  @dir = FSDirectory.new(options[:path], options[:create])
88
92
  options[:close_dir] = true
@@ -291,8 +295,25 @@ module Ferret::Index
291
295
  return @reader.delete_docs_with_term(t)
292
296
  elsif id.is_a?(Term)
293
297
  return @reader.delete_docs_with_term(id)
294
- else
298
+ elsif id.is_a?(Integer)
295
299
  return @reader.delete(id)
300
+ else
301
+ raise ArgumentError, "Cannot delete for id of type #{id.class}"
302
+ end
303
+ end
304
+ end
305
+
306
+ # Delete all documents returned by the query.
307
+ #
308
+ # query:: The query to find documents you wish to delete. Can either be a
309
+ # string (in which case it is parsed by the standard query parser)
310
+ # or an actual query object.
311
+ def query_delete(query)
312
+ @dir.synchronize do
313
+ ensure_searcher_open()
314
+ query = process_query(query)
315
+ @searcher.search_each(query) do |doc, score|
316
+ @reader.delete(doc)
296
317
  end
297
318
  end
298
319
  end
@@ -305,6 +326,72 @@ module Ferret::Index
305
326
  end
306
327
  end
307
328
 
329
+ # Update the document referenced by the document number +id+ if +id+ is an
330
+ # integer or all of the documents which have the term +id+ if +id+ is a
331
+ # term.
332
+ #
333
+ # id:: The number of the document to update. Can also be a string
334
+ # representing the value in the +id+ field or a term to match.
335
+ # new_val:: The values we are updating. This can be a string in which case
336
+ # the default field is updated, or it can be a hash, in which
337
+ # case, all fields in the hash are updated. You can also pass a
338
+ # full Document object but you must pass the doc_num as the id.
339
+ def update(id, new_val)
340
+ @dir.synchronize do
341
+ if id.is_a?(String)
342
+ query_update("id:#{id}", new_val)
343
+ elsif id.is_a?(Term)
344
+ query_update(TermQuery.new(id), new_val)
345
+ elsif id.is_a?(Integer)
346
+ ensure_reader_open()
347
+ document = doc(id)
348
+ if new_val.is_a?(Hash)
349
+ new_val.each_pair {|name, content| document[name] = content.to_s}
350
+ elsif new_val.is_a?(Document)
351
+ document = new_val
352
+ else
353
+ document[@options[:default_field]] = new_val.to_s
354
+ end
355
+ @reader.delete(id)
356
+ ensure_writer_open()
357
+ @writer.add_document(document)
358
+ else
359
+ raise ArgumentError, "Cannot update for id of type #{id.class}"
360
+ end
361
+ end
362
+ end
363
+
364
+ # Update all the documents returned by the query.
365
+ #
366
+ # query:: The query to find documents you wish to update. Can either be
367
+ # a string (in which case it is parsed by the standard query
368
+ # parser) or an actual query object.
369
+ # new_val:: The values we are updating. This can be a string in which case
370
+ # the default field is updated, or it can be a hash, in which
371
+ # case, all fields in the hash are updated. If you want to pass
372
+ # a full document see #update.
373
+ def query_update(query, new_val)
374
+ @dir.synchronize do
375
+ ensure_searcher_open()
376
+ docs_to_add = []
377
+ query = process_query(query)
378
+ @searcher.search_each(query) do |id, score|
379
+ document = doc(id)
380
+ if new_val.is_a?(Hash)
381
+ new_val.each_pair {|name, content| document[name] = content.to_s}
382
+ else
383
+ document[@options[:default_field]] = new_val.to_s
384
+ end
385
+ docs_to_add << document
386
+ @reader.delete(id)
387
+ end
388
+ ensure_writer_open()
389
+ docs_to_add.each do |document|
390
+ @writer.add_document(document)
391
+ end
392
+ end
393
+ end
394
+
308
395
  # Returns true if any documents have been deleted since the index was last
309
396
  # flushed.
310
397
  def has_deletions?()
@@ -432,7 +519,13 @@ module Ferret::Index
432
519
 
433
520
  def ensure_reader_open()
434
521
  raise "tried to use a closed index" if not @open
435
- return if @reader
522
+ if @reader
523
+ if not @reader.latest?
524
+ @reader = IndexReader.open(@dir, false)
525
+ end
526
+ return
527
+ end
528
+
436
529
  if @writer
437
530
  @writer.close
438
531
  @writer = nil
@@ -450,6 +543,12 @@ module Ferret::Index
450
543
  private
451
544
  def do_search(query, options)
452
545
  ensure_searcher_open()
546
+ query = process_query(query)
547
+
548
+ return @searcher.search(query, options)
549
+ end
550
+
551
+ def process_query(query)
453
552
  if query.is_a?(String)
454
553
  if @qp.nil?
455
554
  @qp = Ferret::QueryParser.new(@default_search_field, @options)
@@ -458,8 +557,7 @@ module Ferret::Index
458
557
  @qp.fields = @reader.get_field_names.to_a
459
558
  query = @qp.parse(query)
460
559
  end
461
-
462
- return @searcher.search(query, options)
560
+ return query
463
561
  end
464
562
  end
465
563
  end
@@ -343,6 +343,12 @@ module Ferret::Index
343
343
  end
344
344
  end
345
345
  end
346
+
347
+ # Returns true if the reader is reading from the latest version of the
348
+ # index.
349
+ def latest?()
350
+ SegmentInfos.read_current_version(@directory) == @segment_infos.version()
351
+ end
346
352
 
347
353
  # Deletes the document numbered +doc_num+. Once a document is deleted it
348
354
  # will not appear in TermDocEnum or TermPositions enumerations. Attempts to
@@ -5,6 +5,7 @@ class IndexTest < Test::Unit::TestCase
5
5
  include Ferret::Index
6
6
  include Ferret::Analysis
7
7
  include Ferret::Store
8
+ include Ferret::Document
8
9
 
9
10
  def setup()
10
11
  @qp = Ferret::QueryParser.new()
@@ -289,4 +290,105 @@ class IndexTest < Test::Unit::TestCase
289
290
  assert_equal("romeo", index[3]["f"])
290
291
  index.close
291
292
  end
293
+
294
+ def test_auto_update_when_externally_modified()
295
+ fs_path = File.expand_path(File.join(File.dirname(__FILE__), '../../temp/fsdir'))
296
+ index = Index.new(:path => fs_path, :default_field => "f", :create => true)
297
+ index << "document 1"
298
+ assert_equal(1, index.size)
299
+
300
+ index2 = Index.new(:path => fs_path, :default_field => "f")
301
+ assert_equal(1, index2.size)
302
+ index2 << "document 2"
303
+ assert_equal(2, index2.size)
304
+ assert_equal(2, index.size)
305
+
306
+ iw = IndexWriter.new(fs_path, :analyzer => WhiteSpaceAnalyzer.new())
307
+ doc = Document.new
308
+ doc << Field.new("f", "content3", Field::Store::YES, Field::Index::TOKENIZED)
309
+ iw << doc
310
+ iw.close()
311
+ assert_equal(3, index.size)
312
+ assert_equal("content3", index[2]["f"])
313
+ end
314
+
315
+ def test_delete
316
+ data = [
317
+ {:id => 0, :cat => "/cat1/subcat1"},
318
+ {:id => 1, :cat => "/cat1/subcat2"},
319
+ {:id => 2, :cat => "/cat1/subcat2"},
320
+ {:id => 3, :cat => "/cat1/subcat3"},
321
+ {:id => 4, :cat => "/cat1/subcat4"},
322
+ {:id => 5, :cat => "/cat2/subcat1"},
323
+ {:id => 6, :cat => "/cat2/subcat2"},
324
+ {:id => 7, :cat => "/cat2/subcat3"},
325
+ {:id => 8, :cat => "/cat2/subcat4"},
326
+ {:id => 9, :cat => "/cat2/subcat5"},
327
+ ]
328
+ index = Index.new(:analyzer => WhiteSpaceAnalyzer.new)
329
+ data.each {|doc| index << doc }
330
+ assert_equal(10, index.size)
331
+ assert_equal(1, index.search("id:9").size)
332
+ index.delete(9)
333
+ assert_equal(9, index.size)
334
+ assert_equal(0, index.search("id:9").size)
335
+ assert_equal(1, index.search("id:8").size)
336
+ index.delete("8")
337
+ assert_equal(8, index.size)
338
+ assert_equal(0, index.search("id:8").size)
339
+ assert_equal(5, index.search("cat:/cat1*").size)
340
+ index.query_delete("cat:/cat1*")
341
+ assert_equal(3, index.size)
342
+ assert_equal(0, index.search("cat:/cat1*").size)
343
+ end
344
+
345
+ def test_update
346
+ data = [
347
+ {:id => 0, :cat => "/cat1/subcat1", :content => "content0"},
348
+ {:id => 1, :cat => "/cat1/subcat2", :content => "content1"},
349
+ {:id => 2, :cat => "/cat1/subcat2", :content => "content2"},
350
+ {:id => 3, :cat => "/cat1/subcat3", :content => "content3"},
351
+ {:id => 4, :cat => "/cat1/subcat4", :content => "content4"},
352
+ {:id => 5, :cat => "/cat2/subcat1", :content => "content5"},
353
+ {:id => 6, :cat => "/cat2/subcat2", :content => "content6"},
354
+ {:id => 7, :cat => "/cat2/subcat3", :content => "content7"},
355
+ {:id => 8, :cat => "/cat2/subcat4", :content => "content8"},
356
+ {:id => 9, :cat => "/cat2/subcat5", :content => "content9"},
357
+ ]
358
+ index = Index.new(:analyzer => WhiteSpaceAnalyzer.new,
359
+ :default_field => :content)
360
+ data.each { |doc| index << doc }
361
+ assert_equal(10, index.size)
362
+ assert_equal("content5", index["5"][:content])
363
+ index.update(5, "content five")
364
+ assert_equal("content five", index["5"][:content])
365
+ assert_equal(nil, index["5"][:extra_content])
366
+ index.update("5", {:cat => "/cat1/subcat6",
367
+ :content => "high five",
368
+ :extra_content => "hello"})
369
+ assert_equal("hello", index["5"][:extra_content])
370
+ assert_equal("high five", index["5"][:content])
371
+ assert_equal("/cat1/subcat6", index["5"][:cat])
372
+ assert_equal("content9", index["9"][:content])
373
+ index.update(Term.new("content", "content9"), {:content => "content nine"})
374
+ assert_equal("content nine", index["9"][:content])
375
+ assert_equal("content0", index["0"][:content])
376
+ assert_equal(nil, index["0"][:extra_content])
377
+ document = index[0]
378
+ document[:content] = "content zero"
379
+ document[:extra_content] = "extra content"
380
+ index.update(0, document)
381
+ assert_equal("content zero", index["0"][:content])
382
+ assert_equal("extra content", index["0"][:extra_content])
383
+ assert_equal(nil, index["1"][:tag])
384
+ assert_equal(nil, index["2"][:tag])
385
+ assert_equal(nil, index["3"][:tag])
386
+ assert_equal(nil, index["4"][:tag])
387
+ index.query_update("id:<5 AND cat:>=/cat1/subcat2", {:tag => "cool"})
388
+ assert_equal("cool", index["1"][:tag])
389
+ assert_equal("cool", index["2"][:tag])
390
+ assert_equal("cool", index["3"][:tag])
391
+ assert_equal("cool", index["4"][:tag])
392
+ assert_equal(4, index.search("tag:cool").size)
393
+ end
292
394
  end
@@ -417,7 +417,6 @@ module IndexReaderCommon
417
417
  ir3.close()
418
418
  end
419
419
 
420
-
421
420
  end
422
421
 
423
422
  class SegmentReaderTest < Test::Unit::TestCase
@@ -618,5 +617,33 @@ class IndexReaderTest < Test::Unit::TestCase
618
617
  ir.close()
619
618
  fs_dir.close()
620
619
  end
620
+
621
+ def test_latest()
622
+ dpath = File.join(File.dirname(__FILE__),
623
+ '../../temp/fsdir')
624
+ fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
625
+
626
+ iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
627
+ doc = Document.new
628
+ doc << Field.new("field", "content", Field::Store::YES, Field::Index::TOKENIZED)
629
+ iw << doc
630
+ iw.close()
631
+
632
+ ir = IndexReader.open(fs_dir, false)
633
+ assert(ir.latest?)
634
+
635
+ iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
636
+ doc = Document.new
637
+ doc << Field.new("field", "content2", Field::Store::YES, Field::Index::TOKENIZED)
638
+ iw << doc
639
+ iw.close()
640
+
641
+ assert(!ir.latest?)
642
+
643
+ ir.close()
644
+ ir = IndexReader.open(fs_dir, false)
645
+ assert(ir.latest?)
646
+ ir.close()
647
+ end
621
648
  end
622
649
 
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: ferret
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.4
7
- date: 2005-11-01 00:00:00 +09:00
6
+ version: 0.2.0
7
+ date: 2005-11-12 00:00:00 +09:00
8
8
  summary: Ruby indexing library.
9
9
  require_paths:
10
10
  - lib
@@ -35,6 +35,7 @@ files:
35
35
  - MIT-LICENSE
36
36
  - TODO
37
37
  - TUTORIAL
38
+ - ext/Makefile
38
39
  - ext/index_io.c
39
40
  - ext/term_buffer.c
40
41
  - ext/ram_directory.c
@@ -47,6 +48,7 @@ files:
47
48
  - ext/ferret.h
48
49
  - ext/util.c
49
50
  - ext/tags
51
+ - ext/ferret_ext.so
50
52
  - ext/dummy.exe
51
53
  - lib/ferret.rb
52
54
  - lib/ferret/analysis.rb