RubyGems - ferret - Versions diffs - 0.1.4 → 0.2.0 - Mend

ferret 0.1.4 → 0.2.0

Files changed (10) hide show

data/TUTORIAL +5 -4
data/ext/Makefile +140 -0
data/ext/ferret_ext.so +0 -0
data/lib/ferret.rb +1 -1
data/lib/ferret/document/document.rb +6 -3
data/lib/ferret/index/index.rb +102 -4
data/lib/ferret/index/index_reader.rb +6 -0
data/test/unit/index/tc_index.rb +102 -0
data/test/unit/index/tc_index_reader.rb +28 -1
metadata +4 -2

data/TUTORIAL CHANGED Viewed

@@ -81,16 +81,17 @@ phrase "quick brown fox" in the content field. We'd write;
   end
 But "fast" has a pretty similar meaning to "quick" and we don't mind if the
-fox is a little red. So we could expand our search like this;
+fox is a little red. Also, the phrase could be in the title so we'll search
+there as well. So we could expand our search like this;
-  index.search_each('content:"quick|fast brown|red fox"') do |doc, score|
+  index.search_each('title|content:"quick|fast brown|red fox"') do |doc, score|
     puts "Document #{doc} found with a score of #{score}"
   end
 What if we want to find all documents entered on or after 5th of September,
-2005 with the words "ruby" or "rails" in it. We could type something like;
+2005 with the words "ruby" or "rails" in any field? We could type something like;
-  index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |doc, score|
+  index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |doc, score|
     puts "Document #{doc} found with a score of #{score}"
   end

data/ext/Makefile ADDED Viewed

@@ -0,0 +1,140 @@
+SHELL = /bin/sh
+#### Start of system configuration section. ####
+srcdir = .
+topdir = /usr/lib/ruby/1.8/i486-linux
+hdrdir = $(topdir)
+VPATH = $(srcdir):$(topdir):$(hdrdir)
+prefix = $(DESTDIR)/usr
+exec_prefix = $(prefix)
+sitedir = $(DESTDIR)/usr/local/lib/site_ruby
+rubylibdir = $(libdir)/ruby/$(ruby_version)
+archdir = $(rubylibdir)/$(arch)
+sbindir = $(exec_prefix)/sbin
+datadir = $(prefix)/share
+includedir = $(prefix)/include
+infodir = $(prefix)/info
+sysconfdir = $(DESTDIR)/etc
+mandir = $(datadir)/man
+libdir = $(exec_prefix)/lib
+sharedstatedir = $(prefix)/com
+oldincludedir = $(DESTDIR)/usr/include
+sitearchdir = $(sitelibdir)/$(sitearch)
+bindir = $(exec_prefix)/bin
+localstatedir = $(DESTDIR)/var
+sitelibdir = $(sitedir)/$(ruby_version)
+libexecdir = $(exec_prefix)/libexec
+CC = gcc
+LIBRUBY = $(LIBRUBY_SO)
+LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
+LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
+LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
+CFLAGS   =  -fPIC -Wall -g -O2  -fPIC
+CPPFLAGS = -I. -I$(topdir) -I$(hdrdir) -I$(srcdir)
+CXXFLAGS = $(CFLAGS)
+DLDFLAGS =
+LDSHARED = $(CC) -shared
+AR = ar
+EXEEXT =
+RUBY_INSTALL_NAME = ruby1.8
+RUBY_SO_NAME = ruby1.8
+arch = i486-linux
+sitearch = i486-linux
+ruby_version = 1.8
+ruby = /usr/bin/ruby1.8
+RUBY = $(ruby)
+RM = rm -f
+MAKEDIRS = mkdir -p
+INSTALL = /usr/bin/install -c
+INSTALL_PROG = $(INSTALL) -m 0755
+INSTALL_DATA = $(INSTALL) -m 644
+COPY = cp
+#### End of system configuration section. ####
+preload =
+libpath = $(libdir)
+LIBPATH =  -L"$(libdir)"
+DEFFILE =
+CLEANFILES =
+DISTCLEANFILES =
+extout =
+extout_prefix =
+target_prefix =
+LOCAL_LIBS =
+LIBS = $(LIBRUBYARG_SHARED)  -lpthread -ldl -lcrypt -lm   -lc
+SRCS = index_io.c term_buffer.c ram_directory.c priority_queue.c string_helper.c segment_merge_queue.c ferret.c term.c util.c
+OBJS = index_io.o term_buffer.o ram_directory.o priority_queue.o string_helper.o segment_merge_queue.o ferret.o term.o util.o
+TARGET = ferret_ext
+DLLIB = $(TARGET).so
+STATIC_LIB =
+RUBYCOMMONDIR = $(sitedir)$(target_prefix)
+RUBYLIBDIR    = $(sitelibdir)$(target_prefix)
+RUBYARCHDIR   = $(sitearchdir)$(target_prefix)
+TARGET_SO     = $(DLLIB)
+CLEANLIBS     = $(TARGET).so $(TARGET).il? $(TARGET).tds $(TARGET).map
+CLEANOBJS     = *.o *.a *.s[ol] *.pdb *.exp *.bak
+all:		$(DLLIB)
+static:		$(STATIC_LIB)
+clean:
+		@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
+distclean:	clean
+		@-$(RM) Makefile extconf.h conftest.* mkmf.log
+		@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
+realclean:	distclean
+install: install-so install-rb
+install-so: $(RUBYARCHDIR)
+install-so: $(RUBYARCHDIR)/$(DLLIB)
+$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
+	$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
+install-rb: pre-install-rb install-rb-default
+install-rb-default: pre-install-rb-default
+pre-install-rb pre-install-rb-default: $(RUBYLIBDIR)
+$(RUBYARCHDIR):
+	$(MAKEDIRS) $@
+$(RUBYLIBDIR):
+	$(MAKEDIRS) $@
+site-install: site-install-so site-install-rb
+site-install-so: install-so
+site-install-rb: install-rb
+.SUFFIXES: .c .m .cc .cxx .cpp .C .o
+.cc.o:
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
+.cxx.o:
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
+.cpp.o:
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
+.C.o:
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
+.c.o:
+	$(CC) $(CFLAGS) $(CPPFLAGS) -c $<
+$(DLLIB): $(OBJS)
+	@-$(RM) $@
+	$(LDSHARED) $(DLDFLAGS) $(LIBPATH) -o $@ $(OBJS) $(LOCAL_LIBS) $(LIBS)
+$(OBJS): ruby.h defines.h

data/ext/ferret_ext.so ADDED Viewed

Binary file

data/lib/ferret.rb CHANGED Viewed

@@ -22,7 +22,7 @@
 #++
 # :include: ../TUTORIAL
 module Ferret
-  VERSION = '0.1.4'
+  VERSION = '0.2.0'
 end
 require 'ferret/utils'

data/lib/ferret/document/document.rb CHANGED Viewed

@@ -123,11 +123,14 @@ module Ferret::Document
     # Sets the data in field +field_name+ to +data+. If there is more than one
     # field of that name then it will set the data in the first field of that
-    # name.
+    # name. If there is no field of that name, then a new one will be created.
     def []=(field_name, data)
       field = field(field_name.to_s)
-      raise ArgumentError, "Field does not exist" unless field
-      field.data = data
+      if field
+        field.data = data
+      else
+        add_field(Field.new(field_name.to_s, data))
+      end
     end
     # Returns an array of binaries of the field specified as the method

data/lib/ferret/index/index.rb CHANGED Viewed

@@ -82,7 +82,11 @@ module Ferret::Index
     #
     def initialize(options = {})
       super()
+      options[:default_search_field] &&= options[:default_search_field].to_s
+      options[:default_field] &&= options[:default_field].to_s
       options[:create_if_missing] = true if options[:create_if_missing].nil?
       if options[:path]
         @dir = FSDirectory.new(options[:path], options[:create])
         options[:close_dir] = true
@@ -291,8 +295,25 @@ module Ferret::Index
           return @reader.delete_docs_with_term(t)
         elsif id.is_a?(Term)
           return @reader.delete_docs_with_term(id)
-        else
+        elsif id.is_a?(Integer)
           return @reader.delete(id)
+        else
+          raise ArgumentError, "Cannot delete for id of type #{id.class}"
+        end
+      end
+    end
+    # Delete all documents returned by the query.
+    #
+    # query:: The query to find documents you wish to delete. Can either be a
+    #         string (in which case it is parsed by the standard query parser)
+    #         or an actual query object.
+    def query_delete(query)
+      @dir.synchronize do
+        ensure_searcher_open()
+        query = process_query(query)
+        @searcher.search_each(query) do |doc, score|
+          @reader.delete(doc)
         end
       end
     end
@@ -305,6 +326,72 @@ module Ferret::Index
       end
     end
+    # Update the document referenced by the document number +id+ if +id+ is an
+    # integer or all of the documents which have the term +id+ if +id+ is a
+    # term.
+    #
+    # id::      The number of the document to update. Can also be a string
+    #           representing the value in the +id+ field or a term to match.
+    # new_val:: The values we are updating. This can be a string in which case
+    #           the default field is updated, or it can be a hash, in which
+    #           case, all fields in the hash are updated. You can also pass a
+    #           full Document object but you must pass the doc_num as the id.
+    def update(id, new_val)
+      @dir.synchronize do
+        if id.is_a?(String)
+          query_update("id:#{id}", new_val)
+        elsif id.is_a?(Term)
+          query_update(TermQuery.new(id), new_val)
+        elsif id.is_a?(Integer)
+          ensure_reader_open()
+          document = doc(id)
+          if new_val.is_a?(Hash)
+            new_val.each_pair {|name, content| document[name] = content.to_s}
+          elsif new_val.is_a?(Document)
+            document = new_val
+          else
+            document[@options[:default_field]] = new_val.to_s
+          end
+          @reader.delete(id)
+          ensure_writer_open()
+          @writer.add_document(document)
+        else
+          raise ArgumentError, "Cannot update for id of type #{id.class}"
+        end
+      end
+    end
+    # Update all the documents returned by the query.
+    #
+    # query::   The query to find documents you wish to update. Can either be
+    #           a string (in which case it is parsed by the standard query
+    #           parser) or an actual query object.
+    # new_val:: The values we are updating. This can be a string in which case
+    #           the default field is updated, or it can be a hash, in which
+    #           case, all fields in the hash are updated. If you want to pass
+    #           a full document see #update.
+    def query_update(query, new_val)
+      @dir.synchronize do
+        ensure_searcher_open()
+        docs_to_add = []
+        query = process_query(query)
+        @searcher.search_each(query) do |id, score|
+          document = doc(id)
+          if new_val.is_a?(Hash)
+            new_val.each_pair {|name, content| document[name] = content.to_s}
+          else
+            document[@options[:default_field]] = new_val.to_s
+          end
+          docs_to_add << document
+          @reader.delete(id)
+        end
+        ensure_writer_open()
+        docs_to_add.each do |document|
+          @writer.add_document(document)
+        end
+      end
+    end
     # Returns true if any documents have been deleted since the index was last
     # flushed.
     def has_deletions?()
@@ -432,7 +519,13 @@ module Ferret::Index
       def ensure_reader_open()
         raise "tried to use a closed index" if not @open
-        return if @reader
+        if @reader
+          if not @reader.latest?
+            @reader = IndexReader.open(@dir, false)
+          end
+          return
+        end
         if @writer
           @writer.close
           @writer = nil
@@ -450,6 +543,12 @@ module Ferret::Index
     private
       def do_search(query, options)
         ensure_searcher_open()
+        query = process_query(query)
+        return @searcher.search(query, options)
+      end
+      def process_query(query)
         if query.is_a?(String)
           if @qp.nil?
             @qp = Ferret::QueryParser.new(@default_search_field, @options)
@@ -458,8 +557,7 @@ module Ferret::Index
           @qp.fields = @reader.get_field_names.to_a
           query = @qp.parse(query)
         end
-        return @searcher.search(query, options)
+        return query
       end
   end
 end

data/lib/ferret/index/index_reader.rb CHANGED Viewed

@@ -343,6 +343,12 @@ module Ferret::Index
         end
       end
     end
+    # Returns true if the reader is reading from the latest version of the
+    # index.
+    def latest?()
+      SegmentInfos.read_current_version(@directory) == @segment_infos.version()
+    end
     # Deletes the document numbered +doc_num+.  Once a document is deleted it
     # will not appear in TermDocEnum or TermPositions enumerations.  Attempts to

data/test/unit/index/tc_index.rb CHANGED Viewed

@@ -5,6 +5,7 @@ class IndexTest < Test::Unit::TestCase
   include Ferret::Index
   include Ferret::Analysis
   include Ferret::Store
+  include Ferret::Document
   def setup()
     @qp = Ferret::QueryParser.new()
@@ -289,4 +290,105 @@ class IndexTest < Test::Unit::TestCase
     assert_equal("romeo", index[3]["f"])
     index.close
   end
+  def test_auto_update_when_externally_modified()
+    fs_path = File.expand_path(File.join(File.dirname(__FILE__), '../../temp/fsdir'))
+    index = Index.new(:path => fs_path, :default_field => "f", :create => true)
+    index << "document 1"
+    assert_equal(1, index.size)
+    index2 = Index.new(:path => fs_path, :default_field => "f")
+    assert_equal(1, index2.size)
+    index2 << "document 2"
+    assert_equal(2, index2.size)
+    assert_equal(2, index.size)
+    iw = IndexWriter.new(fs_path, :analyzer => WhiteSpaceAnalyzer.new())
+    doc = Document.new
+    doc << Field.new("f", "content3", Field::Store::YES, Field::Index::TOKENIZED)
+    iw << doc
+    iw.close()
+    assert_equal(3, index.size)
+    assert_equal("content3", index[2]["f"])
+  end
+  def test_delete
+    data = [
+      {:id => 0, :cat => "/cat1/subcat1"},
+      {:id => 1, :cat => "/cat1/subcat2"},
+      {:id => 2, :cat => "/cat1/subcat2"},
+      {:id => 3, :cat => "/cat1/subcat3"},
+      {:id => 4, :cat => "/cat1/subcat4"},
+      {:id => 5, :cat => "/cat2/subcat1"},
+      {:id => 6, :cat => "/cat2/subcat2"},
+      {:id => 7, :cat => "/cat2/subcat3"},
+      {:id => 8, :cat => "/cat2/subcat4"},
+      {:id => 9, :cat => "/cat2/subcat5"},
+    ]
+    index = Index.new(:analyzer => WhiteSpaceAnalyzer.new)
+    data.each {|doc| index << doc }
+    assert_equal(10, index.size)
+    assert_equal(1, index.search("id:9").size)
+    index.delete(9)
+    assert_equal(9, index.size)
+    assert_equal(0, index.search("id:9").size)
+    assert_equal(1, index.search("id:8").size)
+    index.delete("8")
+    assert_equal(8, index.size)
+    assert_equal(0, index.search("id:8").size)
+    assert_equal(5, index.search("cat:/cat1*").size)
+    index.query_delete("cat:/cat1*")
+    assert_equal(3, index.size)
+    assert_equal(0, index.search("cat:/cat1*").size)
+  end
+  def test_update
+    data = [
+      {:id => 0, :cat => "/cat1/subcat1", :content => "content0"},
+      {:id => 1, :cat => "/cat1/subcat2", :content => "content1"},
+      {:id => 2, :cat => "/cat1/subcat2", :content => "content2"},
+      {:id => 3, :cat => "/cat1/subcat3", :content => "content3"},
+      {:id => 4, :cat => "/cat1/subcat4", :content => "content4"},
+      {:id => 5, :cat => "/cat2/subcat1", :content => "content5"},
+      {:id => 6, :cat => "/cat2/subcat2", :content => "content6"},
+      {:id => 7, :cat => "/cat2/subcat3", :content => "content7"},
+      {:id => 8, :cat => "/cat2/subcat4", :content => "content8"},
+      {:id => 9, :cat => "/cat2/subcat5", :content => "content9"},
+    ]
+    index = Index.new(:analyzer => WhiteSpaceAnalyzer.new,
+                      :default_field => :content)
+    data.each { |doc| index << doc }
+    assert_equal(10, index.size)
+    assert_equal("content5", index["5"][:content])
+    index.update(5, "content five")
+    assert_equal("content five", index["5"][:content])
+    assert_equal(nil, index["5"][:extra_content])
+    index.update("5", {:cat => "/cat1/subcat6",
+                       :content => "high five",
+                       :extra_content => "hello"})
+    assert_equal("hello", index["5"][:extra_content])
+    assert_equal("high five", index["5"][:content])
+    assert_equal("/cat1/subcat6", index["5"][:cat])
+    assert_equal("content9", index["9"][:content])
+    index.update(Term.new("content", "content9"), {:content => "content nine"})
+    assert_equal("content nine", index["9"][:content])
+    assert_equal("content0", index["0"][:content])
+    assert_equal(nil, index["0"][:extra_content])
+    document = index[0]
+    document[:content] = "content zero"
+    document[:extra_content] = "extra content"
+    index.update(0, document)
+    assert_equal("content zero", index["0"][:content])
+    assert_equal("extra content", index["0"][:extra_content])
+    assert_equal(nil, index["1"][:tag])
+    assert_equal(nil, index["2"][:tag])
+    assert_equal(nil, index["3"][:tag])
+    assert_equal(nil, index["4"][:tag])
+    index.query_update("id:<5 AND cat:>=/cat1/subcat2", {:tag => "cool"})
+    assert_equal("cool", index["1"][:tag])
+    assert_equal("cool", index["2"][:tag])
+    assert_equal("cool", index["3"][:tag])
+    assert_equal("cool", index["4"][:tag])
+    assert_equal(4, index.search("tag:cool").size)
+  end
 end

data/test/unit/index/tc_index_reader.rb CHANGED Viewed

@@ -417,7 +417,6 @@ module IndexReaderCommon
     ir3.close()
   end
 end
 class SegmentReaderTest < Test::Unit::TestCase
@@ -618,5 +617,33 @@ class IndexReaderTest < Test::Unit::TestCase
     ir.close()
     fs_dir.close()
   end
+  def test_latest()
+    dpath = File.join(File.dirname(__FILE__),
+                       '../../temp/fsdir')
+    fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
+    iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
+    doc = Document.new
+    doc << Field.new("field", "content", Field::Store::YES, Field::Index::TOKENIZED)
+    iw << doc
+    iw.close()
+    ir = IndexReader.open(fs_dir, false)
+    assert(ir.latest?)
+    iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
+    doc = Document.new
+    doc << Field.new("field", "content2", Field::Store::YES, Field::Index::TOKENIZED)
+    iw << doc
+    iw.close()
+    assert(!ir.latest?)
+    ir.close()
+    ir = IndexReader.open(fs_dir, false)
+    assert(ir.latest?)
+    ir.close()
+  end
 end

metadata CHANGED Viewed

@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
 specification_version: 1
 name: ferret
 version: !ruby/object:Gem::Version
-  version: 0.1.4
-date: 2005-11-01 00:00:00 +09:00
+  version: 0.2.0
+date: 2005-11-12 00:00:00 +09:00
 summary: Ruby indexing library.
 require_paths:
   - lib
@@ -35,6 +35,7 @@ files:
   - MIT-LICENSE
   - TODO
   - TUTORIAL
+  - ext/Makefile
   - ext/index_io.c
   - ext/term_buffer.c
   - ext/ram_directory.c
@@ -47,6 +48,7 @@ files:
   - ext/ferret.h
   - ext/util.c
   - ext/tags
+  - ext/ferret_ext.so
   - ext/dummy.exe
   - lib/ferret.rb
   - lib/ferret/analysis.rb