RubyGems - ferret - Versions diffs - 0.1.4 → 0.2.0 - Mend

ferret 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

data/TUTORIAL +5 -4
data/ext/Makefile +140 -0
data/ext/ferret_ext.so +0 -0
data/lib/ferret.rb +1 -1
data/lib/ferret/document/document.rb +6 -3
data/lib/ferret/index/index.rb +102 -4
data/lib/ferret/index/index_reader.rb +6 -0
data/test/unit/index/tc_index.rb +102 -0
data/test/unit/index/tc_index_reader.rb +28 -1
metadata +4 -2

data/TUTORIAL CHANGED Viewed

@@ -81,16 +81,17 @@ phrase "quick brown fox" in the content field. We'd write;
   end
 But "fast" has a pretty similar meaning to "quick" and we don't mind if the
-fox is a little red. So we could expand our search like this;
+fox is a little red. Also, the phrase could be in the title so we'll search
+there as well. So we could expand our search like this;
-  index.search_each('content:"quick|fast brown|red fox"') do |doc, score|
+  index.search_each('title|content:"quick|fast brown|red fox"') do |doc, score|
     puts "Document #{doc} found with a score of #{score}"
   end
 What if we want to find all documents entered on or after 5th of September,
-2005 with the words "ruby" or "rails" in it. We could type something like;
+2005 with the words "ruby" or "rails" in any field? We could type something like;
-  index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |doc, score|
+  index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |doc, score|
     puts "Document #{doc} found with a score of #{score}"
   end

data/ext/Makefile ADDED Viewed

@@ -0,0 +1,140 @@
+SHELL = /bin/sh
+#### Start of system configuration section. ####
+srcdir = .
+topdir = /usr/lib/ruby/1.8/i486-linux
+hdrdir = $(topdir)
+VPATH = $(srcdir):$(topdir):$(hdrdir)
+prefix = $(DESTDIR)/usr
+exec_prefix = $(prefix)
+sitedir = $(DESTDIR)/usr/local/lib/site_ruby
+rubylibdir = $(libdir)/ruby/$(ruby_version)
+archdir = $(rubylibdir)/$(arch)
+sbindir = $(exec_prefix)/sbin
+datadir = $(prefix)/share
+includedir = $(prefix)/include
+infodir = $(prefix)/info
+sysconfdir = $(DESTDIR)/etc
+mandir = $(datadir)/man
+libdir = $(exec_prefix)/lib
+sharedstatedir = $(prefix)/com
+oldincludedir = $(DESTDIR)/usr/include
+sitearchdir = $(sitelibdir)/$(sitearch)
+bindir = $(exec_prefix)/bin
+localstatedir = $(DESTDIR)/var
+sitelibdir = $(sitedir)/$(ruby_version)
+libexecdir = $(exec_prefix)/libexec
+CC = gcc
+LIBRUBY = $(LIBRUBY_SO)
+LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
+LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
+LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
+CFLAGS   =  -fPIC -Wall -g -O2  -fPIC
+CPPFLAGS = -I. -I$(topdir) -I$(hdrdir) -I$(srcdir)
+CXXFLAGS = $(CFLAGS)
+DLDFLAGS =
+LDSHARED = $(CC) -shared
+AR = ar
+EXEEXT =
+RUBY_INSTALL_NAME = ruby1.8
+RUBY_SO_NAME = ruby1.8
+arch = i486-linux
+sitearch = i486-linux
+ruby_version = 1.8
+ruby = /usr/bin/ruby1.8
+RUBY = $(ruby)
+RM = rm -f
+MAKEDIRS = mkdir -p
+INSTALL = /usr/bin/install -c
+INSTALL_PROG = $(INSTALL) -m 0755
+INSTALL_DATA = $(INSTALL) -m 644
+COPY = cp
+#### End of system configuration section. ####
+preload =
+libpath = $(libdir)
+LIBPATH =  -L"$(libdir)"
+DEFFILE =
+CLEANFILES =
+DISTCLEANFILES =
+extout =
+extout_prefix =
+target_prefix =
+LOCAL_LIBS =
+LIBS = $(LIBRUBYARG_SHARED)  -lpthread -ldl -lcrypt -lm   -lc
+SRCS = index_io.c term_buffer.c ram_directory.c priority_queue.c string_helper.c segment_merge_queue.c ferret.c term.c util.c
+OBJS = index_io.o term_buffer.o ram_directory.o priority_queue.o string_helper.o segment_merge_queue.o ferret.o term.o util.o
+TARGET = ferret_ext
+DLLIB = $(TARGET).so
+STATIC_LIB =
+RUBYCOMMONDIR = $(sitedir)$(target_prefix)
+RUBYLIBDIR    = $(sitelibdir)$(target_prefix)
+RUBYARCHDIR   = $(sitearchdir)$(target_prefix)
+TARGET_SO     = $(DLLIB)
+CLEANLIBS     = $(TARGET).so $(TARGET).il? $(TARGET).tds $(TARGET).map
+CLEANOBJS     = *.o *.a *.s[ol] *.pdb *.exp *.bak
+all:		$(DLLIB)
+static:		$(STATIC_LIB)
+clean:
+		@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
+distclean:	clean
+		@-$(RM) Makefile extconf.h conftest.* mkmf.log
+		@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
+realclean:	distclean
+install: install-so install-rb
+install-so: $(RUBYARCHDIR)
+install-so: $(RUBYARCHDIR)/$(DLLIB)
+$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
+	$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
+install-rb: pre-install-rb install-rb-default
+install-rb-default: pre-install-rb-default
+pre-install-rb pre-install-rb-default: $(RUBYLIBDIR)
+$(RUBYARCHDIR):
+	$(MAKEDIRS) $@
+$(RUBYLIBDIR):
+	$(MAKEDIRS) $@
+site-install: site-install-so site-install-rb
+site-install-so: install-so
+site-install-rb: install-rb
+.SUFFIXES: .c .m .cc .cxx .cpp .C .o
+.cc.o:
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
+.cxx.o:
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
+.cpp.o:
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
+.C.o:
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
+.c.o:
+	$(CC) $(CFLAGS) $(CPPFLAGS) -c $<
+$(DLLIB): $(OBJS)
+	@-$(RM) $@
+	$(LDSHARED) $(DLDFLAGS) $(LIBPATH) -o $@ $(OBJS) $(LOCAL_LIBS) $(LIBS)
+$(OBJS): ruby.h defines.h

data/ext/ferret_ext.so ADDED Viewed

Binary file

data/lib/ferret.rb CHANGED Viewed

@@ -22,7 +22,7 @@
 #++
 # :include: ../TUTORIAL
 module Ferret
-  VERSION = '0.1.4'
+  VERSION = '0.2.0'
 end
 require 'ferret/utils'

data/lib/ferret/document/document.rb CHANGED Viewed

@@ -123,11 +123,14 @@ module Ferret::Document
     # Sets the data in field +field_name+ to +data+. If there is more than one
     # field of that name then it will set the data in the first field of that
-    # name.
+    # name. If there is no field of that name, then a new one will be created.
     def []=(field_name, data)
       field = field(field_name.to_s)
-      raise ArgumentError, "Field does not exist" unless field
-      field.data = data
+      if field
+        field.data = data
+      else
+        add_field(Field.new(field_name.to_s, data))
+      end
     end
     # Returns an array of binaries of the field specified as the method

data/lib/ferret/index/index.rb CHANGED Viewed

@@ -82,7 +82,11 @@ module Ferret::Index
     #
     def initialize(options = {})
       super()
+      options[:default_search_field] &&= options[:default_search_field].to_s
+      options[:default_field] &&= options[:default_field].to_s
       options[:create_if_missing] = true if options[:create_if_missing].nil?
       if options[:path]
         @dir = FSDirectory.new(options[:path], options[:create])
         options[:close_dir] = true
@@ -291,8 +295,25 @@ module Ferret::Index
           return @reader.delete_docs_with_term(t)
         elsif id.is_a?(Term)
           return @reader.delete_docs_with_term(id)
-        else
+        elsif id.is_a?(Integer)
           return @reader.delete(id)
+        else
+          raise ArgumentError, "Cannot delete for id of type #{id.class}"
+        end
+      end
+    end
+    # Delete all documents returned by the query.
+    #
+    # query:: The query to find documents you wish to delete. Can either be a
+    #         string (in which case it is parsed by the standard query parser)
+    #         or an actual query object.
+    def query_delete(query)
+      @dir.synchronize do
+        ensure_searcher_open()
+        query = process_query(query)
+        @searcher.search_each(query) do |doc, score|
+          @reader.delete(doc)
         end
       end
     end
@@ -305,6 +326,72 @@ module Ferret::Index
       end
     end
+    # Update the document referenced by the document number +id+ if +id+ is an
+    # integer or all of the documents which have the term +id+ if +id+ is a
+    # term.
+    #
+    # id::      The number of the document to update. Can also be a string
+    #           representing the value in the +id+ field or a term to match.
+    # new_val:: The values we are updating. This can be a string in which case
+    #           the default field is updated, or it can be a hash, in which
+    #           case, all fields in the hash are updated. You can also pass a
+    #           full Document object but you must pass the doc_num as the id.
+    def update(id, new_val)
+      @dir.synchronize do
+        if id.is_a?(String)
+          query_update("id:#{id}", new_val)
+        elsif id.is_a?(Term)
+          query_update(TermQuery.new(id), new_val)
+        elsif id.is_a?(Integer)
+          ensure_reader_open()
+          document = doc(id)
+          if new_val.is_a?(Hash)
+            new_val.each_pair {|name, content| document[name] = content.to_s}
+          elsif new_val.is_a?(Document)
+            document = new_val
+          else
+            document[@options[:default_field]] = new_val.to_s
+          end
+          @reader.delete(id)
+          ensure_writer_open()
+          @writer.add_document(document)
+        else
+          raise ArgumentError, "Cannot update for id of type #{id.class}"
+        end
+      end
+    end
+    # Update all the documents returned by the query.
+    #
+    # query::   The query to find documents you wish to update. Can either be
+    #           a string (in which case it is parsed by the standard query
+    #           parser) or an actual query object.
+    # new_val:: The values we are updating. This can be a string in which case
+    #           the default field is updated, or it can be a hash, in which
+    #           case, all fields in the hash are updated. If you want to pass
+    #           a full document see #update.
+    def query_update(query, new_val)
+      @dir.synchronize do
+        ensure_searcher_open()
+        docs_to_add = []
+        query = process_query(query)
+        @searcher.search_each(query) do |id, score|
+          document = doc(id)
+          if new_val.is_a?(Hash)
+            new_val.each_pair {|name, content| document[name] = content.to_s}
+          else
+            document[@options[:default_field]] = new_val.to_s
+          end
+          docs_to_add << document
+          @reader.delete(id)
+        end
+        ensure_writer_open()
+        docs_to_add.each do |document|
+          @writer.add_document(document)
+        end
+      end
+    end
     # Returns true if any documents have been deleted since the index was last
     # flushed.
     def has_deletions?()
@@ -432,7 +519,13 @@ module Ferret::Index
       def ensure_reader_open()
         raise "tried to use a closed index" if not @open
-        return if @reader
+        if @reader
+          if not @reader.latest?
+            @reader = IndexReader.open(@dir, false)
+          end
+          return
+        end
         if @writer
           @writer.close
           @writer = nil
@@ -450,6 +543,12 @@ module Ferret::Index
     private
       def do_search(query, options)
         ensure_searcher_open()
+        query = process_query(query)
+        return @searcher.search(query, options)
+      end
+      def process_query(query)
         if query.is_a?(String)
           if @qp.nil?
             @qp = Ferret::QueryParser.new(@default_search_field, @options)
@@ -458,8 +557,7 @@ module Ferret::Index
           @qp.fields = @reader.get_field_names.to_a
           query = @qp.parse(query)
         end
-        return @searcher.search(query, options)
+        return query
       end
   end
 end

data/lib/ferret/index/index_reader.rb CHANGED Viewed

@@ -343,6 +343,12 @@ module Ferret::Index
         end
       end
     end
+    # Returns true if the reader is reading from the latest version of the
+    # index.
+    def latest?()
+      SegmentInfos.read_current_version(@directory) == @segment_infos.version()
+    end
     # Deletes the document numbered +doc_num+.  Once a document is deleted it
     # will not appear in TermDocEnum or TermPositions enumerations.  Attempts to

data/test/unit/index/tc_index.rb CHANGED Viewed

@@ -5,6 +5,7 @@ class IndexTest < Test::Unit::TestCase
   include Ferret::Index
   include Ferret::Analysis
   include Ferret::Store
+  include Ferret::Document
   def setup()
     @qp = Ferret::QueryParser.new()
@@ -289,4 +290,105 @@ class IndexTest < Test::Unit::TestCase
     assert_equal("romeo", index[3]["f"])
     index.close
   end
+  def test_auto_update_when_externally_modified()
+    fs_path = File.expand_path(File.join(File.dirname(__FILE__), '../../temp/fsdir'))
+    index = Index.new(:path => fs_path, :default_field => "f", :create => true)
+    index << "document 1"
+    assert_equal(1, index.size)
+    index2 = Index.new(:path => fs_path, :default_field => "f")
+    assert_equal(1, index2.size)
+    index2 << "document 2"
+    assert_equal(2, index2.size)
+    assert_equal(2, index.size)
+    iw = IndexWriter.new(fs_path, :analyzer => WhiteSpaceAnalyzer.new())
+    doc = Document.new
+    doc << Field.new("f", "content3", Field::Store::YES, Field::Index::TOKENIZED)
+    iw << doc
+    iw.close()
+    assert_equal(3, index.size)
+    assert_equal("content3", index[2]["f"])
+  end
+  def test_delete
+    data = [
+      {:id => 0, :cat => "/cat1/subcat1"},
+      {:id => 1, :cat => "/cat1/subcat2"},
+      {:id => 2, :cat => "/cat1/subcat2"},
+      {:id => 3, :cat => "/cat1/subcat3"},
+      {:id => 4, :cat => "/cat1/subcat4"},
+      {:id => 5, :cat => "/cat2/subcat1"},
+      {:id => 6, :cat => "/cat2/subcat2"},
+      {:id => 7, :cat => "/cat2/subcat3"},
+      {:id => 8, :cat => "/cat2/subcat4"},
+      {:id => 9, :cat => "/cat2/subcat5"},
+    ]
+    index = Index.new(:analyzer => WhiteSpaceAnalyzer.new)
+    data.each {|doc| index << doc }
+    assert_equal(10, index.size)
+    assert_equal(1, index.search("id:9").size)
+    index.delete(9)
+    assert_equal(9, index.size)
+    assert_equal(0, index.search("id:9").size)
+    assert_equal(1, index.search("id:8").size)
+    index.delete("8")
+    assert_equal(8, index.size)
+    assert_equal(0, index.search("id:8").size)
+    assert_equal(5, index.search("cat:/cat1*").size)
+    index.query_delete("cat:/cat1*")
+    assert_equal(3, index.size)
+    assert_equal(0, index.search("cat:/cat1*").size)
+  end
+  def test_update
+    data = [
+      {:id => 0, :cat => "/cat1/subcat1", :content => "content0"},
+      {:id => 1, :cat => "/cat1/subcat2", :content => "content1"},
+      {:id => 2, :cat => "/cat1/subcat2", :content => "content2"},
+      {:id => 3, :cat => "/cat1/subcat3", :content => "content3"},
+      {:id => 4, :cat => "/cat1/subcat4", :content => "content4"},
+      {:id => 5, :cat => "/cat2/subcat1", :content => "content5"},
+      {:id => 6, :cat => "/cat2/subcat2", :content => "content6"},
+      {:id => 7, :cat => "/cat2/subcat3", :content => "content7"},
+      {:id => 8, :cat => "/cat2/subcat4", :content => "content8"},
+      {:id => 9, :cat => "/cat2/subcat5", :content => "content9"},
+    ]
+    index = Index.new(:analyzer => WhiteSpaceAnalyzer.new,
+                      :default_field => :content)
+    data.each { |doc| index << doc }
+    assert_equal(10, index.size)
+    assert_equal("content5", index["5"][:content])
+    index.update(5, "content five")
+    assert_equal("content five", index["5"][:content])
+    assert_equal(nil, index["5"][:extra_content])
+    index.update("5", {:cat => "/cat1/subcat6",
+                       :content => "high five",
+                       :extra_content => "hello"})
+    assert_equal("hello", index["5"][:extra_content])
+    assert_equal("high five", index["5"][:content])
+    assert_equal("/cat1/subcat6", index["5"][:cat])
+    assert_equal("content9", index["9"][:content])
+    index.update(Term.new("content", "content9"), {:content => "content nine"})
+    assert_equal("content nine", index["9"][:content])
+    assert_equal("content0", index["0"][:content])
+    assert_equal(nil, index["0"][:extra_content])
+    document = index[0]
+    document[:content] = "content zero"
+    document[:extra_content] = "extra content"
+    index.update(0, document)
+    assert_equal("content zero", index["0"][:content])
+    assert_equal("extra content", index["0"][:extra_content])
+    assert_equal(nil, index["1"][:tag])
+    assert_equal(nil, index["2"][:tag])
+    assert_equal(nil, index["3"][:tag])
+    assert_equal(nil, index["4"][:tag])
+    index.query_update("id:<5 AND cat:>=/cat1/subcat2", {:tag => "cool"})
+    assert_equal("cool", index["1"][:tag])
+    assert_equal("cool", index["2"][:tag])
+    assert_equal("cool", index["3"][:tag])
+    assert_equal("cool", index["4"][:tag])
+    assert_equal(4, index.search("tag:cool").size)
+  end
 end

data/test/unit/index/tc_index_reader.rb CHANGED Viewed

@@ -417,7 +417,6 @@ module IndexReaderCommon
     ir3.close()
   end
 end
 class SegmentReaderTest < Test::Unit::TestCase
@@ -618,5 +617,33 @@ class IndexReaderTest < Test::Unit::TestCase
     ir.close()
     fs_dir.close()
   end
+  def test_latest()
+    dpath = File.join(File.dirname(__FILE__),
+                       '../../temp/fsdir')
+    fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
+    iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
+    doc = Document.new
+    doc << Field.new("field", "content", Field::Store::YES, Field::Index::TOKENIZED)
+    iw << doc
+    iw.close()
+    ir = IndexReader.open(fs_dir, false)
+    assert(ir.latest?)
+    iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
+    doc = Document.new
+    doc << Field.new("field", "content2", Field::Store::YES, Field::Index::TOKENIZED)
+    iw << doc
+    iw.close()
+    assert(!ir.latest?)
+    ir.close()
+    ir = IndexReader.open(fs_dir, false)
+    assert(ir.latest?)
+    ir.close()
+  end
 end

metadata CHANGED Viewed

@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
 specification_version: 1
 name: ferret
 version: !ruby/object:Gem::Version
-  version: 0.1.4
-date: 2005-11-01 00:00:00 +09:00
+  version: 0.2.0
+date: 2005-11-12 00:00:00 +09:00
 summary: Ruby indexing library.
 require_paths:
   - lib
@@ -35,6 +35,7 @@ files:
   - MIT-LICENSE
   - TODO
   - TUTORIAL
+  - ext/Makefile
   - ext/index_io.c
   - ext/term_buffer.c
   - ext/ram_directory.c
@@ -47,6 +48,7 @@ files:
   - ext/ferret.h
   - ext/util.c
   - ext/tags
+  - ext/ferret_ext.so
   - ext/dummy.exe
   - lib/ferret.rb
   - lib/ferret/analysis.rb