ferret 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/TUTORIAL CHANGED
@@ -81,16 +81,17 @@ phrase "quick brown fox" in the content field. We'd write;
81
81
  end
82
82
 
83
83
  But "fast" has a pretty similar meaning to "quick" and we don't mind if the
84
- fox is a little red. So we could expand our search like this;
84
+ fox is a little red. Also, the phrase could be in the title so we'll search
85
+ there as well. So we could expand our search like this;
85
86
 
86
- index.search_each('content:"quick|fast brown|red fox"') do |doc, score|
87
+ index.search_each('title|content:"quick|fast brown|red fox"') do |doc, score|
87
88
  puts "Document #{doc} found with a score of #{score}"
88
89
  end
89
90
 
90
91
  What if we want to find all documents entered on or after 5th of September,
91
- 2005 with the words "ruby" or "rails" in it. We could type something like;
92
+ 2005 with the words "ruby" or "rails" in any field? We could type something like;
92
93
 
93
- index.search_each('date:( >= 20050905) content:(ruby OR rails)') do |doc, score|
94
+ index.search_each('date:( >= 20050905) *:(ruby OR rails)') do |doc, score|
94
95
  puts "Document #{doc} found with a score of #{score}"
95
96
  end
96
97
 
data/ext/Makefile ADDED
@@ -0,0 +1,140 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ #### Start of system configuration section. ####
5
+
6
+ srcdir = .
7
+ topdir = /usr/lib/ruby/1.8/i486-linux
8
+ hdrdir = $(topdir)
9
+ VPATH = $(srcdir):$(topdir):$(hdrdir)
10
+ prefix = $(DESTDIR)/usr
11
+ exec_prefix = $(prefix)
12
+ sitedir = $(DESTDIR)/usr/local/lib/site_ruby
13
+ rubylibdir = $(libdir)/ruby/$(ruby_version)
14
+ archdir = $(rubylibdir)/$(arch)
15
+ sbindir = $(exec_prefix)/sbin
16
+ datadir = $(prefix)/share
17
+ includedir = $(prefix)/include
18
+ infodir = $(prefix)/info
19
+ sysconfdir = $(DESTDIR)/etc
20
+ mandir = $(datadir)/man
21
+ libdir = $(exec_prefix)/lib
22
+ sharedstatedir = $(prefix)/com
23
+ oldincludedir = $(DESTDIR)/usr/include
24
+ sitearchdir = $(sitelibdir)/$(sitearch)
25
+ bindir = $(exec_prefix)/bin
26
+ localstatedir = $(DESTDIR)/var
27
+ sitelibdir = $(sitedir)/$(ruby_version)
28
+ libexecdir = $(exec_prefix)/libexec
29
+
30
+ CC = gcc
31
+ LIBRUBY = $(LIBRUBY_SO)
32
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
33
+ LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
34
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
35
+
36
+ CFLAGS = -fPIC -Wall -g -O2 -fPIC
37
+ CPPFLAGS = -I. -I$(topdir) -I$(hdrdir) -I$(srcdir)
38
+ CXXFLAGS = $(CFLAGS)
39
+ DLDFLAGS =
40
+ LDSHARED = $(CC) -shared
41
+ AR = ar
42
+ EXEEXT =
43
+
44
+ RUBY_INSTALL_NAME = ruby1.8
45
+ RUBY_SO_NAME = ruby1.8
46
+ arch = i486-linux
47
+ sitearch = i486-linux
48
+ ruby_version = 1.8
49
+ ruby = /usr/bin/ruby1.8
50
+ RUBY = $(ruby)
51
+ RM = rm -f
52
+ MAKEDIRS = mkdir -p
53
+ INSTALL = /usr/bin/install -c
54
+ INSTALL_PROG = $(INSTALL) -m 0755
55
+ INSTALL_DATA = $(INSTALL) -m 644
56
+ COPY = cp
57
+
58
+ #### End of system configuration section. ####
59
+
60
+ preload =
61
+
62
+ libpath = $(libdir)
63
+ LIBPATH = -L"$(libdir)"
64
+ DEFFILE =
65
+
66
+ CLEANFILES =
67
+ DISTCLEANFILES =
68
+
69
+ extout =
70
+ extout_prefix =
71
+ target_prefix =
72
+ LOCAL_LIBS =
73
+ LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lcrypt -lm -lc
74
+ SRCS = index_io.c term_buffer.c ram_directory.c priority_queue.c string_helper.c segment_merge_queue.c ferret.c term.c util.c
75
+ OBJS = index_io.o term_buffer.o ram_directory.o priority_queue.o string_helper.o segment_merge_queue.o ferret.o term.o util.o
76
+ TARGET = ferret_ext
77
+ DLLIB = $(TARGET).so
78
+ STATIC_LIB =
79
+
80
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
81
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
82
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
83
+
84
+ TARGET_SO = $(DLLIB)
85
+ CLEANLIBS = $(TARGET).so $(TARGET).il? $(TARGET).tds $(TARGET).map
86
+ CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak
87
+
88
+ all: $(DLLIB)
89
+ static: $(STATIC_LIB)
90
+
91
+ clean:
92
+ @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
93
+
94
+ distclean: clean
95
+ @-$(RM) Makefile extconf.h conftest.* mkmf.log
96
+ @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
97
+
98
+ realclean: distclean
99
+ install: install-so install-rb
100
+
101
+ install-so: $(RUBYARCHDIR)
102
+ install-so: $(RUBYARCHDIR)/$(DLLIB)
103
+ $(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
104
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
105
+ install-rb: pre-install-rb install-rb-default
106
+ install-rb-default: pre-install-rb-default
107
+ pre-install-rb pre-install-rb-default: $(RUBYLIBDIR)
108
+ $(RUBYARCHDIR):
109
+ $(MAKEDIRS) $@
110
+ $(RUBYLIBDIR):
111
+ $(MAKEDIRS) $@
112
+
113
+ site-install: site-install-so site-install-rb
114
+ site-install-so: install-so
115
+ site-install-rb: install-rb
116
+
117
+ .SUFFIXES: .c .m .cc .cxx .cpp .C .o
118
+
119
+ .cc.o:
120
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
121
+
122
+ .cxx.o:
123
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
124
+
125
+ .cpp.o:
126
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
127
+
128
+ .C.o:
129
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $<
130
+
131
+ .c.o:
132
+ $(CC) $(CFLAGS) $(CPPFLAGS) -c $<
133
+
134
+ $(DLLIB): $(OBJS)
135
+ @-$(RM) $@
136
+ $(LDSHARED) $(DLDFLAGS) $(LIBPATH) -o $@ $(OBJS) $(LOCAL_LIBS) $(LIBS)
137
+
138
+
139
+
140
+ $(OBJS): ruby.h defines.h
data/ext/ferret_ext.so ADDED
Binary file
data/lib/ferret.rb CHANGED
@@ -22,7 +22,7 @@
22
22
  #++
23
23
  # :include: ../TUTORIAL
24
24
  module Ferret
25
- VERSION = '0.1.4'
25
+ VERSION = '0.2.0'
26
26
  end
27
27
 
28
28
  require 'ferret/utils'
@@ -123,11 +123,14 @@ module Ferret::Document
123
123
 
124
124
  # Sets the data in field +field_name+ to +data+. If there is more than one
125
125
  # field of that name then it will set the data in the first field of that
126
- # name.
126
+ # name. If there is no field of that name, then a new one will be created.
127
127
  def []=(field_name, data)
128
128
  field = field(field_name.to_s)
129
- raise ArgumentError, "Field does not exist" unless field
130
- field.data = data
129
+ if field
130
+ field.data = data
131
+ else
132
+ add_field(Field.new(field_name.to_s, data))
133
+ end
131
134
  end
132
135
 
133
136
  # Returns an array of binaries of the field specified as the method
@@ -82,7 +82,11 @@ module Ferret::Index
82
82
  #
83
83
  def initialize(options = {})
84
84
  super()
85
+
86
+ options[:default_search_field] &&= options[:default_search_field].to_s
87
+ options[:default_field] &&= options[:default_field].to_s
85
88
  options[:create_if_missing] = true if options[:create_if_missing].nil?
89
+
86
90
  if options[:path]
87
91
  @dir = FSDirectory.new(options[:path], options[:create])
88
92
  options[:close_dir] = true
@@ -291,8 +295,25 @@ module Ferret::Index
291
295
  return @reader.delete_docs_with_term(t)
292
296
  elsif id.is_a?(Term)
293
297
  return @reader.delete_docs_with_term(id)
294
- else
298
+ elsif id.is_a?(Integer)
295
299
  return @reader.delete(id)
300
+ else
301
+ raise ArgumentError, "Cannot delete for id of type #{id.class}"
302
+ end
303
+ end
304
+ end
305
+
306
+ # Delete all documents returned by the query.
307
+ #
308
+ # query:: The query to find documents you wish to delete. Can either be a
309
+ # string (in which case it is parsed by the standard query parser)
310
+ # or an actual query object.
311
+ def query_delete(query)
312
+ @dir.synchronize do
313
+ ensure_searcher_open()
314
+ query = process_query(query)
315
+ @searcher.search_each(query) do |doc, score|
316
+ @reader.delete(doc)
296
317
  end
297
318
  end
298
319
  end
@@ -305,6 +326,72 @@ module Ferret::Index
305
326
  end
306
327
  end
307
328
 
329
+ # Update the document referenced by the document number +id+ if +id+ is an
330
+ # integer or all of the documents which have the term +id+ if +id+ is a
331
+ # term.
332
+ #
333
+ # id:: The number of the document to update. Can also be a string
334
+ # representing the value in the +id+ field or a term to match.
335
+ # new_val:: The values we are updating. This can be a string in which case
336
+ # the default field is updated, or it can be a hash, in which
337
+ # case, all fields in the hash are updated. You can also pass a
338
+ # full Document object but you must pass the doc_num as the id.
339
+ def update(id, new_val)
340
+ @dir.synchronize do
341
+ if id.is_a?(String)
342
+ query_update("id:#{id}", new_val)
343
+ elsif id.is_a?(Term)
344
+ query_update(TermQuery.new(id), new_val)
345
+ elsif id.is_a?(Integer)
346
+ ensure_reader_open()
347
+ document = doc(id)
348
+ if new_val.is_a?(Hash)
349
+ new_val.each_pair {|name, content| document[name] = content.to_s}
350
+ elsif new_val.is_a?(Document)
351
+ document = new_val
352
+ else
353
+ document[@options[:default_field]] = new_val.to_s
354
+ end
355
+ @reader.delete(id)
356
+ ensure_writer_open()
357
+ @writer.add_document(document)
358
+ else
359
+ raise ArgumentError, "Cannot update for id of type #{id.class}"
360
+ end
361
+ end
362
+ end
363
+
364
+ # Update all the documents returned by the query.
365
+ #
366
+ # query:: The query to find documents you wish to update. Can either be
367
+ # a string (in which case it is parsed by the standard query
368
+ # parser) or an actual query object.
369
+ # new_val:: The values we are updating. This can be a string in which case
370
+ # the default field is updated, or it can be a hash, in which
371
+ # case, all fields in the hash are updated. If you want to pass
372
+ # a full document see #update.
373
+ def query_update(query, new_val)
374
+ @dir.synchronize do
375
+ ensure_searcher_open()
376
+ docs_to_add = []
377
+ query = process_query(query)
378
+ @searcher.search_each(query) do |id, score|
379
+ document = doc(id)
380
+ if new_val.is_a?(Hash)
381
+ new_val.each_pair {|name, content| document[name] = content.to_s}
382
+ else
383
+ document[@options[:default_field]] = new_val.to_s
384
+ end
385
+ docs_to_add << document
386
+ @reader.delete(id)
387
+ end
388
+ ensure_writer_open()
389
+ docs_to_add.each do |document|
390
+ @writer.add_document(document)
391
+ end
392
+ end
393
+ end
394
+
308
395
  # Returns true if any documents have been deleted since the index was last
309
396
  # flushed.
310
397
  def has_deletions?()
@@ -432,7 +519,13 @@ module Ferret::Index
432
519
 
433
520
  def ensure_reader_open()
434
521
  raise "tried to use a closed index" if not @open
435
- return if @reader
522
+ if @reader
523
+ if not @reader.latest?
524
+ @reader = IndexReader.open(@dir, false)
525
+ end
526
+ return
527
+ end
528
+
436
529
  if @writer
437
530
  @writer.close
438
531
  @writer = nil
@@ -450,6 +543,12 @@ module Ferret::Index
450
543
  private
451
544
  def do_search(query, options)
452
545
  ensure_searcher_open()
546
+ query = process_query(query)
547
+
548
+ return @searcher.search(query, options)
549
+ end
550
+
551
+ def process_query(query)
453
552
  if query.is_a?(String)
454
553
  if @qp.nil?
455
554
  @qp = Ferret::QueryParser.new(@default_search_field, @options)
@@ -458,8 +557,7 @@ module Ferret::Index
458
557
  @qp.fields = @reader.get_field_names.to_a
459
558
  query = @qp.parse(query)
460
559
  end
461
-
462
- return @searcher.search(query, options)
560
+ return query
463
561
  end
464
562
  end
465
563
  end
@@ -343,6 +343,12 @@ module Ferret::Index
343
343
  end
344
344
  end
345
345
  end
346
+
347
+ # Returns true if the reader is reading from the latest version of the
348
+ # index.
349
+ def latest?()
350
+ SegmentInfos.read_current_version(@directory) == @segment_infos.version()
351
+ end
346
352
 
347
353
  # Deletes the document numbered +doc_num+. Once a document is deleted it
348
354
  # will not appear in TermDocEnum or TermPositions enumerations. Attempts to
@@ -5,6 +5,7 @@ class IndexTest < Test::Unit::TestCase
5
5
  include Ferret::Index
6
6
  include Ferret::Analysis
7
7
  include Ferret::Store
8
+ include Ferret::Document
8
9
 
9
10
  def setup()
10
11
  @qp = Ferret::QueryParser.new()
@@ -289,4 +290,105 @@ class IndexTest < Test::Unit::TestCase
289
290
  assert_equal("romeo", index[3]["f"])
290
291
  index.close
291
292
  end
293
+
294
+ def test_auto_update_when_externally_modified()
295
+ fs_path = File.expand_path(File.join(File.dirname(__FILE__), '../../temp/fsdir'))
296
+ index = Index.new(:path => fs_path, :default_field => "f", :create => true)
297
+ index << "document 1"
298
+ assert_equal(1, index.size)
299
+
300
+ index2 = Index.new(:path => fs_path, :default_field => "f")
301
+ assert_equal(1, index2.size)
302
+ index2 << "document 2"
303
+ assert_equal(2, index2.size)
304
+ assert_equal(2, index.size)
305
+
306
+ iw = IndexWriter.new(fs_path, :analyzer => WhiteSpaceAnalyzer.new())
307
+ doc = Document.new
308
+ doc << Field.new("f", "content3", Field::Store::YES, Field::Index::TOKENIZED)
309
+ iw << doc
310
+ iw.close()
311
+ assert_equal(3, index.size)
312
+ assert_equal("content3", index[2]["f"])
313
+ end
314
+
315
+ def test_delete
316
+ data = [
317
+ {:id => 0, :cat => "/cat1/subcat1"},
318
+ {:id => 1, :cat => "/cat1/subcat2"},
319
+ {:id => 2, :cat => "/cat1/subcat2"},
320
+ {:id => 3, :cat => "/cat1/subcat3"},
321
+ {:id => 4, :cat => "/cat1/subcat4"},
322
+ {:id => 5, :cat => "/cat2/subcat1"},
323
+ {:id => 6, :cat => "/cat2/subcat2"},
324
+ {:id => 7, :cat => "/cat2/subcat3"},
325
+ {:id => 8, :cat => "/cat2/subcat4"},
326
+ {:id => 9, :cat => "/cat2/subcat5"},
327
+ ]
328
+ index = Index.new(:analyzer => WhiteSpaceAnalyzer.new)
329
+ data.each {|doc| index << doc }
330
+ assert_equal(10, index.size)
331
+ assert_equal(1, index.search("id:9").size)
332
+ index.delete(9)
333
+ assert_equal(9, index.size)
334
+ assert_equal(0, index.search("id:9").size)
335
+ assert_equal(1, index.search("id:8").size)
336
+ index.delete("8")
337
+ assert_equal(8, index.size)
338
+ assert_equal(0, index.search("id:8").size)
339
+ assert_equal(5, index.search("cat:/cat1*").size)
340
+ index.query_delete("cat:/cat1*")
341
+ assert_equal(3, index.size)
342
+ assert_equal(0, index.search("cat:/cat1*").size)
343
+ end
344
+
345
+ def test_update
346
+ data = [
347
+ {:id => 0, :cat => "/cat1/subcat1", :content => "content0"},
348
+ {:id => 1, :cat => "/cat1/subcat2", :content => "content1"},
349
+ {:id => 2, :cat => "/cat1/subcat2", :content => "content2"},
350
+ {:id => 3, :cat => "/cat1/subcat3", :content => "content3"},
351
+ {:id => 4, :cat => "/cat1/subcat4", :content => "content4"},
352
+ {:id => 5, :cat => "/cat2/subcat1", :content => "content5"},
353
+ {:id => 6, :cat => "/cat2/subcat2", :content => "content6"},
354
+ {:id => 7, :cat => "/cat2/subcat3", :content => "content7"},
355
+ {:id => 8, :cat => "/cat2/subcat4", :content => "content8"},
356
+ {:id => 9, :cat => "/cat2/subcat5", :content => "content9"},
357
+ ]
358
+ index = Index.new(:analyzer => WhiteSpaceAnalyzer.new,
359
+ :default_field => :content)
360
+ data.each { |doc| index << doc }
361
+ assert_equal(10, index.size)
362
+ assert_equal("content5", index["5"][:content])
363
+ index.update(5, "content five")
364
+ assert_equal("content five", index["5"][:content])
365
+ assert_equal(nil, index["5"][:extra_content])
366
+ index.update("5", {:cat => "/cat1/subcat6",
367
+ :content => "high five",
368
+ :extra_content => "hello"})
369
+ assert_equal("hello", index["5"][:extra_content])
370
+ assert_equal("high five", index["5"][:content])
371
+ assert_equal("/cat1/subcat6", index["5"][:cat])
372
+ assert_equal("content9", index["9"][:content])
373
+ index.update(Term.new("content", "content9"), {:content => "content nine"})
374
+ assert_equal("content nine", index["9"][:content])
375
+ assert_equal("content0", index["0"][:content])
376
+ assert_equal(nil, index["0"][:extra_content])
377
+ document = index[0]
378
+ document[:content] = "content zero"
379
+ document[:extra_content] = "extra content"
380
+ index.update(0, document)
381
+ assert_equal("content zero", index["0"][:content])
382
+ assert_equal("extra content", index["0"][:extra_content])
383
+ assert_equal(nil, index["1"][:tag])
384
+ assert_equal(nil, index["2"][:tag])
385
+ assert_equal(nil, index["3"][:tag])
386
+ assert_equal(nil, index["4"][:tag])
387
+ index.query_update("id:<5 AND cat:>=/cat1/subcat2", {:tag => "cool"})
388
+ assert_equal("cool", index["1"][:tag])
389
+ assert_equal("cool", index["2"][:tag])
390
+ assert_equal("cool", index["3"][:tag])
391
+ assert_equal("cool", index["4"][:tag])
392
+ assert_equal(4, index.search("tag:cool").size)
393
+ end
292
394
  end
@@ -417,7 +417,6 @@ module IndexReaderCommon
417
417
  ir3.close()
418
418
  end
419
419
 
420
-
421
420
  end
422
421
 
423
422
  class SegmentReaderTest < Test::Unit::TestCase
@@ -618,5 +617,33 @@ class IndexReaderTest < Test::Unit::TestCase
618
617
  ir.close()
619
618
  fs_dir.close()
620
619
  end
620
+
621
+ def test_latest()
622
+ dpath = File.join(File.dirname(__FILE__),
623
+ '../../temp/fsdir')
624
+ fs_dir = Ferret::Store::FSDirectory.new(dpath, true)
625
+
626
+ iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new(), :create => true)
627
+ doc = Document.new
628
+ doc << Field.new("field", "content", Field::Store::YES, Field::Index::TOKENIZED)
629
+ iw << doc
630
+ iw.close()
631
+
632
+ ir = IndexReader.open(fs_dir, false)
633
+ assert(ir.latest?)
634
+
635
+ iw = IndexWriter.new(fs_dir, :analyzer => WhiteSpaceAnalyzer.new())
636
+ doc = Document.new
637
+ doc << Field.new("field", "content2", Field::Store::YES, Field::Index::TOKENIZED)
638
+ iw << doc
639
+ iw.close()
640
+
641
+ assert(!ir.latest?)
642
+
643
+ ir.close()
644
+ ir = IndexReader.open(fs_dir, false)
645
+ assert(ir.latest?)
646
+ ir.close()
647
+ end
621
648
  end
622
649
 
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: ferret
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.1.4
7
- date: 2005-11-01 00:00:00 +09:00
6
+ version: 0.2.0
7
+ date: 2005-11-12 00:00:00 +09:00
8
8
  summary: Ruby indexing library.
9
9
  require_paths:
10
10
  - lib
@@ -35,6 +35,7 @@ files:
35
35
  - MIT-LICENSE
36
36
  - TODO
37
37
  - TUTORIAL
38
+ - ext/Makefile
38
39
  - ext/index_io.c
39
40
  - ext/term_buffer.c
40
41
  - ext/ram_directory.c
@@ -47,6 +48,7 @@ files:
47
48
  - ext/ferret.h
48
49
  - ext/util.c
49
50
  - ext/tags
51
+ - ext/ferret_ext.so
50
52
  - ext/dummy.exe
51
53
  - lib/ferret.rb
52
54
  - lib/ferret/analysis.rb