ferret 0.11.6 → 0.11.8.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. data/README +10 -22
  2. data/RELEASE_CHANGES +137 -0
  3. data/RELEASE_NOTES +60 -0
  4. data/Rakefile +379 -274
  5. data/TODO +100 -8
  6. data/bin/ferret-browser +0 -0
  7. data/ext/BZLIB_blocksort.c +1094 -0
  8. data/ext/BZLIB_bzlib.c +1578 -0
  9. data/ext/BZLIB_compress.c +672 -0
  10. data/ext/BZLIB_crctable.c +104 -0
  11. data/ext/BZLIB_decompress.c +626 -0
  12. data/ext/BZLIB_huffman.c +205 -0
  13. data/ext/BZLIB_randtable.c +84 -0
  14. data/ext/{api.c → STEMMER_api.c} +7 -10
  15. data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
  16. data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
  17. data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
  18. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  19. data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
  20. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  21. data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
  22. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  25. data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
  26. data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
  27. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  29. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  30. data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
  31. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  32. data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
  33. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  34. data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
  35. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  36. data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
  37. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  38. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  39. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  40. data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
  41. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  42. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  43. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  44. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  45. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  46. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  47. data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
  48. data/ext/analysis.c +276 -121
  49. data/ext/analysis.h +190 -143
  50. data/ext/api.h +3 -4
  51. data/ext/array.c +5 -3
  52. data/ext/array.h +52 -43
  53. data/ext/bitvector.c +38 -482
  54. data/ext/bitvector.h +446 -124
  55. data/ext/bzlib.h +282 -0
  56. data/ext/bzlib_private.h +503 -0
  57. data/ext/compound_io.c +23 -22
  58. data/ext/config.h +21 -11
  59. data/ext/document.c +43 -40
  60. data/ext/document.h +31 -21
  61. data/ext/except.c +20 -38
  62. data/ext/except.h +89 -76
  63. data/ext/extconf.rb +3 -2
  64. data/ext/ferret.c +49 -35
  65. data/ext/ferret.h +14 -11
  66. data/ext/field_index.c +262 -0
  67. data/ext/field_index.h +52 -0
  68. data/ext/filter.c +11 -10
  69. data/ext/fs_store.c +65 -47
  70. data/ext/global.c +245 -165
  71. data/ext/global.h +252 -54
  72. data/ext/hash.c +200 -243
  73. data/ext/hash.h +205 -163
  74. data/ext/hashset.c +118 -96
  75. data/ext/hashset.h +110 -82
  76. data/ext/header.h +19 -19
  77. data/ext/helper.c +11 -10
  78. data/ext/helper.h +14 -6
  79. data/ext/index.c +745 -366
  80. data/ext/index.h +503 -529
  81. data/ext/internal.h +1020 -0
  82. data/ext/lang.c +10 -0
  83. data/ext/lang.h +35 -15
  84. data/ext/mempool.c +5 -4
  85. data/ext/mempool.h +30 -22
  86. data/ext/modules.h +35 -7
  87. data/ext/multimapper.c +43 -2
  88. data/ext/multimapper.h +32 -23
  89. data/ext/posh.c +0 -0
  90. data/ext/posh.h +4 -38
  91. data/ext/priorityqueue.c +10 -12
  92. data/ext/priorityqueue.h +33 -21
  93. data/ext/q_boolean.c +22 -9
  94. data/ext/q_const_score.c +3 -2
  95. data/ext/q_filtered_query.c +15 -12
  96. data/ext/q_fuzzy.c +147 -135
  97. data/ext/q_match_all.c +3 -2
  98. data/ext/q_multi_term.c +28 -32
  99. data/ext/q_parser.c +451 -173
  100. data/ext/q_phrase.c +158 -79
  101. data/ext/q_prefix.c +16 -18
  102. data/ext/q_range.c +363 -31
  103. data/ext/q_span.c +130 -141
  104. data/ext/q_term.c +21 -21
  105. data/ext/q_wildcard.c +19 -23
  106. data/ext/r_analysis.c +369 -242
  107. data/ext/r_index.c +421 -434
  108. data/ext/r_qparser.c +142 -92
  109. data/ext/r_search.c +790 -407
  110. data/ext/r_store.c +44 -44
  111. data/ext/r_utils.c +264 -96
  112. data/ext/ram_store.c +29 -23
  113. data/ext/scanner.c +895 -0
  114. data/ext/scanner.h +36 -0
  115. data/ext/scanner_mb.c +6701 -0
  116. data/ext/scanner_utf8.c +4415 -0
  117. data/ext/search.c +210 -87
  118. data/ext/search.h +556 -488
  119. data/ext/similarity.c +17 -16
  120. data/ext/similarity.h +51 -44
  121. data/ext/sort.c +157 -354
  122. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  123. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  124. data/ext/stem_UTF_8_hungarian.h +16 -0
  125. data/ext/stem_UTF_8_romanian.h +16 -0
  126. data/ext/stem_UTF_8_turkish.h +16 -0
  127. data/ext/stopwords.c +287 -278
  128. data/ext/store.c +57 -51
  129. data/ext/store.h +308 -286
  130. data/ext/symbol.c +10 -0
  131. data/ext/symbol.h +23 -0
  132. data/ext/term_vectors.c +14 -293
  133. data/ext/threading.h +22 -22
  134. data/ext/win32.h +12 -4
  135. data/lib/ferret.rb +2 -1
  136. data/lib/ferret/browser.rb +1 -1
  137. data/lib/ferret/field_symbol.rb +94 -0
  138. data/lib/ferret/index.rb +221 -34
  139. data/lib/ferret/number_tools.rb +6 -6
  140. data/lib/ferret/version.rb +3 -0
  141. data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
  142. data/test/test_helper.rb +7 -2
  143. data/test/test_installed.rb +1 -0
  144. data/test/threading/thread_safety_index_test.rb +10 -1
  145. data/test/threading/thread_safety_read_write_test.rb +4 -7
  146. data/test/threading/thread_safety_test.rb +0 -0
  147. data/test/unit/analysis/tc_analyzer.rb +29 -27
  148. data/test/unit/analysis/tc_token_stream.rb +23 -16
  149. data/test/unit/index/tc_index.rb +116 -11
  150. data/test/unit/index/tc_index_reader.rb +27 -27
  151. data/test/unit/index/tc_index_writer.rb +10 -0
  152. data/test/unit/index/th_doc.rb +38 -21
  153. data/test/unit/search/tc_filter.rb +31 -10
  154. data/test/unit/search/tc_index_searcher.rb +6 -0
  155. data/test/unit/search/tm_searcher.rb +53 -1
  156. data/test/unit/store/tc_fs_store.rb +40 -2
  157. data/test/unit/store/tc_ram_store.rb +0 -0
  158. data/test/unit/store/tm_store.rb +0 -0
  159. data/test/unit/store/tm_store_lock.rb +7 -6
  160. data/test/unit/tc_field_symbol.rb +26 -0
  161. data/test/unit/ts_analysis.rb +0 -0
  162. data/test/unit/ts_index.rb +0 -0
  163. data/test/unit/ts_store.rb +0 -0
  164. data/test/unit/ts_utils.rb +0 -0
  165. data/test/unit/utils/tc_number_tools.rb +0 -0
  166. data/test/utils/content_generator.rb +226 -0
  167. metadata +262 -221
  168. data/ext/inc/lang.h +0 -48
  169. data/ext/inc/threading.h +0 -31
  170. data/ext/stem_ISO_8859_1_english.c +0 -1156
  171. data/ext/stem_ISO_8859_1_french.c +0 -1276
  172. data/ext/stem_ISO_8859_1_italian.c +0 -1091
  173. data/ext/stem_ISO_8859_1_norwegian.c +0 -296
  174. data/ext/stem_ISO_8859_1_spanish.c +0 -1119
  175. data/ext/stem_ISO_8859_1_swedish.c +0 -307
  176. data/ext/stem_UTF_8_danish.c +0 -344
  177. data/ext/stem_UTF_8_english.c +0 -1176
  178. data/ext/stem_UTF_8_french.c +0 -1296
  179. data/ext/stem_UTF_8_italian.c +0 -1113
  180. data/ext/stem_UTF_8_norwegian.c +0 -302
  181. data/ext/stem_UTF_8_portuguese.c +0 -1055
  182. data/ext/stem_UTF_8_russian.c +0 -709
  183. data/ext/stem_UTF_8_spanish.c +0 -1137
  184. data/ext/stem_UTF_8_swedish.c +0 -313
  185. data/lib/ferret_version.rb +0 -3
@@ -23,7 +23,8 @@
23
23
  # :include: ../TUTORIAL
24
24
  $: << File.expand_path(File.join(File.dirname(__FILE__), "../ext"))
25
25
  require 'ferret_ext'
26
- require 'ferret_version'
26
+ require 'ferret/version'
27
27
  require 'ferret/document'
28
28
  require 'ferret/index'
29
29
  require 'ferret/field_infos'
30
+ require 'ferret/field_symbol'
@@ -77,7 +77,7 @@ module Ferret::Browser
77
77
  end
78
78
 
79
79
  def tick_or_cross(t)
80
- "<img src=\"/s/i/#{t ?'tick':'cross'}.png\" alt=\"#{t ?'yes':'no'}\"/>"
80
+ "<img src=\"/s/i/#{t ? 'tick' : 'cross'}.png\" alt=\"#{t ? 'yes' : 'no'}\"/>"
81
81
  end
82
82
  end
83
83
 
@@ -0,0 +1,94 @@
1
+ module Ferret
2
+ FIELD_TYPES = %w(integer float string byte).map{|t| t.to_sym}
3
+
4
+ if defined?(BasicObject)
5
+ # Ruby 1.9.x
6
+ class BlankSlate < BasicObject
7
+ end
8
+ else
9
+ # Ruby 1.8.x
10
+ # BlankSlate is a class with no instance methods except for __send__ and
11
+ # __id__. It is useful for creating proxy classes. It is currently used by
12
+ # the FieldSymbol class which is a proxy to the Symbol class
13
+ class BlankSlate
14
+ instance_methods.each { |m| undef_method m unless m =~ /^__|object_id/ }
15
+ end
16
+ end
17
+
18
+ # The FieldSymbolMethods module contains the methods that are added to both
19
+ # the Symbol class and the FieldSymbol class. These methods allow you to set
20
+ # the type of a field easily by calling a method on a symbol.
21
+ #
22
+ # Right now this is only useful for Sorting and grouping, but some day Ferret
23
+ # may have typed fields, in which case these methods will come in handy.
24
+ #
25
+ # The available types are specified in Ferret::FIELD_TYPES.
26
+ #
27
+ # == Examples
28
+ #
29
+ # index.search(query, :sort => :title.string.desc)
30
+ #
31
+ # index.search(query, :sort => [:price.float, :count.integer.desc])
32
+ #
33
+ # index.search(query, :group_by => :catalogue.string)
34
+ #
35
+ # == Note
36
+ #
37
+ # If you set the field type multiple times, the last type specified will be
38
+ # the type used. For example;
39
+ #
40
+ # puts :title.integer.float.byte.string.type.inspect # => :string
41
+ #
42
+ # Calling #desc twice will set desc? to false
43
+ #
44
+ # puts :title.desc? # => false
45
+ # puts :title.desc.desc? # => true
46
+ # puts :title.desc.desc.desc? # => false
47
+ module FieldSymbolMethods
48
+ FIELD_TYPES.each do |method|
49
+ define_method(method) do
50
+ fsym = FieldSymbol.new(self, respond_to?(:desc?) ? desc? : false)
51
+ fsym.type = method
52
+ fsym
53
+ end
54
+ end
55
+
56
+ # Set a field to be a descending field. This only makes sense in sort
57
+ # specifications.
58
+ def desc
59
+ fsym = FieldSymbol.new(self, respond_to?(:desc?) ? !desc? : true)
60
+ fsym.type = type if respond_to? :type
61
+ fsym
62
+ end
63
+
64
+ # Return whether or not this field should be a descending field
65
+ def desc?
66
+ @desc == true
67
+ end
68
+
69
+ # Return the type of this field
70
+ def type
71
+ @type || nil
72
+ end
73
+ end
74
+
75
+ # See FieldSymbolMethods
76
+ class FieldSymbol < BlankSlate
77
+ include FieldSymbolMethods
78
+ def initialize(symbol, desc = false)
79
+ @symbol = symbol
80
+ @desc = desc
81
+ end
82
+
83
+ def method_missing(method, *args)
84
+ @symbol.__send__(method, *args)
85
+ end
86
+
87
+ attr_writer :type, :desc
88
+ end
89
+ end
90
+
91
+ # See FieldSymbolMethods
92
+ class Symbol
93
+ include Ferret::FieldSymbolMethods
94
+ end
@@ -1,20 +1,6 @@
1
1
  require 'monitor'
2
2
 
3
3
  module Ferret::Index
4
- module SynchroLockMixin
5
- def synchrolock
6
- trys = 5
7
- begin
8
- synchronize {yield}
9
- rescue Ferret::Store::Lock::LockError => e
10
- if (trys -= 1) <= 0
11
- raise e
12
- else
13
- retry
14
- end
15
- end
16
- end
17
- end
18
4
  # This is a simplified interface to the index. See the TUTORIAL for more
19
5
  # information on how to use this class.
20
6
  class Index
@@ -40,7 +26,7 @@ module Ferret::Index
40
26
  # default_input_field:: Default: "id". This specifies the default field
41
27
  # that will be used when you add a simple string
42
28
  # to the index using #add_document or <<.
43
- # id_field: Default: "id". This field is as the field to
29
+ # id_field:: Default: "id". This field is used as the field to
44
30
  # search when doing searches on a term. For
45
31
  # example, if you do a lookup by term "cat", ie
46
32
  # index["cat"], this will be the field that is
@@ -75,8 +61,16 @@ module Ferret::Index
75
61
  # Directory object to this class and you want
76
62
  # Index to close it when it is closed itself then
77
63
  # set this to true.
78
- #
79
- # Some examples;
64
+ # use_typed_range_query:: Default: true. Use TypedRangeQuery instead of
65
+ # the standard RangeQuery when parsing
66
+ # range queries. This is useful if you have number
67
+ # fields which you want to perform range queries
68
+ # on. You won't need to pad or normalize the data
69
+ # in the field in any way to get correct results.
70
+ # However, performance will be a lot slower for
71
+ # large indexes, hence the default.
72
+ #
73
+ # == Examples
80
74
  #
81
75
  # index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
82
76
  #
@@ -130,7 +124,7 @@ module Ferret::Index
130
124
  @dir = RAMDirectory.new
131
125
  end
132
126
 
133
- @dir.extend(MonitorMixin).extend(SynchroLockMixin)
127
+ @dir.extend(MonitorMixin) unless @dir.kind_of? MonitorMixin
134
128
  options[:dir] = @dir
135
129
  options[:lock_retry_time]||= 2
136
130
  @options = options
@@ -138,6 +132,9 @@ module Ferret::Index
138
132
  IndexWriter.new(options).close
139
133
  end
140
134
  options[:analyzer]||= Ferret::Analysis::StandardAnalyzer.new
135
+ if options[:use_typed_range_query].nil?
136
+ options[:use_typed_range_query] = true
137
+ end
141
138
 
142
139
  @searcher = nil
143
140
  @writer = nil
@@ -264,7 +261,7 @@ module Ferret::Index
264
261
  #
265
262
  # See FieldInfos for more information on how to set field properties.
266
263
  def add_document(doc, analyzer = nil)
267
- @dir.synchrolock do
264
+ @dir.synchronize do
268
265
  ensure_writer_open()
269
266
  if doc.is_a?(String) or doc.is_a?(Array)
270
267
  doc = {@default_input_field => doc}
@@ -281,9 +278,7 @@ module Ferret::Index
281
278
  else
282
279
  id = doc[@key].to_s
283
280
  if id
284
- ensure_writer_open()
285
281
  @writer.delete(@key, id)
286
- @writer.commit
287
282
  end
288
283
  end
289
284
  end
@@ -397,6 +392,50 @@ module Ferret::Index
397
392
  end
398
393
  end
399
394
 
395
+ # Run a query through the Searcher on the index, ignoring scoring and
396
+ # starting at +:start_doc+ and stopping when +:limit+ matches have been
397
+ # found. It returns an array of the matching document numbers.
398
+ #
399
+ # There is a big performance advantage when using this search method on a
400
+ # very large index when there are potentially thousands of matching
401
+ # documents and you only want say 50 of them. The other search methods need
402
+ # to look at every single match to decide which one has the highest score.
403
+ # This search method just needs to find +:limit+ number of matches before
404
+ # it returns.
405
+ #
406
+ # === Options
407
+ #
408
+ # start_doc:: Default: 0. The start document to start the search from.
409
+ # NOTE very carefully that this is not the same as the
410
+ # +:offset+ parameter used in the other search methods
411
+ # which refers to the offset in the result-set. This is the
412
+ # document to start the scan from. So if you are scanning
413
+ # through the index in increments of 50 documents at a time
414
+ # you need to use the last matched doc in the previous
415
+ # search to start your next search. See the example below.
416
+ # limit:: Default: 50. This is the number of results you want
417
+ # returned, also called the page size. Set +:limit+ to
418
+ # +:all+ to return all results.
419
+ # TODO: add option to return loaded documents instead
420
+ #
421
+ # === Example
422
+ #
423
+ # start_doc = 0
424
+ # begin
425
+ # results = @searcher.scan(query, :start_doc => start_doc)
426
+ # yield results # or do something with them
427
+ # start_doc = results.last
428
+ # # start_doc will be nil now if results is empty, ie no more matches
429
+ # end while start_doc
430
+ def scan(query, options = {})
431
+ @dir.synchronize do
432
+ ensure_searcher_open()
433
+ query = do_process_query(query)
434
+
435
+ @searcher.scan(query, options)
436
+ end
437
+ end
438
+
400
439
  # Retrieves a document/documents from the index. The method for retrieval
401
440
  # depends on the type of the argument passed.
402
441
  #
@@ -408,7 +447,7 @@ module Ferret::Index
408
447
  #
409
448
  # If +arg+ is a String then search for the first document with +arg+ in
410
449
  # the +id+ field. The +id+ field is either :id or whatever you set
411
- # :id_field parameter to when you create the Index object.
450
+ # +:id_field+ parameter to when you create the Index object.
412
451
  def doc(*arg)
413
452
  @dir.synchronize do
414
453
  id = arg[0]
@@ -424,6 +463,38 @@ module Ferret::Index
424
463
  end
425
464
  alias :[] :doc
426
465
 
466
+ # Retrieves the term_vector for a document. The document can be referenced
467
+ # by either a string id to match the id field or an integer corresponding
468
+ # to Ferret's document number.
469
+ #
470
+ # See Ferret::Index::IndexReader#term_vector
471
+ def term_vector(id, field)
472
+ @dir.synchronize do
473
+ ensure_reader_open()
474
+ if id.kind_of?(String) or id.kind_of?(Symbol)
475
+ term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)
476
+ if term_doc_enum.next?
477
+ id = term_doc_enum.doc
478
+ else
479
+ return nil
480
+ end
481
+ end
482
+ return @reader.term_vector(id, field)
483
+ end
484
+ end
485
+
486
+ # iterate through all documents in the index. This method preloads the
487
+ # documents so you don't need to call #load on the document to load all the
488
+ # fields.
489
+ def each
490
+ @dir.synchronize do
491
+ ensure_reader_open
492
+ (0...@reader.max_doc).each do |i|
493
+ yield @reader[i].load unless @reader.deleted?(i)
494
+ end
495
+ end
496
+ end
497
+
427
498
  # Deletes a document/documents from the index. The method for determining
428
499
  # the document to delete depends on the type of the argument passed.
429
500
  #
@@ -431,18 +502,28 @@ module Ferret::Index
431
502
  # document number. Will raise an error if the document does not exist.
432
503
  #
433
504
  # If +arg+ is a String then search for the documents with +arg+ in the
434
- # +id+ field. The +id+ field is either :id or whatever you set :id_field
505
+ # +id+ field. The +id+ field is either :id or whatever you set +:id_field+
435
506
  # parameter to when you create the Index object. Will fail quietly if the
436
507
  # no document exists.
508
+ #
509
+ # If +arg+ is a Hash or an Array then a batch delete will be performed.
510
+ # If +arg+ is an Array then it will be considered an array of +id+'s. If
511
+ # it is a Hash, then its keys will be used instead as the Array of
512
+ # document +id+'s. If the +id+ is an Integer then it is considered a
513
+ # Ferret document number and the corresponding document will be deleted.
514
+ # If the +id+ is a String or a Symbol then the +id+ will be considered a
515
+ # term and the documents that contain that term in the +:id_field+ will be
516
+ # deleted.
437
517
  def delete(arg)
438
- @dir.synchrolock do
439
- ensure_writer_open()
518
+ @dir.synchronize do
440
519
  if arg.is_a?(String) or arg.is_a?(Symbol)
441
520
  ensure_writer_open()
442
521
  @writer.delete(@id_field, arg.to_s)
443
522
  elsif arg.is_a?(Integer)
444
523
  ensure_reader_open()
445
524
  cnt = @reader.delete(arg)
525
+ elsif arg.is_a?(Hash) or arg.is_a?(Array)
526
+ batch_delete(arg)
446
527
  else
447
528
  raise ArgumentError, "Cannot delete for arg of type #{arg.class}"
448
529
  end
@@ -457,7 +538,7 @@ module Ferret::Index
457
538
  # string (in which case it is parsed by the standard query parser)
458
539
  # or an actual query object.
459
540
  def query_delete(query)
460
- @dir.synchrolock do
541
+ @dir.synchronize do
461
542
  ensure_writer_open()
462
543
  ensure_searcher_open()
463
544
  query = do_process_query(query)
@@ -479,13 +560,14 @@ module Ferret::Index
479
560
  # Update the document referenced by the document number +id+ if +id+ is an
480
561
  # integer or all of the documents which have the term +id+ if +id+ is a
481
562
  # term..
563
+ # To update a set of documents in one pass, which is faster, see #batch_update.
482
564
  #
483
565
  # id:: The number of the document to update. Can also be a string
484
566
  # representing the value in the +id+ field. Also consider using
485
567
  # the :key attribute.
486
568
  # new_doc:: The document to replace the old document with
487
569
  def update(id, new_doc)
488
- @dir.synchrolock do
570
+ @dir.synchronize do
489
571
  ensure_writer_open()
490
572
  delete(id)
491
573
  if id.is_a?(String) or id.is_a?(Symbol)
@@ -498,6 +580,73 @@ module Ferret::Index
498
580
  end
499
581
  end
500
582
 
583
+ # Batch updates the documents in an index. You can pass either a Hash or
584
+ # an Array.
585
+ #
586
+ # === Array (recommended)
587
+ #
588
+ # If you pass an Array then each value needs to be a Document or a Hash
589
+ # and each of those documents must have an +:id_field+ which will be used
590
+ # to delete the old document that this document is replacing.
591
+ #
592
+ # === Hash
593
+ #
594
+ # If you pass a Hash then the keys of the Hash will be considered the
595
+ # +id+'s and the values will be the new documents to replace the old ones
596
+ # with. If the +id+ is an Integer then it is considered a Ferret document
597
+ # number and the corresponding document will be deleted. If the +id+ is a
598
+ # String or a Symbol then the +id+ will be considered a term and the
599
+ # documents that contain that term in the +:id_field+ will be deleted.
600
+ #
601
+ # Note: No error will be raised if the document does not currently
602
+ # exist. A new document will simply be created.
603
+ #
604
+ # == Examples
605
+ #
606
+ # # will replace the documents with the +id+'s id:133 and id:253
607
+ # @index.batch_update({
608
+ # '133' => {:id => '133', :content => 'yada yada yada'},
609
+ # '253' => {:id => '253', :content => 'bla bla bal'}
610
+ # })
611
+ #
612
+ # # will replace the documents with the Ferret Document numbers 2 and 92
613
+ # @index.batch_update({
614
+ # 2 => {:id => '133', :content => 'yada yada yada'},
615
+ # 92 => {:id => '253', :content => 'bla bla bal'}
616
+ # })
617
+ #
618
+ # # will replace the documents with the +id+'s id:133 and id:253
619
+ # # this is recommended as it guarantees no duplicate keys
620
+ # @index.batch_update([
621
+ # {:id => '133', :content => 'yada yada yada'},
622
+ # {:id => '253', :content => 'bla bla bal'}
623
+ # ])
624
+ #
625
+ # docs:: A Hash of id/document pairs or an Array of documents. The set of documents to be updated
626
+ def batch_update(docs)
627
+ @dir.synchronize do
628
+ ids = values = nil
629
+ case docs
630
+ when Array
631
+ ids = docs.collect{|doc| doc[@id_field].to_s}
632
+ if ids.include?(nil)
633
+ raise ArgumentError, "all documents must have an #{@id_field} "
634
+ "field when doing a batch update"
635
+ end
636
+ when Hash
637
+ ids = docs.keys
638
+ docs = docs.values
639
+ else
640
+ raise ArgumentError, "must pass Hash or Array, not #{docs.class}"
641
+ end
642
+ batch_delete(ids)
643
+ ensure_writer_open()
644
+ docs.each {|new_doc| @writer << new_doc }
645
+ flush()
646
+ end
647
+ end
648
+
649
+
501
650
  # Update all the documents returned by the query.
502
651
  #
503
652
  # query:: The query to find documents you wish to update. Can either be
@@ -523,12 +672,12 @@ module Ferret::Index
523
672
  # #=> {:id => "28", :title => "My Oh My", :artist => "David Gray"}
524
673
  #
525
674
  def query_update(query, new_val)
526
- @dir.synchrolock do
675
+ @dir.synchronize do
527
676
  ensure_writer_open()
528
677
  ensure_searcher_open()
529
678
  docs_to_add = []
530
679
  query = do_process_query(query)
531
- @searcher.search_each(query) do |id, score|
680
+ @searcher.search_each(query, :limit => :all) do |id, score|
532
681
  document = @searcher[id].load
533
682
  if new_val.is_a?(Hash)
534
683
  document.merge!(new_val)
@@ -568,7 +717,8 @@ module Ferret::Index
568
717
  end
569
718
  @reader.commit
570
719
  elsif @writer
571
- @writer.commit
720
+ @writer.close
721
+ @writer = nil
572
722
  end
573
723
  end
574
724
  end
@@ -577,7 +727,7 @@ module Ferret::Index
577
727
  # optimizes the index. This should only be called when the index will no
578
728
  # longer be updated very often, but will be read a lot.
579
729
  def optimize()
580
- @dir.synchrolock do
730
+ @dir.synchronize do
581
731
  ensure_writer_open()
582
732
  @writer.optimize()
583
733
  @writer.close()
@@ -605,7 +755,7 @@ module Ferret::Index
605
755
  #
606
756
  # After this completes, the index is optimized.
607
757
  def add_indexes(indexes)
608
- @dir.synchrolock do
758
+ @dir.synchronize do
609
759
  ensure_writer_open()
610
760
  indexes = [indexes].flatten # make sure we have an array
611
761
  return if indexes.size == 0 # nothing to do
@@ -648,7 +798,7 @@ module Ferret::Index
648
798
  elsif directory.is_a?(Ferret::Store::Directory)
649
799
  @dir = directory
650
800
  end
651
- @dir.extend(MonitorMixin).extend(SynchroLockMixin)
801
+ @dir.extend(MonitorMixin) unless @dir.kind_of? MonitorMixin
652
802
  @options[:dir] = @dir
653
803
  @options[:create_if_missing] = true
654
804
  add_indexes([old_dir])
@@ -690,7 +840,7 @@ module Ferret::Index
690
840
  # Returns the field_infos object so that you can add new fields to the
691
841
  # index.
692
842
  def field_infos
693
- @dir.synchrolock do
843
+ @dir.synchronize do
694
844
  ensure_writer_open()
695
845
  return @writer.field_infos
696
846
  end
@@ -778,6 +928,43 @@ module Ferret::Index
778
928
  @writer = nil
779
929
  end
780
930
  end
931
+
932
+ # If +docs+ is a Hash or an Array then a batch delete will be performed.
933
+ # If +docs+ is an Array then it will be considered an array of +id+'s. If
934
+ # it is a Hash, then its keys will be used instead as the Array of
935
+ # document +id+'s. If the +id+ is an Integer then it is considered a
936
+ # Ferret document number and the corresponding document will be deleted.
937
+ # If the +id+ is a String or a Symbol then the +id+ will be considered a
938
+ # term and the documents that contain that term in the +:id_field+ will
939
+ # be deleted.
940
+ #
941
+ # docs:: An Array of docs to be deleted, or a Hash (in which case the keys
942
+ # are used)
943
+ def batch_delete(docs)
944
+ docs = docs.keys if docs.is_a?(Hash)
945
+ raise ArgumentError, "must pass Array or Hash" unless docs.is_a? Array
946
+ ids = []
947
+ terms = []
948
+ docs.each do |doc|
949
+ case doc
950
+ when String then terms << doc
951
+ when Symbol then terms << doc.to_s
952
+ when Integer then ids << doc
953
+ else
954
+ raise ArgumentError, "Cannot delete for arg of type #{id.class}"
955
+ end
956
+ end
957
+ if ids.size > 0
958
+ ensure_reader_open
959
+ ids.each {|id| @reader.delete(id)}
960
+ end
961
+ if terms.size > 0
962
+ ensure_writer_open()
963
+ @writer.delete(@id_field, terms)
964
+ end
965
+ return self
966
+ end
967
+
781
968
  end
782
969
  end
783
970