jk-ferret 0.11.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. data/CHANGELOG +24 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README +90 -0
  4. data/RELEASE_CHANGES +137 -0
  5. data/RELEASE_NOTES +60 -0
  6. data/Rakefile +443 -0
  7. data/TODO +109 -0
  8. data/TUTORIAL +231 -0
  9. data/bin/ferret-browser +79 -0
  10. data/ext/BZLIB_blocksort.c +1094 -0
  11. data/ext/BZLIB_bzlib.c +1578 -0
  12. data/ext/BZLIB_compress.c +672 -0
  13. data/ext/BZLIB_crctable.c +104 -0
  14. data/ext/BZLIB_decompress.c +626 -0
  15. data/ext/BZLIB_huffman.c +205 -0
  16. data/ext/BZLIB_randtable.c +84 -0
  17. data/ext/STEMMER_api.c +66 -0
  18. data/ext/STEMMER_libstemmer.c +93 -0
  19. data/ext/STEMMER_stem_ISO_8859_1_danish.c +337 -0
  20. data/ext/STEMMER_stem_ISO_8859_1_dutch.c +624 -0
  21. data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
  22. data/ext/STEMMER_stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
  24. data/ext/STEMMER_stem_ISO_8859_1_german.c +503 -0
  25. data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
  26. data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
  27. data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
  28. data/ext/STEMMER_stem_ISO_8859_1_porter.c +749 -0
  29. data/ext/STEMMER_stem_ISO_8859_1_portuguese.c +1017 -0
  30. data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
  31. data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
  32. data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
  33. data/ext/STEMMER_stem_KOI8_R_russian.c +700 -0
  34. data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
  35. data/ext/STEMMER_stem_UTF_8_dutch.c +634 -0
  36. data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
  37. data/ext/STEMMER_stem_UTF_8_finnish.c +768 -0
  38. data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
  39. data/ext/STEMMER_stem_UTF_8_german.c +509 -0
  40. data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
  41. data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
  42. data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
  43. data/ext/STEMMER_stem_UTF_8_porter.c +755 -0
  44. data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
  45. data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
  46. data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
  47. data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
  48. data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
  49. data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
  50. data/ext/STEMMER_utilities.c +478 -0
  51. data/ext/analysis.c +1710 -0
  52. data/ext/analysis.h +266 -0
  53. data/ext/api.h +26 -0
  54. data/ext/array.c +125 -0
  55. data/ext/array.h +62 -0
  56. data/ext/bitvector.c +96 -0
  57. data/ext/bitvector.h +594 -0
  58. data/ext/bzlib.h +282 -0
  59. data/ext/bzlib_private.h +503 -0
  60. data/ext/compound_io.c +384 -0
  61. data/ext/config.h +52 -0
  62. data/ext/document.c +159 -0
  63. data/ext/document.h +63 -0
  64. data/ext/except.c +102 -0
  65. data/ext/except.h +176 -0
  66. data/ext/extconf.rb +15 -0
  67. data/ext/ferret.c +416 -0
  68. data/ext/ferret.h +94 -0
  69. data/ext/field_index.c +262 -0
  70. data/ext/field_index.h +52 -0
  71. data/ext/filter.c +157 -0
  72. data/ext/fs_store.c +493 -0
  73. data/ext/global.c +458 -0
  74. data/ext/global.h +302 -0
  75. data/ext/hash.c +524 -0
  76. data/ext/hash.h +515 -0
  77. data/ext/hashset.c +192 -0
  78. data/ext/hashset.h +215 -0
  79. data/ext/header.h +58 -0
  80. data/ext/helper.c +63 -0
  81. data/ext/helper.h +21 -0
  82. data/ext/index.c +6804 -0
  83. data/ext/index.h +935 -0
  84. data/ext/internal.h +1019 -0
  85. data/ext/lang.c +10 -0
  86. data/ext/lang.h +68 -0
  87. data/ext/libstemmer.h +79 -0
  88. data/ext/mempool.c +88 -0
  89. data/ext/mempool.h +43 -0
  90. data/ext/modules.h +190 -0
  91. data/ext/multimapper.c +351 -0
  92. data/ext/multimapper.h +60 -0
  93. data/ext/posh.c +1006 -0
  94. data/ext/posh.h +973 -0
  95. data/ext/priorityqueue.c +149 -0
  96. data/ext/priorityqueue.h +155 -0
  97. data/ext/q_boolean.c +1621 -0
  98. data/ext/q_const_score.c +162 -0
  99. data/ext/q_filtered_query.c +212 -0
  100. data/ext/q_fuzzy.c +280 -0
  101. data/ext/q_match_all.c +149 -0
  102. data/ext/q_multi_term.c +673 -0
  103. data/ext/q_parser.c +3103 -0
  104. data/ext/q_phrase.c +1206 -0
  105. data/ext/q_prefix.c +98 -0
  106. data/ext/q_range.c +682 -0
  107. data/ext/q_span.c +2390 -0
  108. data/ext/q_term.c +337 -0
  109. data/ext/q_wildcard.c +167 -0
  110. data/ext/r_analysis.c +2626 -0
  111. data/ext/r_index.c +3468 -0
  112. data/ext/r_qparser.c +635 -0
  113. data/ext/r_search.c +4490 -0
  114. data/ext/r_store.c +513 -0
  115. data/ext/r_utils.c +1131 -0
  116. data/ext/ram_store.c +476 -0
  117. data/ext/scanner.c +895 -0
  118. data/ext/scanner.h +36 -0
  119. data/ext/scanner_mb.c +6701 -0
  120. data/ext/scanner_utf8.c +4415 -0
  121. data/ext/search.c +1864 -0
  122. data/ext/search.h +953 -0
  123. data/ext/similarity.c +151 -0
  124. data/ext/similarity.h +89 -0
  125. data/ext/sort.c +786 -0
  126. data/ext/stem_ISO_8859_1_danish.h +16 -0
  127. data/ext/stem_ISO_8859_1_dutch.h +16 -0
  128. data/ext/stem_ISO_8859_1_english.h +16 -0
  129. data/ext/stem_ISO_8859_1_finnish.h +16 -0
  130. data/ext/stem_ISO_8859_1_french.h +16 -0
  131. data/ext/stem_ISO_8859_1_german.h +16 -0
  132. data/ext/stem_ISO_8859_1_hungarian.h +16 -0
  133. data/ext/stem_ISO_8859_1_italian.h +16 -0
  134. data/ext/stem_ISO_8859_1_norwegian.h +16 -0
  135. data/ext/stem_ISO_8859_1_porter.h +16 -0
  136. data/ext/stem_ISO_8859_1_portuguese.h +16 -0
  137. data/ext/stem_ISO_8859_1_spanish.h +16 -0
  138. data/ext/stem_ISO_8859_1_swedish.h +16 -0
  139. data/ext/stem_ISO_8859_2_romanian.h +16 -0
  140. data/ext/stem_KOI8_R_russian.h +16 -0
  141. data/ext/stem_UTF_8_danish.h +16 -0
  142. data/ext/stem_UTF_8_dutch.h +16 -0
  143. data/ext/stem_UTF_8_english.h +16 -0
  144. data/ext/stem_UTF_8_finnish.h +16 -0
  145. data/ext/stem_UTF_8_french.h +16 -0
  146. data/ext/stem_UTF_8_german.h +16 -0
  147. data/ext/stem_UTF_8_hungarian.h +16 -0
  148. data/ext/stem_UTF_8_italian.h +16 -0
  149. data/ext/stem_UTF_8_norwegian.h +16 -0
  150. data/ext/stem_UTF_8_porter.h +16 -0
  151. data/ext/stem_UTF_8_portuguese.h +16 -0
  152. data/ext/stem_UTF_8_romanian.h +16 -0
  153. data/ext/stem_UTF_8_russian.h +16 -0
  154. data/ext/stem_UTF_8_spanish.h +16 -0
  155. data/ext/stem_UTF_8_swedish.h +16 -0
  156. data/ext/stem_UTF_8_turkish.h +16 -0
  157. data/ext/stopwords.c +410 -0
  158. data/ext/store.c +698 -0
  159. data/ext/store.h +799 -0
  160. data/ext/symbol.c +10 -0
  161. data/ext/symbol.h +23 -0
  162. data/ext/term_vectors.c +73 -0
  163. data/ext/threading.h +31 -0
  164. data/ext/win32.h +62 -0
  165. data/lib/ferret.rb +30 -0
  166. data/lib/ferret/browser.rb +246 -0
  167. data/lib/ferret/browser/s/global.js +192 -0
  168. data/lib/ferret/browser/s/style.css +148 -0
  169. data/lib/ferret/browser/views/document/list.rhtml +49 -0
  170. data/lib/ferret/browser/views/document/show.rhtml +27 -0
  171. data/lib/ferret/browser/views/error/index.rhtml +7 -0
  172. data/lib/ferret/browser/views/help/index.rhtml +8 -0
  173. data/lib/ferret/browser/views/home/index.rhtml +29 -0
  174. data/lib/ferret/browser/views/layout.rhtml +22 -0
  175. data/lib/ferret/browser/views/term-vector/index.rhtml +4 -0
  176. data/lib/ferret/browser/views/term/index.rhtml +199 -0
  177. data/lib/ferret/browser/views/term/termdocs.rhtml +1 -0
  178. data/lib/ferret/browser/webrick.rb +14 -0
  179. data/lib/ferret/document.rb +130 -0
  180. data/lib/ferret/field_infos.rb +44 -0
  181. data/lib/ferret/field_symbol.rb +87 -0
  182. data/lib/ferret/index.rb +973 -0
  183. data/lib/ferret/number_tools.rb +157 -0
  184. data/lib/ferret/version.rb +3 -0
  185. data/setup.rb +1555 -0
  186. data/test/long_running/largefile/tc_largefile.rb +46 -0
  187. data/test/test_all.rb +5 -0
  188. data/test/test_helper.rb +29 -0
  189. data/test/test_installed.rb +1 -0
  190. data/test/threading/number_to_spoken.rb +132 -0
  191. data/test/threading/thread_safety_index_test.rb +88 -0
  192. data/test/threading/thread_safety_read_write_test.rb +73 -0
  193. data/test/threading/thread_safety_test.rb +133 -0
  194. data/test/unit/analysis/tc_analyzer.rb +550 -0
  195. data/test/unit/analysis/tc_token_stream.rb +653 -0
  196. data/test/unit/index/tc_index.rb +867 -0
  197. data/test/unit/index/tc_index_reader.rb +699 -0
  198. data/test/unit/index/tc_index_writer.rb +447 -0
  199. data/test/unit/index/th_doc.rb +332 -0
  200. data/test/unit/query_parser/tc_query_parser.rb +238 -0
  201. data/test/unit/search/tc_filter.rb +156 -0
  202. data/test/unit/search/tc_fuzzy_query.rb +147 -0
  203. data/test/unit/search/tc_index_searcher.rb +67 -0
  204. data/test/unit/search/tc_multi_searcher.rb +128 -0
  205. data/test/unit/search/tc_multiple_search_requests.rb +58 -0
  206. data/test/unit/search/tc_search_and_sort.rb +179 -0
  207. data/test/unit/search/tc_sort.rb +49 -0
  208. data/test/unit/search/tc_sort_field.rb +27 -0
  209. data/test/unit/search/tc_spans.rb +190 -0
  210. data/test/unit/search/tm_searcher.rb +436 -0
  211. data/test/unit/store/tc_fs_store.rb +115 -0
  212. data/test/unit/store/tc_ram_store.rb +35 -0
  213. data/test/unit/store/tm_store.rb +34 -0
  214. data/test/unit/store/tm_store_lock.rb +68 -0
  215. data/test/unit/tc_document.rb +81 -0
  216. data/test/unit/tc_field_symbol.rb +26 -0
  217. data/test/unit/ts_analysis.rb +2 -0
  218. data/test/unit/ts_index.rb +2 -0
  219. data/test/unit/ts_largefile.rb +4 -0
  220. data/test/unit/ts_query_parser.rb +2 -0
  221. data/test/unit/ts_search.rb +2 -0
  222. data/test/unit/ts_store.rb +2 -0
  223. data/test/unit/ts_utils.rb +2 -0
  224. data/test/unit/utils/tc_bit_vector.rb +295 -0
  225. data/test/unit/utils/tc_number_tools.rb +117 -0
  226. data/test/unit/utils/tc_priority_queue.rb +106 -0
  227. data/test/utils/content_generator.rb +226 -0
  228. metadata +319 -0
data/lib/ferret/index.rb @@ -0,0 +1,973 @@
1
+ require 'monitor'
2
+
3
+ module Ferret::Index
4
+ # This is a simplified interface to the index. See the TUTORIAL for more
5
+ # information on how to use this class.
6
+ class Index
7
+ include MonitorMixin
8
+
9
+ include Ferret::Store
10
+ include Ferret::Search
11
+
12
+ attr_reader :options
13
+
14
+ # If you create an Index without any options, it'll simply create an index
15
+ # in memory. But this class is highly configurable and every option that
16
+ # you can supply to IndexWriter and QueryParser, you can also set here.
17
+ # Please look at the options for the constructors to these classes.
18
+ #
19
+ # === Options
20
+ #
21
+ # See;
22
+ #
23
+ # * QueryParser
24
+ # * IndexWriter
25
+ #
26
+ # default_input_field:: Default: "id". This specifies the default field
27
+ # that will be used when you add a simple string
28
+ # to the index using #add_document or <<.
29
+ # id_field:: Default: "id". This field is used as the field to
30
+ # search when doing searches on a term. For
31
+ # example, if you do a lookup by term "cat", ie
32
+ # index["cat"], this will be the field that is
33
+ # searched.
34
+ # key:: Default: nil. Expert: This should only be used
35
+ # if you really know what you are doing. Basically
36
+ # you can set a field or an array of fields to be
37
+ # the key for the index. So if you add a document
38
+ # with the same key as an existing document, the
39
+ # existing document will be replaced by the new
40
+ # object. Using a multiple field key will slow
41
+ # down indexing so it should not be done if
42
+ # performance is a concern. A single field key (or
43
+ # id) should be fine, however. Also, you must make
44
+ # sure that your key/keys are either untokenized
45
+ # or that they are not broken up by the analyzer.
46
+ # auto_flush:: Default: false. Set this option to true if you
47
+ # want the index automatically flushed every time
48
+ # you do a write (includes delete) to the index.
49
+ # This is useful if you have multiple processes
50
+ # accessing the index and you don't want lock
51
+ # errors. Setting :auto_flush to true has a huge
52
+ # performance impact so don't use it if you are
53
+ # concerned about performance. In that case you
54
+ # should think about setting up a DRb indexing
55
+ # service.
56
+ # lock_retry_time:: Default: 2 seconds. This parameter specifies how
57
+ # long to wait before retrying to obtain the
58
+ # commit lock when detecting if the IndexReader is
59
+ # at the latest version.
60
+ # close_dir:: Default: false. If you explicitly pass a
61
+ # Directory object to this class and you want
62
+ # the Index to close it when the Index itself is closed, then
63
+ # set this to true.
64
+ # use_typed_range_query:: Default: true. Use TypedRangeQuery instead of
65
+ # the standard RangeQuery when parsing
66
+ # range queries. This is useful if you have number
67
+ # fields which you want to perform range queries
68
+ # on. You won't need to pad or normalize the data
69
+ # in the field in any way to get correct results.
70
+ # However, performance will be a lot slower for
71
+ # large indexes, hence the default.
72
+ #
73
+ # == Examples
74
+ #
75
+ # index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
76
+ #
77
+ # index = Index::Index.new(:path => '/path/to/index',
78
+ # :create_if_missing => false,
79
+ # :auto_flush => true)
80
+ #
81
+ # index = Index::Index.new(:dir => directory,
82
+ # :default_slop => 2,
83
+ # :handle_parse_errors => false)
84
+ #
85
+ # You can also pass a block if you like. The index will be yielded and
86
+ # closed at the end of the block. For example;
87
+ #
88
+ # Ferret::I.new() do |index|
89
+ # # do stuff with index. Most of your actions will be cached.
90
+ # end
91
+ def initialize(options = {}, &block)
92
+ super()
93
+
94
+ if options[:key]
95
+ @key = options[:key]
96
+ if @key.is_a?(Array)
97
+ @key = @key.flatten.map {|k| k.to_s.intern}
98
+ end
99
+ else
100
+ @key = nil
101
+ end
102
+
103
+ if (fi = options[:field_infos]).is_a?(String)
104
+ options[:field_infos] = FieldInfos.load(fi)
105
+ end
106
+
107
+ @close_dir = options[:close_dir]
108
+ if options[:dir].is_a?(String)
109
+ options[:path] = options[:dir]
110
+ end
111
+ if options[:path]
112
+ @close_dir = true
113
+ begin
114
+ @dir = FSDirectory.new(options[:path], options[:create])
115
+ rescue IOError => io
116
+ @dir = FSDirectory.new(options[:path],
117
+ options[:create_if_missing] != false)
118
+ end
119
+ elsif options[:dir]
120
+ @dir = options[:dir]
121
+ else
122
+ options[:create] = true # this should always be true for a new RAMDir
123
+ @close_dir = true
124
+ @dir = RAMDirectory.new
125
+ end
126
+
127
+ @dir.extend(MonitorMixin) unless @dir.kind_of? MonitorMixin
128
+ options[:dir] = @dir
129
+ options[:lock_retry_time]||= 2
130
+ @options = options
131
+ if (!@dir.exists?("segments")) || options[:create]
132
+ IndexWriter.new(options).close
133
+ end
134
+ options[:analyzer]||= Ferret::Analysis::StandardAnalyzer.new
135
+ if options[:use_typed_range_query].nil?
136
+ options[:use_typed_range_query] = true
137
+ end
138
+
139
+ @searcher = nil
140
+ @writer = nil
141
+ @reader = nil
142
+
143
+ @options.delete(:create) # only create the first time if at all
144
+ @auto_flush = @options[:auto_flush] || false
145
+ if (@options[:id_field].nil? and @key.is_a?(Symbol))
146
+ @id_field = @key
147
+ else
148
+ @id_field = @options[:id_field] || :id
149
+ end
150
+ @default_field = (@options[:default_field]||= :*)
151
+ @default_input_field = options[:default_input_field] || @id_field
152
+
153
+ if @default_input_field.respond_to?(:intern)
154
+ @default_input_field = @default_input_field.intern
155
+ end
156
+ @open = true
157
+ @qp = nil
158
+ if block
159
+ yield self
160
+ self.close
161
+ end
162
+ end
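A minimal usage sketch of the constructor options documented above; the path, key and field names are only illustrative, not part of the gem:

    require 'rubygems'
    require 'ferret'

    # On-disk index keyed by :id, so re-adding a document with an existing
    # :id replaces the old one; :auto_flush writes changes out immediately.
    index = Ferret::Index::Index.new(:path       => '/tmp/example_index',
                                     :key        => :id,
                                     :auto_flush => true)
    index << {:id => '1', :content => 'first version'}
    index << {:id => '1', :content => 'second version'}  # replaces the first

    # Block form: the index is yielded and closed at the end of the block.
    Ferret::I.new(:path => '/tmp/example_index') do |i|
      puts i.size
    end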
163
+
164
+ # Returns an array of strings with the matches highlighted. The +query+ can
165
+ # either be a query String or a Ferret::Search::Query object. The doc_id is
166
+ # the id of the document you want to highlight (usually returned by the
167
+ # search methods). There are also a number of options you can pass;
168
+ #
169
+ # === Options
170
+ #
171
+ # field:: Default: @options[:default_field]. The default_field
172
+ # is the field that is usually highlighted but you can
173
+ # specify which field you want to highlight here. If
174
+ # you want to highlight multiple fields then you will
175
+ # need to call this method multiple times.
176
+ # excerpt_length:: Default: 150. Length of excerpt to show. Highlighted
177
+ # terms will be in the centre of the excerpt. Set to
178
+ # :all to highlight the entire field.
179
+ # num_excerpts:: Default: 2. Number of excerpts to return.
180
+ # pre_tag:: Default: "<b>". Tag to place to the left of the
181
+ # match. You'll probably want to change this to a
182
+ # "<span>" tag with a class. Try "\033[36m" for use in
183
+ # a terminal.
184
+ # post_tag:: Default: "</b>". This tag should close the
185
+ # +:pre_tag+. Try tag "\033[m" in the terminal.
186
+ # ellipsis:: Default: "...". This is the string that is appended
187
+ # at the beginning and end of excerpts (unless the
188
+ # excerpt hits the start or end of the field).
189
+ # Alternatively you may want to use the HTML entity
190
+ # &#8230; or the UTF-8 string "\342\200\246".
191
+ def highlight(query, doc_id, options = {})
192
+ @dir.synchronize do
193
+ ensure_searcher_open()
194
+ @searcher.highlight(do_process_query(query),
195
+ doc_id,
196
+ options[:field]||@options[:default_field],
197
+ options)
198
+ end
199
+ end
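For illustration, a sketch that pulls highlighted excerpts for each hit using the options documented above (the :content field and the query string are assumptions for the example):

    query = 'content:ferret'
    index.search_each(query) do |doc_id, score|
      excerpts = index.highlight(query, doc_id,
                                 :field          => :content,
                                 :excerpt_length => 80,
                                 :pre_tag        => '<span class="match">',
                                 :post_tag       => '</span>')
      puts excerpts.join(' ... ') if excerpts
    end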
200
+
201
+ # Closes this index by closing its associated reader and writer objects.
202
+ def close
203
+ @dir.synchronize do
204
+ if not @open
205
+ raise(StandardError, "tried to close an already closed directory")
206
+ end
207
+ @searcher.close() if @searcher
208
+ @reader.close() if @reader
209
+ @writer.close() if @writer
210
+ @dir.close() if @close_dir
211
+
212
+ @open = false
213
+ end
214
+ end
215
+
216
+ # Get the reader for this index.
217
+ # NOTE:: This will close the writer from this index.
218
+ def reader
219
+ ensure_reader_open()
220
+ return @reader
221
+ end
222
+
223
+ # Get the searcher for this index.
224
+ # NOTE:: This will close the writer from this index.
225
+ def searcher
226
+ ensure_searcher_open()
227
+ return @searcher
228
+ end
229
+
230
+ # Get the writer for this index.
231
+ # NOTE:: This will close the reader from this index.
232
+ def writer
233
+ ensure_writer_open()
234
+ return @writer
235
+ end
236
+
237
+ # Adds a document to this index, using the provided analyzer instead of
238
+ # the local analyzer if provided. If the document contains more than
239
+ # IndexWriter::MAX_FIELD_LENGTH terms for a given field, the remainder are
240
+ # discarded.
241
+ #
242
+ # There are three ways to add a document to the index.
243
+ # To add a document you can simply add a string or an array of strings.
244
+ # This will store all the strings in the default input field (:id by default,
245
+ # unless you specify the default_input_field when you create the index).
246
+ #
247
+ # index << "This is a new document to be indexed"
248
+ # index << ["And here", "is another", "new document", "to be indexed"]
249
+ #
250
+ # But these are pretty simple documents. If this is all you want to index
251
+ # you could probably just use SimpleSearch. So let's give our documents
252
+ # some fields;
253
+ #
254
+ # index << {:title => "Programming Ruby", :content => "blah blah blah"}
255
+ # index << {:title => "Programming Ruby", :content => "yada yada yada"}
256
+ #
257
+ # Or if you are indexing data stored in a database, you'll probably want
258
+ # to store the id;
259
+ #
260
+ # index << {:id => row.id, :title => row.title, :date => row.date}
261
+ #
262
+ # See FieldInfos for more information on how to set field properties.
263
+ def add_document(doc, analyzer = nil)
264
+ @dir.synchronize do
265
+ ensure_writer_open()
266
+ if doc.is_a?(String) or doc.is_a?(Array)
267
+ doc = {@default_input_field => doc}
268
+ end
269
+
270
+ # delete existing documents with the same key
271
+ if @key
272
+ if @key.is_a?(Array)
273
+ query = @key.inject(BooleanQuery.new()) do |bq, field|
274
+ bq.add_query(TermQuery.new(field, doc[field].to_s), :must)
275
+ bq
276
+ end
277
+ query_delete(query)
278
+ else
279
+ id = doc[@key].to_s
280
+ if id
281
+ @writer.delete(@key, id)
282
+ end
283
+ end
284
+ end
285
+ ensure_writer_open()
286
+
287
+ if analyzer
288
+ old_analyzer = @writer.analyzer
289
+ @writer.analyzer = analyzer
290
+ @writer.add_document(doc)
291
+ @writer.analyzer = old_analyzer
292
+ else
293
+ @writer.add_document(doc)
294
+ end
295
+
296
+ flush() if @auto_flush
297
+ end
298
+ end
299
+ alias :<< :add_document
300
+
301
+ # Run a query through the Searcher on the index. A TopDocs object is
302
+ # returned with the relevant results. The +query+ is a built in Query
303
+ # object or a query string that can be parsed by the Ferret::QueryParser.
304
+ # Here are the options;
305
+ #
306
+ # === Options
307
+ #
308
+ # offset:: Default: 0. The offset of the start of the section of the
309
+ # result-set to return. This is used for paging through
310
+ # results. Let's say you have a page size of 10. If you
311
+ # don't find the result you want among the first 10 results
312
+ # then set +:offset+ to 10 and look at the next 10 results,
313
+ # then 20 and so on.
314
+ # limit:: Default: 10. This is the number of results you want
315
+ # returned, also called the page size. Set +:limit+ to
316
+ # +:all+ to return all results
317
+ # sort:: A Sort object or sort string describing how the field
318
+ # should be sorted. A sort string is made up of field names
319
+ # which cannot contain spaces and the word "DESC" if you
320
+ # want the field reversed, all separated by commas. For
321
+ # example; "rating DESC, author, title". Note that Ferret
322
+ # will try to determine a field's type by looking at the
323
+ # first term in the index and seeing if it can be parsed as
324
+ # an integer or a float. Keep this in mind as you may need
325
+ # to specify a field's type to sort it correctly. For more
326
+ # on this, see the documentation for SortField
327
+ # filter:: a Filter object to filter the search results with
328
+ # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
329
+ # and the Searcher object as its parameters and returns a
330
+ # Boolean value specifying whether the result should be
331
+ # included in the result set.
332
+ def search(query, options = {})
333
+ @dir.synchronize do
334
+ return do_search(query, options)
335
+ end
336
+ end
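As an example of the paging options above, a sketch that walks through results ten at a time with :offset, :limit and a sort string (field names are illustrative, and it assumes the returned TopDocs exposes hits with doc and score accessors):

    page = 0
    loop do
      top_docs = index.search('title:ruby',
                              :offset => page * 10,
                              :limit  => 10,
                              :sort   => 'rating DESC, title')
      break if top_docs.hits.empty?
      top_docs.hits.each {|hit| puts "doc #{hit.doc} scored #{hit.score}" }
      page += 1
    end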
337
+
338
+ # Run a query through the Searcher on the index. A TopDocs object is
339
+ # returned with the relevant results. The +query+ is a Query object or a
340
+ # query string that can be validly parsed by the Ferret::QueryParser. The
341
+ # Searcher#search_each method yields the internal document id (used to
342
+ # reference documents in the Searcher object like this;
343
+ # +searcher[doc_id]+) and the search score for that document. It is
344
+ # possible for the score to be greater than 1.0 for some queries when
345
+ # taking boosts into account. This method will also normalize scores to
346
+ # the range 0.0..1.0 when the max-score is greater than 1.0. Here are the
347
+ # options;
348
+ #
349
+ # === Options
350
+ #
351
+ # offset:: Default: 0. The offset of the start of the section of the
352
+ # result-set to return. This is used for paging through
353
+ # results. Let's say you have a page size of 10. If you
354
+ # don't find the result you want among the first 10 results
355
+ # then set +:offset+ to 10 and look at the next 10 results,
356
+ # then 20 and so on.
357
+ # limit:: Default: 10. This is the number of results you want
358
+ # returned, also called the page size. Set +:limit+ to
359
+ # +:all+ to return all results
360
+ # sort:: A Sort object or sort string describing how the field
361
+ # should be sorted. A sort string is made up of field names
362
+ # which cannot contain spaces and the word "DESC" if you
363
+ # want the field reversed, all separated by commas. For
364
+ # example; "rating DESC, author, title". Note that Ferret
365
+ # will try to determine a field's type by looking at the
366
+ # first term in the index and seeing if it can be parsed as
367
+ # an integer or a float. Keep this in mind as you may need
368
+ # to specify a field's type to sort it correctly. For more
369
+ # on this, see the documentation for SortField
370
+ # filter:: a Filter object to filter the search results with
371
+ # filter_proc:: a filter Proc is a Proc which takes the doc_id, the score
372
+ # and the Searcher object as its parameters and returns a
373
+ # Boolean value specifying whether the result should be
374
+ # included in the result set.
375
+ #
376
+ # returns:: The total number of hits.
377
+ #
378
+ # === Example
379
+ # eg.
380
+ # index.search_each(query, options = {}) do |doc, score|
381
+ # puts "hit document number #{doc} with a score of #{score}"
382
+ # end
383
+ #
384
+ def search_each(query, options = {}) # :yield: doc, score
385
+ @dir.synchronize do
386
+ ensure_searcher_open()
387
+ query = do_process_query(query)
388
+
389
+ @searcher.search_each(query, options) do |doc, score|
390
+ yield doc, score
391
+ end
392
+ end
393
+ end
394
+
395
+ # Run a query through the Searcher on the index, ignoring scoring and
396
+ # starting at +:start_doc+ and stopping when +:limit+ matches have been
397
+ # found. It returns an array of the matching document numbers.
398
+ #
399
+ # There is a big performance advantage when using this search method on a
400
+ # very large index when there are potentially thousands of matching
401
+ # documents and you only want, say, 50 of them. The other search methods need
402
+ # to look at every single match to decide which one has the highest score.
403
+ # This search method just needs to find +:limit+ number of matches before
404
+ # it returns.
405
+ #
406
+ # === Options
407
+ #
408
+ # start_doc:: Default: 0. The start document to start the search from.
409
+ # NOTE very carefully that this is not the same as the
410
+ # +:offset+ parameter used in the other search methods
411
+ # which refers to the offset in the result-set. This is the
412
+ # document to start the scan from. So if you are scanning
413
+ # through the index in increments of 50 documents at a time
414
+ # you need to use the last matched doc in the previous
415
+ # search to start your next search. See the example below.
416
+ # limit:: Default: 50. This is the number of results you want
417
+ # returned, also called the page size. Set +:limit+ to
418
+ # +:all+ to return all results.
419
+ # TODO: add option to return loaded documents instead
420
+ #
421
+ # === Example
422
+ #
423
+ # start_doc = 0
424
+ # begin
425
+ # results = @searcher.scan(query, :start_doc => start_doc)
426
+ # yield results # or do something with them
427
+ # start_doc = results.last
428
+ # # start_doc will be nil now if results is empty, ie no more matches
429
+ # end while start_doc
430
+ def scan(query, options = {})
431
+ @dir.synchronize do
432
+ ensure_searcher_open()
433
+ query = do_process_query(query)
434
+
435
+ @searcher.scan(query, options)
436
+ end
437
+ end
438
+
439
+ # Retrieves a document/documents from the index. The method for retrieval
440
+ # depends on the type of the argument passed.
441
+ #
442
+ # If +arg+ is an Integer then return the document based on the internal
443
+ # document number.
444
+ #
445
+ # If +arg+ is a Range, then return the documents within the range based on
446
+ # internal document number.
447
+ #
448
+ # If +arg+ is a String then search for the first document with +arg+ in
449
+ # the +id+ field. The +id+ field is either :id or whatever you set
450
+ # the +:id_field+ parameter to when you create the Index object.
451
+ def doc(*arg)
452
+ @dir.synchronize do
453
+ id = arg[0]
454
+ if id.kind_of?(String) or id.kind_of?(Symbol)
455
+ ensure_reader_open()
456
+ term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)
457
+ return term_doc_enum.next? ? @reader[term_doc_enum.doc] : nil
458
+ else
459
+ ensure_reader_open(false)
460
+ return @reader[*arg]
461
+ end
462
+ end
463
+ end
464
+ alias :[] :doc
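A short sketch of the three retrieval forms described above (field name and ids are examples):

    doc  = index[0]          # by internal document number
    docs = index[0...5]      # a Range of internal document numbers
    doc  = index['page-1']   # first document whose :id field contains 'page-1'
    puts doc[:content] if doc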
465
+
466
+ # Retrieves the term_vector for a document. The document can be referenced
467
+ # by either a string id to match the id field or an integer corresponding
468
+ # to Ferret's document number.
469
+ #
470
+ # See Ferret::Index::IndexReader#term_vector
471
+ def term_vector(id, field)
472
+ @dir.synchronize do
473
+ ensure_reader_open()
474
+ if id.kind_of?(String) or id.kind_of?(Symbol)
475
+ term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)
476
+ if term_doc_enum.next?
477
+ id = term_doc_enum.doc
478
+ else
479
+ return nil
480
+ end
481
+ end
482
+ return @reader.term_vector(id, field)
483
+ end
484
+ end
485
+
486
+ # iterate through all documents in the index. This method preloads the
487
+ # documents so you don't need to call #load on the document to load all the
488
+ # fields.
489
+ def each
490
+ @dir.synchronize do
491
+ ensure_reader_open
492
+ (0...@reader.max_doc).each do |i|
493
+ yield @reader[i].load unless @reader.deleted?(i)
494
+ end
495
+ end
496
+ end
497
+
498
+ # Deletes a document/documents from the index. The method for determining
499
+ # the document to delete depends on the type of the argument passed.
500
+ #
501
+ # If +arg+ is an Integer then delete the document based on the internal
502
+ # document number. Will raise an error if the document does not exist.
503
+ #
504
+ # If +arg+ is a String then search for the documents with +arg+ in the
505
+ # +id+ field. The +id+ field is either :id or whatever you set +:id_field+
506
+ # parameter to when you create the Index object. Will fail quietly if
507
+ # no document exists.
508
+ #
509
+ # If +arg+ is a Hash or an Array then a batch delete will be performed.
510
+ # If +arg+ is an Array then it will be considered an array of +id+'s. If
511
+ # it is a Hash, then its keys will be used instead as the Array of
512
+ # document +id+'s. If the +id+ is an Integer then it is considered a
513
+ # Ferret document number and the corresponding document will be deleted.
514
+ # If the +id+ is a String or a Symbol then the +id+ will be considered a
515
+ # term and the documents that contain that term in the +:id_field+ will be
516
+ # deleted.
517
+ def delete(arg)
518
+ @dir.synchronize do
519
+ if arg.is_a?(String) or arg.is_a?(Symbol)
520
+ ensure_writer_open()
521
+ @writer.delete(@id_field, arg.to_s)
522
+ elsif arg.is_a?(Integer)
523
+ ensure_reader_open()
524
+ cnt = @reader.delete(arg)
525
+ elsif arg.is_a?(Hash) or arg.is_a?(Array)
526
+ batch_delete(arg)
527
+ else
528
+ raise ArgumentError, "Cannot delete for arg of type #{arg.class}"
529
+ end
530
+ flush() if @auto_flush
531
+ end
532
+ return self
533
+ end
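For example, the three delete forms described above (ids are illustrative):

    index.delete('page-1')              # documents whose :id field matches 'page-1'
    index.delete(5)                     # internal document number 5
    index.delete(['page-2', 'page-3'])  # batch delete by :id terms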
534
+
535
+ # Delete all documents returned by the query.
536
+ #
537
+ # query:: The query to find documents you wish to delete. Can either be a
538
+ # string (in which case it is parsed by the standard query parser)
539
+ # or an actual query object.
540
+ def query_delete(query)
541
+ @dir.synchronize do
542
+ ensure_writer_open()
543
+ ensure_searcher_open()
544
+ query = do_process_query(query)
545
+ @searcher.search_each(query, :limit => :all) do |doc, score|
546
+ @reader.delete(doc)
547
+ end
548
+ flush() if @auto_flush
549
+ end
550
+ end
551
+
552
+ # Returns true if document +n+ has been deleted
553
+ def deleted?(n)
554
+ @dir.synchronize do
555
+ ensure_reader_open()
556
+ return @reader.deleted?(n)
557
+ end
558
+ end
559
+
560
+ # Update the document referenced by the document number +id+ if +id+ is an
561
+ # integer or all of the documents which have the term +id+ if +id+ is a
562
+ # term.
563
+ # For batch updates of a set of documents, see #batch_update for better performance.
564
+ #
565
+ # id:: The number of the document to update. Can also be a string
566
+ # representing the value in the +id+ field. Also consider using
567
+ # the :key attribute.
568
+ # new_doc:: The document to replace the old document with
569
+ def update(id, new_doc)
570
+ @dir.synchronize do
571
+ ensure_writer_open()
572
+ delete(id)
573
+ if id.is_a?(String) or id.is_a?(Symbol)
574
+ @writer.commit
575
+ else
576
+ ensure_writer_open()
577
+ end
578
+ @writer << new_doc
579
+ flush() if @auto_flush
580
+ end
581
+ end
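A sketch of updating by the :id field, reusing the example fields that appear elsewhere in this file:

    index << {:id => '26', :title => 'Babylon', :artist => 'David Grey'}
    index.update('26', :id => '26', :title => 'Babylon', :artist => 'David Gray')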
582
+
583
+ # Batch updates the documents in an index. You can pass either a Hash or
584
+ # an Array.
585
+ #
586
+ # === Array (recommended)
587
+ #
588
+ # If you pass an Array then each value needs to be a Document or a Hash
589
+ # and each of those documents must have an +:id_field+ which will be used
590
+ # to delete the old document that this document is replacing.
591
+ #
592
+ # === Hash
593
+ #
594
+ # If you pass a Hash then the keys of the Hash will be considered the
595
+ # +id+'s and the values will be the new documents to replace the old ones
596
+ # with. If the +id+ is an Integer then it is considered a Ferret document
597
+ # number and the corresponding document will be deleted. If the +id+ is a
598
+ # String or a Symbol then the +id+ will be considered a term and the
599
+ # documents that contain that term in the +:id_field+ will be deleted.
600
+ #
601
+ # Note: No error will be raised if the document does not currently
602
+ # exist. A new document will simply be created.
603
+ #
604
+ # == Examples
605
+ #
606
+ # # will replace the documents with the +id+'s id:133 and id:253
607
+ # @index.batch_update({
608
+ # '133' => {:id => '133', :content => 'yada yada yada'},
609
+ # '253' => {:id => '253', :content => 'bla bla bal'}
610
+ # })
611
+ #
612
+ # # will replace the documents with the Ferret Document numbers 2 and 92
613
+ # @index.batch_update({
614
+ # 2 => {:id => '133', :content => 'yada yada yada'},
615
+ # 92 => {:id => '253', :content => 'bla bla bal'}
616
+ # })
617
+ #
618
+ # # will replace the documents with the +id+'s id:133 and id:253
619
+ # # this is recommended as it guarantees no duplicate keys
620
+ # @index.batch_update([
621
+ # {:id => '133', :content => 'yada yada yada'},
622
+ # {:id => '253', :content => 'bla bla bal'}
623
+ # ])
624
+ #
625
+ # docs:: A Hash of id/document pairs or an Array of documents; the set of documents to be updated
626
+ def batch_update(docs)
627
+ @dir.synchronize do
628
+ ids = values = nil
629
+ case docs
630
+ when Array
631
+ ids = docs.collect{|doc| doc[@id_field].to_s}
632
+ if ids.include?(nil)
633
+ raise ArgumentError, "all documents must have an #{@id_field} "
634
+ "field when doing a batch update"
635
+ end
636
+ when Hash
637
+ ids = docs.keys
638
+ docs = docs.values
639
+ else
640
+ raise ArgumentError, "must pass Hash or Array, not #{docs.class}"
641
+ end
642
+ batch_delete(ids)
643
+ ensure_writer_open()
644
+ docs.each {|new_doc| @writer << new_doc }
645
+ flush()
646
+ end
647
+ end
648
+
649
+
650
+ # Update all the documents returned by the query.
651
+ #
652
+ # query:: The query to find documents you wish to update. Can either be
653
+ # a string (in which case it is parsed by the standard query
654
+ # parser) or an actual query object.
655
+ # new_val:: The values we are updating. This can be a string in which case
656
+ # the default field is updated, or it can be a hash, in which
657
+ # case, all fields in the hash are merged into the old hash.
658
+ # That is, the old fields are replaced by values in the new hash
659
+ # if they exist.
660
+ #
661
+ # === Example
662
+ #
663
+ # index << {:id => "26", :title => "Babylon", :artist => "David Grey"}
664
+ # index << {:id => "29", :title => "My Oh My", :artist => "David Grey"}
665
+ #
666
+ # # correct
667
+ # index.query_update('artist:"David Grey"', {:artist => "David Gray"})
668
+ #
669
+ # index["26"]
670
+ # #=> {:id => "26", :title => "Babylon", :artist => "David Gray"}
671
+ # index["28"]
672
+ # #=> {:id => "28", :title => "My Oh My", :artist => "David Gray"}
673
+ #
674
+ def query_update(query, new_val)
675
+ @dir.synchronize do
676
+ ensure_writer_open()
677
+ ensure_searcher_open()
678
+ docs_to_add = []
679
+ query = do_process_query(query)
680
+ @searcher.search_each(query, :limit => :all) do |id, score|
681
+ document = @searcher[id].load
682
+ if new_val.is_a?(Hash)
683
+ document.merge!(new_val)
684
+ else # new_val is a String or Symbol
685
+ document[@default_input_field] = new_val.to_s
686
+ end
687
+ docs_to_add << document
688
+ @reader.delete(id)
689
+ end
690
+ ensure_writer_open()
691
+ docs_to_add.each {|doc| @writer << doc }
692
+ flush() if @auto_flush
693
+ end
694
+ end
695
+
696
+ # Returns true if any documents have been deleted since the index was last
697
+ # flushed.
698
+ def has_deletions?()
699
+ @dir.synchronize do
700
+ ensure_reader_open()
701
+ return @reader.has_deletions?
702
+ end
703
+ end
704
+
705
+ # Flushes all writes to the index. This will not optimize the index but it
706
+ # will make sure that all writes are written to it.
707
+ #
708
+ # NOTE: this is not necessary if you are only using this class. All writes
709
+ # will automatically flush when you perform an operation that reads the
710
+ # index.
711
+ def flush()
712
+ @dir.synchronize do
713
+ if @reader
714
+ if @searcher
715
+ @searcher.close
716
+ @searcher = nil
717
+ end
718
+ @reader.commit
719
+ elsif @writer
720
+ @writer.close
721
+ @writer = nil
722
+ end
723
+ end
724
+ end
725
+ alias :commit :flush
726
+
727
+ # optimizes the index. This should only be called when the index will no
728
+ # longer be updated very often, but will be read a lot.
729
+ def optimize()
730
+ @dir.synchronize do
731
+ ensure_writer_open()
732
+ @writer.optimize()
733
+ @writer.close()
734
+ @writer = nil
735
+ end
736
+ end
737
+
738
+ # returns the number of documents in the index
739
+ def size()
740
+ @dir.synchronize do
741
+ ensure_reader_open()
742
+ return @reader.num_docs()
743
+ end
744
+ end
745
+
746
+ # Merges all segments from an index or an array of indexes into this
747
+ # index. You can pass a single Index::Index, Index::Reader,
748
+ # Store::Directory or an array of any single one of these.
749
+ #
750
+ # This may be used to parallelize batch indexing. A large document
751
+ # collection can be broken into sub-collections. Each sub-collection can
752
+ # be indexed in parallel, on a different thread, process or machine and
753
+ # perhaps all in memory. The complete index can then be created by
754
+ # merging sub-collection indexes with this method.
755
+ #
756
+ # After this completes, the index is optimized.
757
+ def add_indexes(indexes)
758
+ @dir.synchronize do
759
+ ensure_writer_open()
760
+ indexes = [indexes].flatten # make sure we have an array
761
+ return if indexes.size == 0 # nothing to do
762
+ if indexes[0].is_a?(Index)
763
+ indexes.delete(self) # don't merge with self
764
+ indexes = indexes.map {|index| index.reader }
765
+ elsif indexes[0].is_a?(Ferret::Store::Directory)
766
+ indexes.delete(@dir) # don't merge with self
767
+ indexes = indexes.map {|dir| IndexReader.new(dir) }
768
+ elsif indexes[0].is_a?(IndexReader)
769
+ indexes.delete(@reader) # don't merge with self
770
+ else
771
+ raise ArgumentError, "Unknown index type when trying to merge indexes"
772
+ end
773
+ ensure_writer_open
774
+ @writer.add_readers(indexes)
775
+ end
776
+ end
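For example, merging two separately built sub-indexes into this one, as described above (the paths are only placeholders):

    part1 = Ferret::Index::Index.new(:path => '/tmp/index_part1')
    part2 = Ferret::Index::Index.new(:path => '/tmp/index_part2')
    index.add_indexes([part1, part2])   # merges the sub-indexes, then optimizes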
777
+
778
+ # This is a simple utility method for saving an in memory or RAM index to
779
+ # the file system. The same thing can be achieved by using the
780
+ # Index::Index#add_indexes method and you will have more options when
781
+ # creating the new index, however this is a simple way to turn a RAM index
782
+ # into a file system index.
783
+ #
784
+ # directory:: This can either be a Store::Directory object or a String
785
+ # representing the path to the directory where you would
786
+ # like to store the index.
787
+ #
788
+ # create:: True if you'd like to create the directory if it doesn't
789
+ # exist or copy over an existing directory. False if you'd
790
+ # like to merge with the existing directory. This defaults to
791
+ # true.
792
+ def persist(directory, create = true)
793
+ synchronize do
794
+ close_all()
795
+ old_dir = @dir
796
+ if directory.is_a?(String)
797
+ @dir = FSDirectory.new(directory, create)
798
+ elsif directory.is_a?(Ferret::Store::Directory)
799
+ @dir = directory
800
+ end
801
+ @dir.extend(MonitorMixin) unless @dir.kind_of? MonitorMixin
802
+ @options[:dir] = @dir
803
+ @options[:create_if_missing] = true
804
+ add_indexes([old_dir])
805
+ end
806
+ end
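A sketch of the RAM-to-disk flow this method is for (the target path is illustrative):

    ram_index = Ferret::Index::Index.new          # in-memory RAMDirectory by default
    ram_index << {:id => '1', :content => 'hello world'}
    ram_index.persist('/tmp/persisted_index')     # the index now lives on disk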
807
+
808
+ def to_s
809
+ buf = ""
810
+ (0...(size)).each do |i|
811
+ buf << self[i].to_s + "\n" if not deleted?(i)
812
+ end
813
+ buf
814
+ end
815
+
816
+ # Returns an Explanation that describes how +doc+ scored against
817
+ # +query+.
818
+ #
819
+ # This is intended to be used in developing Similarity implementations,
820
+ # and, for good performance, should not be displayed with every hit.
821
+ # Computing an explanation is as expensive as executing the query over the
822
+ # entire index.
823
+ def explain(query, doc)
824
+ @dir.synchronize do
825
+ ensure_searcher_open()
826
+ query = do_process_query(query)
827
+
828
+ return @searcher.explain(query, doc)
829
+ end
830
+ end
831
+
832
+ # Turn a query string into a Query object with the Index's QueryParser
833
+ def process_query(query)
834
+ @dir.synchronize do
835
+ ensure_searcher_open()
836
+ return do_process_query(query)
837
+ end
838
+ end
839
+
840
+ # Returns the field_infos object so that you can add new fields to the
841
+ # index.
842
+ def field_infos
843
+ @dir.synchronize do
844
+ ensure_writer_open()
845
+ return @writer.field_infos
846
+ end
847
+ end
848
+
849
+
850
+ protected
851
+ def ensure_writer_open()
852
+ raise "tried to use a closed index" if not @open
853
+ return if @writer
854
+ if @reader
855
+ @searcher.close if @searcher
856
+ @reader.close
857
+ @reader = nil
858
+ @searcher = nil
859
+ end
860
+ @writer = IndexWriter.new(@options)
861
+ end
862
+
863
+ # returns the new reader if one is opened
864
+ def ensure_reader_open(get_latest = true)
865
+ raise "tried to use a closed index" if not @open
866
+ if @reader
867
+ if get_latest
868
+ latest = false
869
+ begin
870
+ latest = @reader.latest?
871
+ rescue Lock::LockError => le
872
+ sleep(@options[:lock_retry_time]) # wait lock_retry_time seconds and try again
873
+ latest = @reader.latest?
874
+ end
875
+ if not latest
876
+ @searcher.close if @searcher
877
+ @reader.close
878
+ return @reader = IndexReader.new(@dir)
879
+ end
880
+ end
881
+ else
882
+ if @writer
883
+ @writer.close
884
+ @writer = nil
885
+ end
886
+ return @reader = IndexReader.new(@dir)
887
+ end
888
+ return false
889
+ end
890
+
891
+ def ensure_searcher_open()
892
+ raise "tried to use a closed index" if not @open
893
+ if ensure_reader_open() or not @searcher
894
+ @searcher = Searcher.new(@reader)
895
+ end
896
+ end
897
+
898
+ private
899
+ def do_process_query(query)
900
+ if query.is_a?(String)
901
+ if @qp.nil?
902
+ @qp = Ferret::QueryParser.new(@options)
903
+ end
904
+ # we need to set this every time, in case a new field has been added
905
+ @qp.fields =
906
+ @reader.fields unless options[:all_fields] || options[:fields]
907
+ @qp.tokenized_fields =
908
+ @reader.tokenized_fields unless options[:tokenized_fields]
909
+ query = @qp.parse(query)
910
+ end
911
+ return query
912
+ end
913
+
914
+ def do_search(query, options)
915
+ ensure_searcher_open()
916
+ query = do_process_query(query)
917
+
918
+ return @searcher.search(query, options)
919
+ end
920
+
921
+ def close_all()
922
+ @dir.synchronize do
923
+ @searcher.close if @searcher
924
+ @reader.close if @reader
925
+ @writer.close if @writer
926
+ @reader = nil
927
+ @searcher = nil
928
+ @writer = nil
929
+ end
930
+ end
931
+
932
+ # If +docs+ is a Hash or an Array then a batch delete will be performed.
933
+ # If +docs+ is an Array then it will be considered an array of +id+'s. If
934
+ # it is a Hash, then its keys will be used instead as the Array of
935
+ # document +id+'s. If the +id+ is an Integer then it is considered a
936
+ # Ferret document number and the corresponding document will be deleted.
937
+ # If the +id+ is a String or a Symbol then the +id+ will be considered a
938
+ # term and the documents that contain that term in the +:id_field+ will
939
+ # be deleted.
940
+ #
941
+ # docs:: An Array of docs to be deleted, or a Hash (in which case the keys
942
+ # are used)
943
+ def batch_delete(docs)
944
+ docs = docs.keys if docs.is_a?(Hash)
945
+ raise ArgumentError, "must pass Array or Hash" unless docs.is_a? Array
946
+ ids = []
947
+ terms = []
948
+ docs.each do |doc|
949
+ case doc
950
+ when String then terms << doc
951
+ when Symbol then terms << doc.to_s
952
+ when Integer then ids << doc
953
+ else
954
+ raise ArgumentError, "Cannot delete for arg of type #{id.class}"
955
+ end
956
+ end
957
+ if ids.size > 0
958
+ ensure_reader_open
959
+ ids.each {|id| @reader.delete(id)}
960
+ end
961
+ if terms.size > 0
962
+ ensure_writer_open()
963
+ @writer.delete(@id_field, terms)
964
+ end
965
+ return self
966
+ end
967
+
968
+ end
969
+ end
970
+
971
+ module Ferret
972
+ I = Index::Index
973
+ end