ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,2 +1,130 @@
1
- require 'ferret/document/field'
2
- require 'ferret/document/document'
1
+ module Ferret
2
+ # Instead of using documents to add data to an index you can use Hashes and
3
+ # Arrays. The only real benefits of using a Document over a Hash are pretty
4
+ # printing and the boost attribute. You can add the boost attribute to
5
+ # Hashes and arrays using the BoostMixin. For example;
6
+ #
7
+ # class Hash
8
+ # include BoostMixin
9
+ # end
10
+ #
11
+ # class Array
12
+ # include BoostMixin
13
+ # end
14
+ #
15
+ # class String
16
+ # include BoostMixin
17
+ # end
18
+ module BoostMixin
19
+ attr_accessor :boost
20
+ end
21
+
22
+ # Documents are the unit of indexing and search.
23
+ #
24
+ # A Document is a set of fields. Each field has a name and an array of
25
+ # textual values. If you are coming from a Lucene background you should note
26
+ # that Fields don't have any properties except for the boost property. You
27
+ # should use the FieldInfos class to set field properties across the whole
28
+ # index instead.
29
+ #
30
+ # === Boost
31
+ #
32
+ # The boost attribute makes a Document more important in the index. That is,
33
+ # you can increase the score of a match for queries that match a particular
34
+ # document, making it more likely to appear at the top of search results.
35
+ # You may, for example, want to boost products that have a higher user
36
+ # rating so that they are more likely to appear in search results.
37
+ #
38
+ # Note that fields which are _not_ stored (see FieldInfos) are _not_
39
+ # available in documents retrieved from the index, e.g. Searcher#doc or
40
+ # IndexReader#doc.
41
+ #
42
+ # Note that modifying a Document retrieved from the index will not modify
43
+ # the document contained within the index. You need to delete the old
44
+ # version of the document and add the new version of the document.
45
+ class Document < Hash
46
+ include BoostMixin
47
+
48
+ # Create a new Document object with a boost. The boost defaults to 1.0.
49
+ def initialize(boost = 1.0)
50
+ @boost = boost
51
+ end
52
+
53
+ # Return true if the documents are equal, ie they have the same fields
54
+ def eql?(o)
55
+ return (o.is_a? Document and (o.boost == @boost) and
56
+ (self.keys == o.keys) and (self.values == o.values))
57
+ end
58
+ alias :== :eql?
59
+
60
+ # Create a string representation of the document
61
+ def to_s
62
+ buf = ["Document {"]
63
+ self.keys.sort_by {|key| key.to_s}.each do |key|
64
+ val = self[key]
65
+ val_str = if val.instance_of? Array then %{["#{val.join('", "')}"]}
66
+ elsif val.is_a? Field then val.to_s
67
+ else %{"#{val.to_s}"}
68
+ end
69
+ buf << " :#{key} => #{val_str}"
70
+ end
71
+ buf << ["}#{@boost == 1.0 ? "" : "^" + @boost.to_s}"]
72
+ return buf.join("\n")
73
+ end
74
+ end
75
+
76
+ # A Field is a section of a Document. A Field is basically an array with a
77
+ # boost attribute. It also provides pretty printing of the field with the
78
+ # #to_s method.
79
+ #
80
+ # === Boost
81
+ #
82
+ # The boost attribute makes a field more important in the index. That is,
83
+ # you can increase the score of a match for queries that match terms in a
84
+ # boosted field. You may, for example, want to boost a title field so that
85
+ # matches that match in the :title field score more highly than matches that
86
+ # match in the :contents field.
87
+ #
88
+ # Note: If you'd like to use boosted fields without having to use
89
+ # the Field class you can just include the BoostMixin in the Array class.
90
+ # See BoostMixin.
91
+ class Field < Array
92
+ include BoostMixin
93
+
94
+ # Create a new Field object. You can pass data to the field as either a
95
+ # string;
96
+ #
97
+ # f = Field.new("This is the fields data")
98
+ #
99
+ # or as an array of strings;
100
+ #
101
+ # f = Field.new(["this", "is", "an", "array", "of", "field", "data"])
102
+ #
103
+ # Of course Fields can also be boosted;
104
+ #
105
+ # f = Field.new("field data", 1000.0)
106
+ def initialize(data = [], boost = 1.0)
107
+ @boost = boost
108
+ if data.is_a? Array
109
+ data.each {|v| self << v}
110
+ else
111
+ self << data.to_s
112
+ end
113
+ end
114
+
115
+ def eql?(o)
116
+ return (o.is_a? Field and (o.boost == @boost) and super(o))
117
+ end
118
+ alias :== :eql?
119
+
120
+ def +(o)
121
+ return Field.new(super(o), self.boost)
122
+ end
123
+
124
+ def to_s
125
+ buf = %{["#{self.join('", "')}"]}
126
+ buf << "^#@boost" if @boost != 1.0
127
+ return buf
128
+ end
129
+ end
130
+ end
data/lib/ferret/index.rb CHANGED
@@ -1,26 +1,577 @@
1
- require 'ferret/index/index_file_names'
2
- require 'ferret/index/term'
3
- require 'ferret/index/term_buffer'
4
- require 'ferret/index/term_doc_enum'
5
- require 'ferret/index/multiple_term_doc_pos_enum'
6
- require 'ferret/index/term_enum'
7
- require 'ferret/index/term_info'
8
- require 'ferret/index/term_infos_io'
9
- require 'ferret/index/term_vector_offset_info'
10
- require 'ferret/index/term_vectors_io'
11
- require 'ferret/index/field_infos'
12
- require 'ferret/index/fields_io'
13
- require 'ferret/index/compound_file_io'
14
- require 'ferret/index/term_buffer'
15
- require 'ferret/index/segment_term_enum'
16
- require 'ferret/index/segment_term_vector'
17
- require 'ferret/index/segment_merge_info'
18
- require 'ferret/index/segment_merge_queue'
19
- require 'ferret/index/segment_infos'
20
- require 'ferret/index/document_writer'
21
- require 'ferret/index/index_reader'
22
- require 'ferret/index/index_writer'
23
- require 'ferret/index/multi_reader'
24
- require 'ferret/index/segment_merger'
25
- require 'ferret/index/segment_reader'
26
- require 'ferret/index/index'
1
+ require 'monitor'
2
+
3
+ module Ferret::Index
4
+ # This is a simplified interface to the index. See the TUTORIAL for more
5
+ # information on how to use this class.
6
+ class Index
7
+ include MonitorMixin
8
+
9
+ include Ferret::Store
10
+ include Ferret::Search
11
+
12
+ attr_reader :options
13
+ # If you create an Index without any options, it'll simply create an index
14
+ # in memory. But this class is highly configurable and every option that
15
+ # you can supply to IndexWriter and QueryParser, you can also set here.
16
+ # Please look at the options for the constructors to these classes.
17
+ #
18
+ # === Options
19
+ #
20
+ # See;
21
+ #
22
+ # * QueryParser
23
+ # * IndexWriter
24
+ #
25
+ # default_input_field:: Default: "id". This specifies the default field
26
+ # that will be used when you add a simple string
27
+ # to the index using #add_document or <<.
28
+ # id_field:: Default: "id". This field is used as the field to
29
+ # search when doing searches on a term. For
30
+ # example, if you do a lookup by term "cat", ie
31
+ # index["cat"], this will be the field that is
32
+ # searched.
33
+ # key:: Default: nil. Expert: This should only be used
34
+ # if you really know what you are doing. Basically
35
+ # you can set a field or an array of fields to be
36
+ # the key for the index. So if you add a document
37
+ # with a same key as an existing document, the
38
+ # existing document will be replaced by the new
39
+ # object. Using a multiple field key will slow
40
+ # down indexing so it should not be done if
41
+ # performance is a concern. A single field key (or
42
+ # id) should be fine however. Also, you must make
43
+ # sure that your key/keys are either untokenized
44
+ # or that they are not broken up by the analyzer.
45
+ # auto_flush:: Default: false. Set this option to true if you
46
+ # want the index automatically flushed every time
47
+ # you do a write (includes delete) to the index.
48
+ # This is useful if you have multiple processes
49
+ # accessing the index and you don't want lock
50
+ # errors. Setting :auto_flush to true has a huge
51
+ # performance impact so don't use it if you are
52
+ # concerned about performance. In that case you
53
+ # should think about setting up a DRb indexing
54
+ # service.
55
+ #
56
+ # Some examples;
57
+ #
58
+ # index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
59
+ #
60
+ # index = Index::Index.new(:path => '/path/to/index',
61
+ # :create_if_missing => false,
62
+ # :auto_flush => true)
63
+ #
64
+ # index = Index::Index.new(:dir => directory,
65
+ # :default_slop => 2,
66
+ # :handle_parse_errors => false)
67
+ #
68
+ def initialize(options = {})
69
+ super()
70
+
71
+ if options[:key]
72
+ @key = options[:key]
73
+ if @key.is_a?(Array)
74
+ @key.flatten.map {|k| k.to_s.intern}
75
+ end
76
+ end
77
+
78
+ if options[:dir].is_a?(String)
79
+ options[:path] = options[:dir]
80
+ end
81
+ if options[:path]
82
+ begin
83
+ @dir = FSDirectory.new(options[:path], options[:create])
84
+ rescue IOError => io
85
+ @dir = FSDirectory.new(options[:path], options[:create_if_missing])
86
+ end
87
+ elsif options[:dir]
88
+ @dir = options[:dir]
89
+ else
90
+ options[:create] = true # this should always be true for a new RAMDir
91
+ @dir = RAMDirectory.new
92
+ end
93
+
94
+ options[:dir] = @dir
95
+ @dir.extend(MonitorMixin)
96
+ @dir.synchronize do
97
+ @options = options
98
+ @writer = IndexWriter.new(options) # create the index if need be
99
+ options[:analyzer] = @analyzer = @writer.analyzer
100
+ @writer.close
101
+ @writer = nil
102
+ @reader = nil
103
+ @options.delete(:create) # only want to create the first time if at all
104
+ @auto_flush = @options[:auto_flush] || false
105
+ if (@options[:id_field].nil? and
106
+ @key.is_a?(Symbol))
107
+ @id_field = @key
108
+ else
109
+ @id_field = @options[:id_field] || :id
110
+ end
111
+ @default_field = (@options[:default_field]||= :*)
112
+ @default_input_field = options[:default_input_field] || @id_field
113
+
114
+ if @default_input_field.respond_to?(:intern)
115
+ @default_input_field = @default_input_field.intern
116
+ end
117
+ @open = true
118
+ @qp = nil
119
+ end
120
+ end
121
+
122
+ # Closes this index by closing its associated reader and writer objects.
123
+ def close
124
+ @dir.synchronize do
125
+ if not @open
126
+ raise "tried to close an already closed directory"
127
+ end
128
+ @searcher.close() if @searcher
129
+ @reader.close() if @reader
130
+ @writer.close() if @writer
131
+ @dir.close()
132
+
133
+ @open = false
134
+ end
135
+ end
136
+
137
+ # Get the reader for this index.
138
+ # NOTE:: This will close the writer from this index.
139
+ def reader
140
+ ensure_reader_open()
141
+ return @reader
142
+ end
143
+
144
+ # Get the searcher for this index.
145
+ # NOTE:: This will close the writer from this index.
146
+ def searcher
147
+ ensure_searcher_open()
148
+ return @searcher
149
+ end
150
+
151
+ # Get the writer for this index.
152
+ # NOTE:: This will close the reader from this index.
153
+ def writer
154
+ ensure_writer_open()
155
+ return @writer
156
+ end
157
+ protected :reader, :writer, :searcher
158
+
159
+ # Adds a document to this index, using the provided analyzer instead of
160
+ # the local analyzer if provided. If the document contains more than
161
+ # IndexWriter::MAX_FIELD_LENGTH terms for a given field, the remainder are
162
+ # discarded.
163
+ #
164
+ # There are three ways to add a document to the index.
165
+ # To add a document you can simply add a string or an array of strings.
166
+ # This will store all the strings in the "" (ie empty string) field
167
+ # (unless you specify the default_field when you create the index).
168
+ #
169
+ # index << "This is a new document to be indexed"
170
+ # index << ["And here", "is another", "new document", "to be indexed"]
171
+ #
172
+ # But these are pretty simple documents. If this is all you want to index
173
+ # you could probably just use SimpleSearch. So let's give our documents
174
+ # some fields;
175
+ #
176
+ # index << {:title => "Programming Ruby", :content => "blah blah blah"}
177
+ # index << {:title => "Programming Ruby", :content => "yada yada yada"}
178
+ #
179
+ # Or if you are indexing data stored in a database, you'll probably want
180
+ # to store the id;
181
+ #
182
+ # index << {:id => row.id, :title => row.title, :date => row.date}
183
+ #
184
+ # See FieldInfos for more information on how to set field properties.
185
+ def add_document(doc, analyzer = nil)
186
+ @dir.synchronize do
187
+ if doc.is_a?(String) or doc.is_a?(Array)
188
+ doc = {@default_input_field => doc}
189
+ end
190
+
191
+ # delete existing documents with the same key
192
+ if @key
193
+ if @key.is_a?(Array)
194
+ query = @key.inject(BooleanQuery.new()) do |bq, field|
195
+ bq.add_query(TermQuery.new(field, doc[field].to_s), :must)
196
+ bq
197
+ end
198
+ query_delete(query)
199
+ else
200
+ id = doc[@key].to_s
201
+ if id
202
+ delete(id)
203
+ @writer.commit
204
+ end
205
+ end
206
+ end
207
+ ensure_writer_open()
208
+
209
+ old_analyzer = @writer.analyzer if analyzer
210
+ @writer.add_document(doc)
211
+ @writer.analyzer = old_analyzer if analyzer
212
+
213
+ flush() if @auto_flush
214
+ end
215
+ end
216
+ alias :<< :add_document
217
+
218
+ # The main search method for the index. You need to create a query to
219
+ # pass to this method. You can also pass a hash with one or more of the
220
+ # following; {filter, num_docs, first_doc, sort}
221
+ #
222
+ # query:: The query to run on the index
223
+ # filter:: Filters docs from the search result
224
+ # first_doc:: The index in the results of the first doc retrieved.
225
+ # Default is 0
226
+ # num_docs:: The number of results returned. Default is 10
227
+ # sort:: An array of SortFields describing how to sort the results.
228
+ # filter_proc:: A proc which takes |doc_id, score, searcher| as arguments
229
+ # and returns true if the document passes the filter.
230
+ def search(query, options = {})
231
+ @dir.synchronize do
232
+ return do_search(query, options)
233
+ end
234
+ end
235
+
236
+ # See Index#search
237
+ #
238
+ # This method yields the doc and score for each hit.
239
+ # eg.
240
+ # index.search_each() do |doc, score|
241
+ # puts "hit document number #{doc} with a score of #{score}"
242
+ # end
243
+ #
244
+ # returns:: The total number of hits.
245
+ def search_each(query, options = {}) # :yield: doc, score
246
+ @dir.synchronize do
247
+ ensure_searcher_open()
248
+ query = process_query(query)
249
+
250
+ @searcher.search_each(query) do |doc, score|
251
+ yield doc, score
252
+ end
253
+ end
254
+ end
255
+
256
+ # Retrieve the document referenced by the document number +id+, if id is
257
+ # an integer or the first document with term +id+ if +id+ is a term.
258
+ #
259
+ # id:: The number of the document to retrieve, or the term used as the :id
260
+ # for the document we wish to retrieve
261
+ def doc(id)
262
+ @dir.synchronize do
263
+ ensure_reader_open()
264
+ if id.kind_of?(String) or id.kind_of?(Symbol)
265
+ term_doc_enum = @reader.term_docs_for(@id_field, id.to_s)
266
+ id = term_doc_enum.next? ? term_doc_enum.doc : nil
267
+ end
268
+ return @reader[id] if id.is_a? Integer
269
+ if id
270
+ raise(ArgumentError, "key to Index to access a document must be " +
271
+ "an Integer or a String")
272
+ end
273
+ end
274
+ return nil
275
+ end
276
+ alias :[] :doc
277
+
278
+ # Delete the document referenced by the document number +id+ if +id+ is an
279
+ # integer or all of the documents which have the term +id+ if +id+ is a
280
+ # term.
281
+ #
282
+ # id:: The number of the document to delete
283
+ def delete(id)
284
+ @dir.synchronize do
285
+ if id.is_a?(String) or id.is_a?(Symbol)
286
+ ensure_writer_open()
287
+ @writer.delete(@id_field, id.to_s)
288
+ elsif id.is_a?(Integer)
289
+ ensure_reader_open()
290
+ cnt = @reader.delete(id)
291
+ else
292
+ raise ArgumentError, "Cannot delete for id of type #{id.class}"
293
+ end
294
+ flush() if @auto_flush
295
+ end
296
+ return self
297
+ end
298
+
299
+ # Delete all documents returned by the query.
300
+ #
301
+ # query:: The query to find documents you wish to delete. Can either be a
302
+ # string (in which case it is parsed by the standard query parser)
303
+ # or an actual query object.
304
+ def query_delete(query)
305
+ @dir.synchronize do
306
+ ensure_searcher_open()
307
+ query = process_query(query)
308
+ @searcher.search_each(query) do |doc, score|
309
+ @reader.delete(doc)
310
+ end
311
+ flush() if @auto_flush
312
+ end
313
+ end
314
+
315
+ # Returns true if document +n+ has been deleted
316
+ def deleted?(n)
317
+ @dir.synchronize do
318
+ ensure_reader_open()
319
+ return @reader.deleted?(n)
320
+ end
321
+ end
322
+
323
+ # Update the document referenced by the document number +id+ if +id+ is an
324
+ # integer or all of the documents which have the term +id+ if +id+ is a
325
+ # term.
326
+ #
327
+ # id:: The number of the document to update. Can also be a string
328
+ # representing the value in the +id+ field. Also consider using
329
+ # the :key attribute.
330
+ # new_doc:: The document to replace the old document with
331
+ def update(id, new_doc)
332
+ @dir.synchronize do
333
+ delete(id)
334
+ if id.is_a?(String) or id.is_a?(Symbol)
335
+ @writer.commit
336
+ else
337
+ ensure_writer_open()
338
+ end
339
+ @writer << new_doc
340
+ flush() if @auto_flush
341
+ end
342
+ end
343
+
344
+ # Update all the documents returned by the query.
345
+ #
346
+ # query:: The query to find documents you wish to update. Can either be
347
+ # a string (in which case it is parsed by the standard query
348
+ # parser) or an actual query object.
349
+ # new_val:: The values we are updating. This can be a string in which case
350
+ # the default field is updated, or it can be a hash, in which
351
+ # case, all fields in the hash are merged into the old hash.
352
+ # That is, the old fields are replaced by values in the new hash
353
+ # if they exist.
354
+ #
355
+ # === Example
356
+ #
357
+ # index << {:id => "26", :title => "Babylon", :artist => "David Grey"}
358
+ # index << {:id => "29", :title => "My Oh My", :artist => "David Grey"}
359
+ #
360
+ # # correct
361
+ # index.query_update('artist:"David Grey"', {:artist => "David Gray"})
362
+ #
363
+ # index["26"]
364
+ # #=> {:id => "26", :title => "Babylon", :artist => "David Gray"}
365
+ # index["29"]
365
+ # #=> {:id => "29", :title => "My Oh My", :artist => "David Gray"}
367
+ #
368
+ def query_update(query, new_val)
369
+ @dir.synchronize do
370
+ ensure_searcher_open()
371
+ docs_to_add = []
372
+ query = process_query(query)
373
+ @searcher.search_each(query) do |id, score|
374
+ document = @searcher[id].load
375
+ if new_val.is_a?(Hash)
376
+ document.merge!(new_val)
377
+ elsif new_val.is_a?(String) or new_val.is_a?(Symbol)
378
+ document[@default_input_field] = new_val.to_s
379
+ end
380
+ docs_to_add << document
381
+ @reader.delete(id)
382
+ end
383
+ ensure_writer_open()
384
+ docs_to_add.each {|doc| @writer << doc }
385
+ flush() if @auto_flush
386
+ end
387
+ end
388
+
389
+ # Returns true if any documents have been deleted since the index was last
390
+ # flushed.
391
+ def has_deletions?()
392
+ @dir.synchronize do
393
+ ensure_reader_open()
394
+ return @reader.has_deletions?
395
+ end
396
+ end
397
+
398
+ # Flushes all writes to the index. This will not optimize the index but it
399
+ # will make sure that all writes are written to it.
400
+ #
401
+ # NOTE: this is not necessary if you are only using this class. All writes
402
+ # will automatically flush when you perform an operation that reads the
403
+ # index.
404
+ def flush()
405
+ @dir.synchronize do
406
+ @searcher.close if @searcher
407
+ @reader.close if @reader
408
+ @writer.close if @writer
409
+ @reader = nil
410
+ @writer = nil
411
+ @searcher = nil
412
+ end
413
+ end
414
+
415
+ # optimizes the index. This should only be called when the index will no
416
+ # longer be updated very often, but will be read a lot.
417
+ def optimize()
418
+ @dir.synchronize do
419
+ ensure_writer_open()
420
+ @writer.optimize()
421
+ @writer.close()
422
+ @writer = nil
423
+ end
424
+ end
425
+
426
+ # returns the number of documents in the index
427
+ def size()
428
+ @dir.synchronize do
429
+ ensure_reader_open()
430
+ return @reader.num_docs()
431
+ end
432
+ end
433
+
434
+ # Merges all segments from an index or an array of indexes into this
435
+ # index. You can pass a single Index::Index, Index::Reader,
436
+ # Store::Directory or an array of any single one of these.
437
+ #
438
+ # This may be used to parallelize batch indexing. A large document
439
+ # collection can be broken into sub-collections. Each sub-collection can
440
+ # be indexed in parallel, on a different thread, process or machine and
441
+ # perhaps all in memory. The complete index can then be created by
442
+ # merging sub-collection indexes with this method.
443
+ #
444
+ # After this completes, the index is optimized.
445
+ def add_indexes(indexes)
446
+ @dir.synchronize do
447
+ indexes = [indexes].flatten # make sure we have an array
448
+ return if indexes.size == 0 # nothing to do
449
+ if indexes[0].is_a?(Index)
450
+ indexes.delete(self) # don't merge with self
451
+ indexes = indexes.map {|index| index.reader }
452
+ elsif indexes[0].is_a?(Ferret::Store::Directory)
453
+ indexes.delete(@dir) # don't merge with self
454
+ indexes = indexes.map {|dir| IndexReader.new(dir) }
455
+ elsif indexes[0].is_a?(IndexReader)
456
+ indexes.delete(@reader) # don't merge with self
457
+ else
458
+ raise ArgumentError, "Unknown index type when trying to merge indexes"
459
+ end
460
+ ensure_writer_open
461
+ @writer.add_readers(indexes)
462
+ end
463
+ end
464
+
465
+ # This is a simple utility method for saving an in memory or RAM index to
466
+ # the file system. The same thing can be achieved by using the
467
+ # Index::Index#add_indexes method and you will have more options when
468
+ # creating the new index, however this is a simple way to turn a RAM index
469
+ # into a file system index.
470
+ #
471
+ # directory:: This can either be a Store::Directory object or a String
472
+ # representing the path to the directory where you would
473
+ # like to store the the index.
474
+ #
475
+ # create:: True if you'd like to create the directory if it doesn't
476
+ # exist or copy over an existing directory. False if you'd
477
+ # like to merge with the existing directory. This defaults to
478
+ # true.
479
+ def persist(directory, create = true)
480
+ synchronize do
481
+ flush()
482
+ old_dir = @dir
483
+ if directory.is_a?(String)
484
+ @dir = FSDirectory.new(directory, create)
485
+ elsif directory.is_a?(Ferret::Store::Directory)
486
+ @dir = directory
487
+ end
488
+ @dir.extend(MonitorMixin)
489
+ @options[:dir] = @dir
490
+ @options[:create_if_missing] = true
491
+ add_indexes([old_dir])
492
+ end
493
+ end
494
+
495
+ def to_s
496
+ buf = ""
497
+ (0...(size)).each do |i|
498
+ buf << self[i].to_s + "\n" if not deleted?(i)
499
+ end
500
+ buf
501
+ end
502
+
503
+ # Returns an Explanation that describes how +doc+ scored against
504
+ # +query+.
505
+ #
506
+ # This is intended to be used in developing Similarity implementations,
507
+ # and, for good performance, should not be displayed with every hit.
508
+ # Computing an explanation is as expensive as executing the query over the
509
+ # entire index.
510
+ def explain(query, doc)
511
+ synchronize do
512
+ ensure_searcher_open()
513
+ query = process_query(query)
514
+
515
+ return @searcher.explain(query, doc)
516
+ end
517
+ end
518
+
519
+ protected
520
+ def ensure_writer_open()
521
+ raise "tried to use a closed index" if not @open
522
+ return if @writer
523
+ if @reader
524
+ @searcher.close if @searcher
525
+ @reader.close
526
+ @reader = nil
527
+ @searcher = nil
528
+ end
529
+ @writer = IndexWriter.new(@options)
530
+ end
531
+
532
+ # returns the new reader if one is opened
533
+ def ensure_reader_open()
534
+ raise "tried to use a closed index" if not @open
535
+ if @reader
536
+ if not @reader.latest?
537
+ return @reader = IndexReader.new(@dir)
538
+ end
539
+ else
540
+ if @writer
541
+ @writer.close
542
+ @writer = nil
543
+ end
544
+ return @reader = IndexReader.new(@dir)
545
+ end
546
+ return false
547
+ end
548
+
549
+ def ensure_searcher_open()
550
+ raise "tried to use a closed index" if not @open
551
+ if ensure_reader_open() or not @searcher
552
+ @searcher = Searcher.new(@reader)
553
+ end
554
+ end
555
+
556
+ private
557
+ def do_search(query, options)
558
+ ensure_searcher_open()
559
+ query = process_query(query)
560
+
561
+ return @searcher.search(query, options)
562
+ end
563
+
564
+ def process_query(query)
565
+ if query.is_a?(String)
566
+ if @qp.nil?
567
+ @qp = Ferret::QueryParser.new(@options)
568
+ end
569
+ # we need to set this every time, in case a new field has been added
570
+ @qp.fields = @reader.field_names
571
+ query = @qp.parse(query)
572
+ end
573
+ return query
574
+ end
575
+
576
+ end
577
+ end