ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,675 +0,0 @@
1
- require 'monitor'
2
-
3
- module Ferret::Index
4
- # This is a simplified interface to the index. See the TUTORIAL for more
5
- # information on how to use this class.
6
- class Index
7
- include MonitorMixin
8
-
9
- include Ferret::Store
10
- include Ferret::Search
11
- include Ferret::Document
12
-
13
- # If you create an Index without any options, it'll simply create an index
14
- # in memory. But this class is highly configurable and every option that
15
- # you can supply to IndexWriter and QueryParser, you can also set here.
16
- #
17
- # === Options
18
- #
19
- # path:: A string representing the path to the index
20
- # directory. If you are creating the index for the
21
- # first time the directory will be created if it's
22
- # missing. You should not choose a directory which
23
- # contains other files.
24
- # create_if_missing:: Create the index if no index is found in the
25
- # specified directory. Otherwise, use the existing
26
- # index. This defaults to true and has no effect on
27
- # in memory indexes.
28
- # create:: Creates the index, even if one already exists.
29
- # That means any existing index will be deleted.
30
- # This option defaults to false and has no effect
31
- # for in memory indexes. It is probably better to
32
- # use the create_if_missing option.
33
- # default_field:: This specifies the default field that will be
34
- # used when you add a simple string to the index
35
- # using #add_document or <<. This will also be used
36
- # for default_search_field unless you set it
37
- # explicitly. The default for this value is the
38
- # string "id".
39
- # id_field:: This field is used as the field to search when doing
40
- # searches on a term. For example, if you do a
41
- # lookup by term "cat", ie index["cat"], this will
42
- # be the field that is searched. This will default
43
- # to default_field if not set.
44
- # default_search_field:: This specifies the field or fields that will be
45
- # searched by the query parser. You can use a
46
- # string to specify one field, eg, "title". Or you
47
- # can specify multiple fields with a String -
48
- # "title|content" - or with an Array - ["title",
49
- # "content"]. This defaults to the value passed in
50
- # for default_field. If default_field is nil then
51
- # the default is "*" which signifies all fields in
52
- # the index.
53
- # analyzer:: Sets the default analyzer for the index. This is
54
- # used by both the IndexWriter and the QueryParser
55
- # to tokenize the input. The default is the
56
- # StandardAnalyzer.
57
- # dir:: This is a Ferret::Store::Directory object. This
58
- # can be useful if you have an already existing
59
- # in-memory index which you'd like to read with
60
- # this class. If you want to create a new index,
61
- # you are better off passing in a path.
62
- # close_dir:: This specifies whether you want this class to
63
- # close the index directory when this class is
64
- # closed. This only has any meaning when you pass
65
- # in a directory object in the *dir* option, in
66
- # which case it defaults to false. Otherwise it is
67
- # always true.
68
- # occur_default:: Set to either BooleanClause::Occur::SHOULD
69
- # (default) or BooleanClause::Occur::MUST to
70
- # specify the default Occur operator.
71
- # wild_lower:: Set to false if you don't want the terms in fuzzy
72
- # and wild queries to be set to lower case. You
73
- # should do this if your analyzer doesn't downcase.
74
- # The default is true.
75
- # default_slop:: Set the default slop for phrase queries. This
76
- # defaults to 0.
77
- # key:: Expert: This should only be used if you really
78
- # know what you are doing. Basically you can set a
79
- # field or an array of fields to be the key for the
80
- # index. So if you add a document with a same key
81
- # as an existing document, the existing document will
82
- # be replaced by the new object. This will slow
83
- # down indexing so it should not be used if
84
- # performance is a concern. You must make sure that
85
- # your key/keys are either untokenized or that they
86
- # are not broken up by the analyzer.
87
- # use_compound_file:: Uses a compound file to store the index. This
88
- # prevents an error being raised for having too
89
- # many files open at the same time. The default is
90
- # true but performance is better if this is set to
91
- # false.
92
- # handle_parse_errors:: Set this to true if you want the QueryParser to
93
- # degrade gracefully on errors. If the query parser
94
- # fails to parse this query, it will try to parse
95
- # it as a straight boolean query on the default
96
- # field ignoring all query punctuation. If this
97
- # fails, it will return an empty TermQuery. If you
98
- # use this and you need to know why your query
99
- # isn't working you can use the Query#to_s method
100
- # on the query returned to see what is happening to
101
- # your query. This defaults to true. If you set it
102
- # to false a QueryParseException is raised on a
103
- # query parse error.
104
- # auto_flush:: Set this option to true if you want the index
105
- # automatically flushed every time you do a write
106
- # (includes delete) to the index. This is useful if
107
- # you have multiple processes accessing the index
108
- # and you don't want lock errors. This is set to
109
- # false by default.
110
- #
111
- # Some examples;
112
- #
113
- # index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
114
- #
115
- # index = Index::Index.new(:path => '/path/to/index',
116
- # :create_if_missing => false,
117
- # :auto_flush => true)
118
- #
119
- # index = Index::Index.new(:dir => directory,
120
- # :close_dir => false,
121
- # :default_slop => 2,
122
- # :handle_parse_errors => false)
123
- #
124
- def initialize(options = {})
125
- super()
126
-
127
- options[:default_field] &&= options[:default_field].to_s
128
- options[:create_if_missing] = true if options[:create_if_missing].nil?
129
- @key = [options[:key]].flatten.map {|k| k.to_s} if options[:key]
130
-
131
- if options[:path]
132
- begin
133
- @dir = FSDirectory.new(options[:path], options[:create])
134
- rescue IOError => io
135
- @dir = FSDirectory.new(options[:path], options[:create_if_missing])
136
- end
137
- options[:close_dir] = true
138
- elsif options[:dir]
139
- @dir = options[:dir]
140
- else
141
- options[:create] = true # this should always be true for a new RAMDir
142
- @dir = RAMDirectory.new
143
- end
144
-
145
- @dir.synchronize do
146
- @options = options
147
- @writer = IndexWriter.new(@dir, options) # create the index if need be
148
- options[:analyzer] = @analyzer = @writer.analyzer
149
- @writer.close
150
- @writer = nil
151
- @has_writes = false
152
- @reader = nil
153
- @options.delete(:create) # only want to create the first time if at all
154
- @close_dir = @options.delete(:close_dir) || false # we'll hold this here
155
- @auto_flush = @options[:auto_flush] || false
156
- @default_search_field = (@options[:default_search_field] || \
157
- @options[:default_field] || "*")
158
- if (@options[:id_field].nil? and
159
- @options[:default_field].nil? and
160
- @key and @key.size == 1)
161
- @default_field = @key[0]
162
- @id_field = @key[0]
163
- else
164
- @default_field =
165
- (@options[:default_field] || @options[:id_field] || "id").to_s
166
- @id_field =
167
- (@options[:id_field] || @options[:default_field] || "id").to_s
168
- end
169
- @options[:handle_parse_errors] = true if @options[:handle_parse_errors].nil?
170
- @open = true
171
- @qp = nil
172
- end
173
- end
174
-
175
- # Closes this index by closing its associated reader and writer objects.
176
- def close
177
- @dir.synchronize do
178
- if not @open
179
- raise "tried to close an already closed directory"
180
- end
181
- @reader.close() if @reader
182
- @writer.close() if @writer
183
- @dir.close()
184
-
185
- @open = false
186
- end
187
- end
188
-
189
- # Get the reader for this index.
190
- # NOTE:: This will close the writer from this index.
191
- def reader
192
- ensure_reader_open()
193
- return @reader
194
- end
195
-
196
- # Get the searcher for this index.
197
- # NOTE:: This will close the writer from this index.
198
- def searcher
199
- ensure_searcher_open()
200
- return @searcher
201
- end
202
-
203
- # Get the writer for this index.
204
- # NOTE:: This will close the reader from this index.
205
- def writer
206
- ensure_writer_open()
207
- return @writer
208
- end
209
- protected :reader, :writer, :searcher
210
-
211
- # Adds a document to this index, using the provided analyzer instead of
212
- # the local analyzer if provided. If the document contains more than
213
- # IndexWriter::MAX_FIELD_LENGTH terms for a given field, the remainder are
214
- # discarded.
215
- #
216
- # There are three ways to add a document to the index.
217
- # To add a document you can simply add a string or an array of strings.
218
- # This will store all the strings in the default field, "id"
219
- # (unless you specify the default_field when you create the index).
220
- #
221
- # index << "This is a new document to be indexed"
222
- # index << ["And here", "is another", "new document", "to be indexed"]
223
- #
224
- # But these are pretty simple documents. If this is all you want to index
225
- # you could probably just use SimpleSearch. So let's give our documents
226
- # some fields;
227
- #
228
- # index << {:title => "Programming Ruby", :content => "blah blah blah"}
229
- # index << {:title => "Programming Ruby", :content => "yada yada yada"}
230
- #
231
- # Or if you are indexing data stored in a database, you'll probably want
232
- # to store the id;
233
- #
234
- # index << {:id => row.id, :title => row.title, :date => row.date}
235
- #
236
- # The methods above will store all of the input data as well as tokenizing
237
- # and indexing it. Sometimes we won't want to tokenize (divide the string
238
- # into tokens) the data. For example, we might want to leave the title as
239
- # a complete string and only allow searches for that complete string.
240
- # Sometimes we won't want to store the data as it's already stored in the
241
- # database so it'll be a waste to store it in the index. Or perhaps we are
242
- # doing without a database and using Ferret to store all of our data, in
243
- # which case we might not want to index it. For example, if we are storing
244
- # images in the index, we won't want to index them. All of this can be
245
- # done using Ferret's Ferret::Document module. eg;
246
- #
247
- # include Ferret::Document
248
- # doc = Document.new
249
- # doc << Field.new("id", row.id, Field::Store::NO, Field::Index::UNTOKENIZED)
250
- # doc << Field.new("title", row.title, Field::Store::YES, Field::Index::UNTOKENIZED)
251
- # doc << Field.new("data", row.data, Field::Store::YES, Field::Index::TOKENIZED)
252
- # doc << Field.new("image", row.image, Field::Store::YES, Field::Index::NO)
253
- # index << doc
254
- #
255
- # You can also compress the data that you are storing or store term
256
- # vectors with the data. Read more about this in Ferret::Document::Field.
257
- def add_document(doc, analyzer = nil)
258
- @dir.synchronize do
259
- fdoc = nil
260
- if doc.is_a?(String)
261
- fdoc = Document.new
262
- fdoc << Field.new(@default_field, doc,
263
- Field::Store::YES, Field::Index::TOKENIZED)
264
- elsif doc.is_a?(Array)
265
- fdoc = Document.new
266
- doc.each() do |field|
267
- fdoc << Field.new(@default_field, field,
268
- Field::Store::YES, Field::Index::TOKENIZED)
269
- end
270
- elsif doc.is_a?(Hash)
271
- fdoc = Document.new
272
- doc.each_pair() do |field, text|
273
- if @key and @key.index(field.to_s)
274
- fdoc << Field.new(field.to_s, text.to_s,
275
- Field::Store::YES, Field::Index::UNTOKENIZED)
276
- else
277
- fdoc << Field.new(field.to_s, text.to_s,
278
- Field::Store::YES, Field::Index::TOKENIZED)
279
- end
280
- end
281
- elsif doc.is_a?(Document)
282
- fdoc = doc
283
- else
284
- raise ArgumentError, "Unknown document type #{doc.class}"
285
- end
286
-
287
- # delete existing documents with the same key
288
- if @key
289
- query = @key.inject(BooleanQuery.new()) do |bq, field|
290
- bq.add_query(TermQuery.new(Term.new(field, fdoc[field])),
291
- BooleanClause::Occur::MUST)
292
- end
293
- query_delete(query)
294
- end
295
-
296
- ensure_writer_open()
297
- @has_writes = true
298
- @writer.add_document(fdoc, analyzer || @writer.analyzer)
299
- flush() if @auto_flush
300
- end
301
- end
302
- alias :<< :add_document
303
-
304
- # The main search method for the index. You need to create a query to
305
- # pass to this method. You can also pass a hash with one or more of the
306
- # following; {filter, num_docs, first_doc, sort}
307
- #
308
- # query:: The query to run on the index
309
- # filter:: Filters docs from the search result
310
- # first_doc:: The index in the results of the first doc retrieved.
311
- # Default is 0
312
- # num_docs:: The number of results returned. Default is 10
313
- # sort:: An array of SortFields describing how to sort the results.
314
- def search(query, options = {})
315
- @dir.synchronize do
316
- return do_search(query, options)
317
- end
318
- end
319
-
320
- # See Index#search
321
- #
322
- # This method yields the doc and score for each hit.
323
- # eg.
324
- # index.search_each() do |doc, score|
325
- # puts "hit document number #{doc} with a score of #{score}"
326
- # end
327
- #
328
- # returns:: The total number of hits.
329
- def search_each(query, options = {}) # :yield: doc, score
330
- @dir.synchronize do
331
- hits = do_search(query, options)
332
- hits.score_docs.each do |score_doc|
333
- yield score_doc.doc, score_doc.score
334
- end
335
- return hits.total_hits
336
- end
337
- end
338
-
339
- # Retrieve the document referenced by the document number +id+, if id is
340
- # an integer or the first document with term +id+ if +id+ is a term.
341
- #
342
- # id:: The number of the document to retrieve, or the term used as the id
343
- # for the document we wish to retrieve
344
- def doc(id)
345
- @dir.synchronize do
346
- ensure_reader_open()
347
- if id.kind_of?(String) or id.kind_of?(Symbol)
348
- t = Term.new(@id_field, id.to_s)
349
- return @reader.get_document_with_term(t)
350
- elsif id.is_a?(Term)
351
- return @reader.get_document_with_term(id)
352
- else
353
- return @reader.get_document(id)
354
- end
355
- end
356
- end
357
- alias :[] :doc
358
-
359
- # Delete the document referenced by the document number +id+ if +id+ is an
360
- # integer or all of the documents which have the term +id+ if +id+ is a
361
- # term.
362
- #
363
- # id:: The number of the document to delete
364
- def delete(id)
365
- @dir.synchronize do
366
- cnt = 0
367
- ensure_reader_open()
368
- if id.is_a?(String)
369
- t = Term.new(@id_field, id.to_s)
370
- cnt = @reader.delete_docs_with_term(t)
371
- elsif id.is_a?(Term)
372
- cnt = @reader.delete_docs_with_term(id)
373
- elsif id.is_a?(Integer)
374
- cnt = @reader.delete(id)
375
- else
376
- raise ArgumentError, "Cannot delete for id of type #{id.class}"
377
- end
378
- flush() if @auto_flush
379
- return cnt
380
- end
381
- end
382
-
383
- # Delete all documents returned by the query.
384
- #
385
- # query:: The query to find documents you wish to delete. Can either be a
386
- # string (in which case it is parsed by the standard query parser)
387
- # or an actual query object.
388
- def query_delete(query)
389
- @dir.synchronize do
390
- ensure_searcher_open()
391
- query = process_query(query)
392
- @searcher.search_each(query) do |doc, score|
393
- @reader.delete(doc)
394
- end
395
- flush() if @auto_flush
396
- end
397
- end
398
-
399
- # Returns true if document +n+ has been deleted
400
- def deleted?(n)
401
- @dir.synchronize do
402
- ensure_reader_open()
403
- return @reader.deleted?(n)
404
- end
405
- end
406
-
407
- # Update the document referenced by the document number +id+ if +id+ is an
408
- # integer or all of the documents which have the term +id+ if +id+ is a
409
- # term.
410
- #
411
- # id:: The number of the document to update. Can also be a string
412
- # representing the value in the +id+ field or a term to match.
413
- # new_val:: The values we are updating. This can be a string in which case
414
- # the default field is updated, or it can be a hash, in which
415
- # case, all fields in the hash are updated. You can also pass a
416
- # full Document object, which will completely replace the
417
- # documents you remove.
418
- def update(id, new_val)
419
- @dir.synchronize do
420
- if id.is_a?(String)
421
- query_update("#{@id_field}:#{id}", new_val)
422
- elsif id.is_a?(Term)
423
- query_update(TermQuery.new(id), new_val)
424
- elsif id.is_a?(Integer)
425
- ensure_reader_open()
426
- document = doc(id)
427
- if new_val.is_a?(Hash)
428
- new_val.each_pair {|name, content| document[name] = content.to_s}
429
- elsif new_val.is_a?(Ferret::Document::Document)
430
- document = new_val
431
- else
432
- document[@options[:default_field]] = new_val.to_s
433
- end
434
- @reader.delete(id)
435
- ensure_writer_open()
436
- @writer.add_document(document)
437
- else
438
- raise ArgumentError, "Cannot update for id of type #{id.class}"
439
- end
440
- flush() if @auto_flush
441
- end
442
- end
443
-
444
# Update all the documents returned by the query.
#
# query::   The query to find documents you wish to update. Can either be
#           a string (in which case it is passed through #process_query)
#           or an actual query object.
# new_val:: The values we are updating. This can be a string in which case
#           the default field is updated, or it can be a hash, in which
#           case, all fields in the hash are updated. You can also pass a
#           full Ferret::Document::Document object, which will completely
#           replace the documents you remove. You should be careful when
#           passing a whole document to be sure that your query will
#           return one and only one result; otherwise the same document is
#           added once for every hit.
#
# Every matching document is deleted through the reader while the search
# is iterating; the updated documents are collected and only written once
# the loop has finished, because opening the writer closes the reader and
# discards the searcher (see #ensure_writer_open). The index is flushed
# afterwards when auto-flush is enabled.
def query_update(query, new_val)
  @dir.synchronize do
    ensure_searcher_open()
    docs_to_add = []
    query = process_query(query)
    @searcher.search_each(query) do |id, _score|
      document = doc(id)
      if new_val.is_a?(Hash)
        new_val.each_pair {|name, content| document[name] = content.to_s}
      elsif new_val.is_a?(Ferret::Document::Document)
        # Same fully qualified class check as #update, so both entry
        # points treat a whole-document replacement identically.
        document = new_val
      else
        document[@options[:default_field]] = new_val.to_s
      end
      docs_to_add << document
      @reader.delete(id)
    end
    ensure_writer_open()
    docs_to_add.each do |document|
      @writer.add_document(document)
    end
    flush() if @auto_flush
  end
end
480
-
481
# Asks the reader whether the index currently contains deletions.
#
# The reader is opened (or refreshed) first; the answer is whatever
# <tt>@reader.has_deletions?</tt> reports.
def has_deletions?()
  @dir.synchronize do
    ensure_reader_open
    any_deleted = @reader.has_deletions?
    return any_deleted
  end
end
489
-
490
# Returns the current value of the <tt>@has_writes</tt> flag.
#
# The flag is reset to +false+ by #flush; it is presumably set when
# documents are added (that code is outside this section -- confirm).
def has_writes?()
  @has_writes
end
495
-
496
# Closes the reader and the writer (whichever are open) and resets the
# cached reader, writer and searcher as well as the has-writes flag.
#
# Closing the writer is presumably what makes pending writes reach the
# index; this does not optimize the index.
#
# NOTE: calling this explicitly is not necessary if you only use this
# class, since the reader/writer are swapped automatically by the
# ensure_*_open helpers.
def flush()
  @dir.synchronize do
    # Reader first, then writer -- same order as before.
    [@reader, @writer].each { |handle| handle.close if handle }
    @reader = @writer = @searcher = nil
    @has_writes = false
  end
end
512
-
513
# Optimizes the index and then flushes it.
#
# Intended for indexes that will no longer be updated very often but will
# be read a lot. The work is delegated to <tt>@writer.optimize</tt>.
def optimize()
  @dir.synchronize do
    ensure_writer_open
    @writer.optimize
    flush
  end
end
522
-
523
# Returns the number of documents in the index, as reported by
# <tt>@reader.num_docs</tt>.
def size()
  @dir.synchronize do
    ensure_reader_open
    doc_count = @reader.num_docs
    return doc_count
  end
end
530
-
531
# Merges one or more other indexes into this index.
#
# indexes:: A single Index::Index, IndexReader or Store::Directory, or an
#           array of any single one of these. The type of the first
#           element decides how the whole list is handled. An empty list
#           is a no-op; an unsupported element type raises ArgumentError.
#
# This may be used to parallelize batch indexing: a large collection can
# be split into sub-collections that are indexed separately (different
# thread, process or machine, perhaps all in memory) and then combined
# with this method.
#
# Index objects are first mapped to their readers. This index's own reader
# or directory is removed from the list so that it is never merged with
# itself. The original documentation states that the index is optimized
# once the merge completes; that would happen inside the writer's
# add_indexes/add_indexes_readers calls.
def add_indexes(indexes)
  @dir.synchronize do
    sources = [indexes].flatten # always work on a flat array
    return if sources.empty?    # nothing to merge

    sources = sources.map { |idx| idx.reader } if sources.first.is_a?(Index)

    case sources.first
    when IndexReader
      ensure_reader_open
      sources.delete(@reader) # never merge with ourselves
      ensure_writer_open
      @writer.add_indexes_readers(sources)
    when Ferret::Store::Directory
      sources.delete(@dir) # never merge with ourselves
      ensure_writer_open
      @writer.add_indexes(sources)
    else
      raise ArgumentError, "Unknown index type when trying to merge indexes"
    end
  end
end
565
-
566
# This is a simple utility method for saving an in memory or RAM index to
# the file system. The same thing can be achieved by using the
# Index::Index#add_indexes method and you will have more options when
# creating the new index, however this is a simple way to turn a RAM index
# into a file system index.
#
# directory:: This can either be a Store::Directory object or a string
#             representing the path to the directory where you would
#             like to store the index.
#
# create::    True if you'd like to create the directory if it doesn't
#             exist or copy over an existing directory. False if you'd
#             like to merge with the existing directory. This defaults to
#             true. It is only used when +directory+ is a string.
#
# Raises ArgumentError if +directory+ is neither a String nor a
# Store::Directory. The check happens before any state is changed;
# without it @dir would stay unchanged and the index would be merged
# into itself.
#
# After this call the index operates on the new directory; the contents of
# the previous directory are merged into it through the writer.
def persist(directory, create = true)
  unless directory.is_a?(String) or directory.is_a?(Ferret::Store::Directory)
    raise ArgumentError,
          "Unknown directory type #{directory.class} when trying to persist index"
  end

  synchronize do
    flush()
    old_dir = @dir
    if directory.is_a?(String)
      @dir = FSDirectory.new(directory, create)
      # We created this directory ourselves, so mark it via :close_dir.
      @options[:close_dir] = true
    else
      @dir = directory
    end
    ensure_writer_open
    @writer.add_indexes([old_dir])
  end
end
594
-
595
# Builds a string with the +to_s+ form of every non-deleted document,
# one per line, for document numbers 0 up to (but excluding) +size+.
#
# NOTE(review): the upper bound is +size+, which reports the reader's
# num_docs. If document numbers are not compacted after deletions, the
# highest-numbered documents may be left out -- confirm.
def to_s
  out = ""
  size.times do |doc_num|
    next if deleted?(doc_num)
    out << (self[doc_num].to_s + "\n")
  end
  out
end
602
-
603
# Returns the searcher's explanation of how document +doc+ scored against
# +query+.
#
# query:: A query object, or a string that is first run through
#         #process_query.
# doc::   The document to explain, passed through to the searcher.
#
# This is intended for developing Similarity implementations. For good
# performance it should not be displayed with every hit: per the original
# notes, computing an explanation is as expensive as executing the query
# over the entire index.
def explain(query, doc)
  synchronize do
    ensure_searcher_open
    parsed = process_query(query)
    explanation = @searcher.explain(parsed, doc)
    return explanation
  end
end
618
-
619
- protected
620
# Makes sure <tt>@writer</tt> holds an IndexWriter.
#
# Raises a RuntimeError if the index has been closed. If a writer already
# exists nothing happens and +nil+ is returned. Otherwise an open reader
# is closed (and the searcher built on it is dropped) before a new writer
# is created on <tt>@dir</tt> with <tt>@options</tt>; that writer is
# returned.
def ensure_writer_open()
  raise "tried to use a closed index" unless @open
  unless @writer
    if @reader
      @reader.close
      @reader = nil
      @searcher = nil
    end
    @writer = IndexWriter.new(@dir, @options)
  end
end
630
-
631
# Makes sure <tt>@reader</tt> holds an IndexReader that is up to date.
#
# Returns the new reader if one is opened and +false+ if the existing
# reader could be kept. Raises a RuntimeError if the index has been closed.
#
# * No reader yet: an open writer is closed first, then a reader is
#   opened on <tt>@dir</tt>.
# * Reader exists but is no longer the latest: the stale reader is closed
#   before being replaced so that it is not leaked, and the searcher that
#   was built on it is dropped so that it is not reused. This mirrors
#   what #flush and #ensure_writer_open do when they retire a reader.
# * Reader exists and is the latest: nothing changes.
def ensure_reader_open()
  raise "tried to use a closed index" if not @open
  if @reader
    return false if @reader.latest?
    @reader.close
    @searcher = nil
  elsif @writer
    @writer.close
    @writer = nil
  end
  return @reader = IndexReader.open(@dir, false)
end
647
-
648
# Makes sure <tt>@searcher</tt> is an IndexSearcher built on the current
# reader.
#
# Raises a RuntimeError if the index has been closed. A new searcher is
# created when #ensure_reader_open had to open a reader or when no
# searcher exists yet; otherwise the existing one is kept.
def ensure_searcher_open()
  raise "tried to use a closed index" unless @open
  reader_replaced = ensure_reader_open
  @searcher = IndexSearcher.new(@reader) if reader_replaced || !@searcher
end
654
-
655
- private
656
# Runs +query+ against the index and returns the searcher's result.
#
# query::   A query object, or a string that is first run through
#           #process_query.
# options:: Passed through unchanged to <tt>@searcher.search</tt>.
def do_search(query, options)
  ensure_searcher_open
  parsed = process_query(query)
  @searcher.search(parsed, options)
end
662
-
663
# Turns a query string into a query object; anything that is not a String
# is returned untouched.
#
# The query parser is created lazily on first use. Its field list is
# refreshed from the reader on every call, in case a new field has been
# added since the last parse. This relies on <tt>@reader</tt> already
# being open.
def process_query(query)
  return query unless query.is_a?(String)

  @qp = Ferret::QueryParser.new(@default_search_field, @options) if @qp.nil?
  # we need to set this every time, in case a new field has been added
  @qp.fields = @reader.get_field_names.to_a
  @qp.parse(query)
end
674
- end
675
- end