ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,33 +0,0 @@
1
- module Ferret
2
- module Index
3
- # Useful constants representing filenames and extensions used by lucene
4
- class IndexFileNames
5
-
6
- # Name of the index segment file
7
- SEGMENTS = "segments"
8
-
9
- # Name of the index deletable file
10
- DELETABLE = "deletable"
11
-
12
- # This array contains all filename extensions used by Lucene's index files, with
13
- # one exception, namely the extension made up from +.f+ + a number.
14
- # Also note that two of Lucene's files (+deletable+ and
15
- # +segments+) don't have any filename extension.
16
- INDEX_EXTENSIONS = [
17
- "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
18
- "tvx", "tvd", "tvf", "tvp"
19
- ]
20
-
21
- # File extensions of old-style index files
22
- COMPOUND_EXTENSIONS = [
23
- "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
24
- ]
25
-
26
- # File extensions for term vector support
27
- VECTOR_EXTENSIONS = [
28
- "tvx", "tvd", "tvf"
29
- ]
30
-
31
- end
32
- end
33
- end
@@ -1,503 +0,0 @@
1
- require 'monitor'
2
-
3
- module Ferret::Index
4
- # IndexReader is an abstract class, providing an interface for accessing an
5
- # index. Search of an index is done entirely through this abstract interface,
6
- # so any class which implements it is searchable.
7
- #
8
- # Concrete subclasses of IndexReader are usually constructed with a call to
9
- # one of the static <tt>open()</tt> methods, e.g. <tt>#open</tt>.
10
- #
11
- # For efficiency, in this API documents are often referred to via
12
- # _document numbers_, non-negative integers which each name a unique
13
- # document in the index. These document numbers are ephemeral, ie they may change
14
- # as documents are added to and deleted from an index. Clients should thus not
15
- # rely on a given document having the same number between sessions.
16
- #
17
- # An IndexReader can be opened on a directory for which an IndexWriter is
18
- # opened already, but it cannot be used to delete documents from the index then.
19
- class IndexReader
20
- include MonitorMixin
21
-
22
- # This array contains all filename extensions used by Lucene's index files, with
23
- # one exception, namely the extension made up from +.f+ + a number.
24
- # Also note that two of Lucene's files (+deletable+ and
25
- # +segments+) don't have any filename extension.
26
- FILENAME_EXTENSIONS = ["cfs",
27
- "fnm",
28
- "fdx",
29
- "fdt",
30
- "tii",
31
- "tis",
32
- "frq",
33
- "prx",
34
- "del",
35
- "tvx",
36
- "tvd",
37
- "tvf",
38
- "tvp"]
39
-
40
- attr_reader :directory
41
-
42
- class FieldOption < Ferret::Utils::Parameter
43
- # all fields
44
- ALL = FieldOption.new("ALL")
45
- # all indexed fields
46
- INDEXED = FieldOption.new("INDEXED")
47
- # all fields which are not indexed
48
- UNINDEXED = FieldOption.new("UNINDEXED")
49
- # all fields which are indexed with termvectors enabled
50
- INDEXED_WITH_TERM_VECTOR = FieldOption.new("INDEXED_WITH_TERM_VECTOR")
51
- # all fields which are indexed but don't have termvectors enabled
52
- INDEXED_NO_TERM_VECTOR = FieldOption.new("INDEXED_NO_TERM_VECTOR")
53
- # all fields where termvectors are enabled. Please note that only standard
54
- # termvector fields are returned
55
- TERM_VECTOR = FieldOption.new("TERM_VECTOR")
56
- # all fields with termvectors with positions enabled
57
- TERM_VECTOR_WITH_POSITION = FieldOption.new("TERM_VECTOR_WITH_POSITION")
58
- # all fields where termvectors with offset position are set
59
- TERM_VECTOR_WITH_OFFSET = FieldOption.new("TERM_VECTOR_WITH_OFFSET")
60
- # all fields where termvectors with offset and position values set
61
- TERM_VECTOR_WITH_POSITION_OFFSET =
62
- FieldOption.new("TERM_VECTOR_WITH_POSITION_OFFSET")
63
- end
64
-
65
- # To create an IndexReader use the IndexReader.open method. This method
66
- # should only be used by subclasses.
67
- #
68
- # directory:: Directory where IndexReader files reside.
69
- # segment_infos:: Used for the write lock
70
- # close_directory:: close the directory when the index reader is closed
71
- def initialize(directory, segment_infos = nil,
72
- close_directory = false, directory_owner = false)
73
- super()
74
- @directory = directory
75
- @close_directory = close_directory
76
- @segment_infos = segment_infos
77
- @directory_owner = directory_owner
78
-
79
- @has_changes = false
80
- @stale = false
81
- @write_lock = nil
82
-
83
- #ObjectSpace.define_finalizer(self, lambda { |id| @write_lock.release() if @write_lock})
84
- end
85
-
86
- # Returns an index reader to read the index in the directory
87
- #
88
- # directory:: This can either be a Directory object or you can pass
89
- # nil (RamDirectory is created) or a path (FSDirectory
90
- # is created). If you chose the second or third options,
91
- # you should leave close_directory as true and infos as
92
- # nil.
93
- # close_directory:: True if you want the IndexReader to close the
94
- # directory when the IndexReader is closed. You'll want
95
- # to set this to false if other objects are using the
96
- # same directory object.
97
- # infos:: Expert: This can be used to read a different version
98
- # of the index but should really be left alone.
99
- def IndexReader.open(directory, close_directory = true, infos = nil)
100
- if directory.nil?
101
- directory = Ferret::Store::RAMDirectory.new
102
- elsif directory.is_a?(String)
103
- directory = Ferret::Store::FSDirectory.new(directory, false)
104
- end
105
- directory.synchronize do # in- & inter-process sync
106
- commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
107
- commit_lock.while_locked() do
108
- if infos.nil?
109
- infos = SegmentInfos.new()
110
- infos.read(directory)
111
- end
112
- if (infos.size() == 1) # index is optimized
113
- return SegmentReader.get(infos[0], infos, close_directory)
114
- end
115
- readers = Array.new(infos.size)
116
- infos.size.times do |i|
117
- readers[i] = SegmentReader.get(infos[i])
118
- end
119
- return MultiReader.new(readers, directory, infos, close_directory)
120
- end
121
- end
122
- end
123
-
124
- # Reads version number from segments files. The version number counts the
125
- # number of changes of the index.
126
- #
127
- # directory:: where the index resides.
128
- # returns:: version number.
129
- # raises:: IOError if segments file cannot be read.
130
- def IndexReader.get_current_version(directory)
131
- return SegmentInfos.read_current_version(directory)
132
- end
133
-
134
- # Return an array of term vectors for the specified document. The array
135
- # contains a vector for each vectorized field in the document. Each vector
136
- # contains terms and frequencies for all terms in a given vectorized field.
137
- # If no such fields existed, the method returns nil. The term vectors that
138
- # are returned may either be of type TermFreqVector or of type
139
- # TermDocPosEnumVector if positions or offsets have been stored.
140
- #
141
- # doc_number:: document for which term vectors are returned
142
- # returns:: array of term vectors. May be nil if no term vectors have been
143
- # stored for the specified document.
144
- # raises:: IOError if index cannot be accessed
145
- #
146
- # See Field::TermVector
147
- def get_term_vectors(doc_number)
148
- raise NotImplementedError
149
- end
150
-
151
-
152
-
153
- # Return a term vector for the specified document and field. The returned
154
- # vector contains terms and frequencies for the terms in the specified
155
- # field of this document, if the field had the storeTermVector flag set. If
156
- # termvectors had been stored with positions or offsets, a
157
- # TermDocPosEnumVector is returned.
158
- #
159
- # doc_number:: document for which the term vector is returned
160
- # field:: field for which the term vector is returned.
161
- # returns:: term vector May be nil if field does not exist in the specified
162
- # document or term vector was not stored.
163
- # raises:: IOError if index cannot be accessed
164
- # See Field::TermVector
165
- def get_term_vector(doc_number, field)
166
- raise NotImplementedError
167
- end
168
-
169
-
170
- # Returns +true+ if an index exists at the specified directory. If the
171
- # directory does not exist or if there is no index in it.
172
- #
173
- # directory:: the directory to check for an index
174
- # returns:: +true+ if an index exists; +false+ otherwise
175
- # raises:: IOError if there is a problem with accessing the index
176
- def IndexReader.index_exists?(directory)
177
- return directory.exists?("segments")
178
- end
179
-
180
- # Returns the number of documents in this index.
181
- def num_docs()
182
- raise NotImplementedError
183
- end
184
-
185
- # Returns one greater than the largest possible document number.
186
- #
187
- # This may be used to, e.g., determine how big to allocate an array which
188
- # will have an element for every document number in an index.
189
- def max_doc()
190
- raise NotImplementedError
191
- end
192
-
193
- # Returns the stored fields of the +n+<sup>th</sup>
194
- # +Document+ in this index.
195
- def get_document(n)
196
- raise NotImplementedError
197
- end
198
-
199
- # Returns the first document with the term +term+. This is useful, for
200
- # example, if we are indexing rows from a database. We can store the id of
201
- # each row in a field in the index and use this method get the document by
202
- # the id. Hence, only one document is returned.
203
- #
204
- # term: The term we are searching for.
205
- def get_document_with_term(term)
206
- docs = term_docs_for(term)
207
- if (docs == nil) then return nil end
208
- document = nil
209
- begin
210
- document = get_document(docs.doc) if docs.next?
211
- ensure
212
- docs.close()
213
- end
214
- return document
215
- end
216
-
217
- # Returns true if document _n_ has been deleted
218
- def deleted?(n)
219
- raise NotImplementedError
220
- end
221
-
222
- # Returns true if any documents have been deleted
223
- def has_deletions?()
224
- raise NotImplementedError
225
- end
226
-
227
- # Returns true if there are norms stored for this field.
228
- def has_norms?(field)
229
- # backward compatible implementation.
230
- # SegmentReader has an efficient implementation.
231
- return (get_norms(field) != nil)
232
- end
233
-
234
- # Returns the byte-encoded normalization factor for the named field of
235
- # every document. This is used by the search code to score documents.
236
- #
237
- # See Field#boost
238
- def get_norms(field)
239
- raise NotImplementedError
240
- end
241
-
242
- # Read norms into a pre-allocated array. This is used as an optimization
243
- # of get_norms.
244
- #
245
- # See Field#boost
246
- def get_norms_into(field, bytes, offset)
247
- raise NotImplementedError
248
- end
249
-
250
- # Expert: Resets the normalization factor for the named field of the named
251
- # document. The norm represents the product of the field's Field#boost and
252
- # its Similarity#length_norm length normalization. Thus, to preserve the
253
- # length normalization values when resetting this, one should base the new
254
- # value upon the old.
255
- #
256
- # See #get_norms
257
- # See Similarity#decode_norm
258
- def set_norm(doc, field, value)
259
- synchronize do
260
- value = Similarity.encode_norm(value) if value.is_a? Float
261
- if(@directory_owner)
262
- acquire_write_lock()
263
- end
264
- do_set_norm(doc, field, value)
265
- @has_changes = true
266
- end
267
- end
268
-
269
- # Implements set_norm in subclass.
270
- def do_set_norm(doc, field, value)
271
- raise NotImplementedError
272
- end
273
-
274
- # Returns an enumeration of all the terms in the index.
275
- # Each term is greater than all that precede it in the enumeration.
276
- def terms()
277
- raise NotImplementedError
278
- end
279
-
280
- # Returns an enumeration of all terms after a given term.
281
- #
282
- # Each term is greater than all that precede it in the enumeration.
283
- def terms_from(t)
284
- raise NotImplementedError
285
- end
286
-
287
- # Returns the number of documents containing the term +t+.
288
- def doc_freq(t)
289
- raise NotImplementedError
290
- end
291
-
292
- # Returns an enumeration of all the documents which contain +term+. For each
293
- # document, the document number, the frequency of the term in that document
294
- # is also provided, for use in search scoring. Thus, this method implements
295
- # the mapping:
296
- #
297
- # Term => <doc_num, freq><sup>*</sup>
298
- #
299
- # The enumeration is ordered by document number. Each document number is
300
- # greater than all that precede it in the enumeration.
301
- def term_docs_for(term)
302
- term_docs = term_docs()
303
- term_docs.seek(term)
304
- return term_docs
305
- end
306
-
307
- # Returns an unpositioned TermDocEnum enumerator.
308
- def term_docs()
309
- raise NotImplementedError
310
- end
311
-
312
- # Returns an enumeration of all the documents which contain
313
- # +term+. For each document, in addition to the document number
314
- # and frequency of the term in that document, a list of all of the ordinal
315
- # positions of the term in the document is available. Thus, this method
316
- # implements the mapping:
317
- #
318
- # Term => <doc_num, freq, < pos<sub>1</sub>, pos<sub>2</sub>, ...
319
- # pos<sub>freq-1</sub> > > <sup>*</sup>
320
- #
321
- # This positional information facilitates phrase and proximity searching.
322
- # The enumeration is ordered by document number. Each document number is
323
- # greater than all that precede it in the enumeration.
324
- def term_positions_for(term)
325
- term_positions = term_positions()
326
- term_positions.seek(term)
327
- return term_positions
328
- end
329
-
330
- # Returns an unpositioned TermDocPosEnum enumerator.
331
- def term_positions()
332
- raise NotImplementedError
333
- end
334
-
335
- # Tries to acquire the WriteLock on this directory.
336
- #
337
- # This method is only valid if this IndexReader is directory owner.
338
- #
339
- # raises:: IOError If WriteLock cannot be acquired.
340
- def acquire_write_lock()
341
- if @stale
342
- raise IOError, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations"
343
- end
344
-
345
- if (@write_lock == nil)
346
- @write_lock = @directory.make_lock(IndexWriter::WRITE_LOCK_NAME)
347
- if not @write_lock.obtain(IndexWriter::WRITE_LOCK_TIMEOUT) # obtain write lock
348
- raise IOError, "Index locked for write: " + @write_lock
349
- end
350
-
351
- # we have to check whether index has changed since this reader was opened.
352
- # if so, this reader is no longer valid for deletion
353
- if (SegmentInfos.read_current_version(@directory) > @segment_infos.version())
354
- @stale = true
355
- @write_lock.release()
356
- @write_lock = nil
357
- raise IOError, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations"
358
- end
359
- end
360
- end
361
-
362
- # Returns true if the reader is reading from the latest version of the
363
- # index.
364
- def latest?()
365
- SegmentInfos.read_current_version(@directory) == @segment_infos.version()
366
- end
367
-
368
- # Deletes the document numbered +doc_num+. Once a document is deleted it
369
- # will not appear in TermDocEnum or TermPositions enumerations. Attempts to
370
- # read its fields with the #get_document method will result in an error.
371
- # The presence of this document may still be reflected in the
372
- # #doc_freq statistic, though this will be corrected eventually as the
373
- # index is further modified.
374
- def delete(doc_num)
375
- synchronize do
376
- acquire_write_lock() if @directory_owner
377
- do_delete(doc_num)
378
- @has_changes = true
379
- end
380
- return 1
381
- end
382
-
383
- # Implements deletion of the document numbered +doc_num+.
384
- # Applications should call #delete(doc_num) or #delete_docs_with_term(term).
385
- def do_delete(doc_num)
386
- raise NotImplementedError
387
- end
388
-
389
- # Deletes all documents containing +term+.
390
- # This is useful if one uses a document field to hold a unique ID string for
391
- # the document. Then to delete such a document, one merely constructs a
392
- # term with the appropriate field and the unique ID string as its text and
393
- # passes it to this method. Returns the number of documents deleted. See
394
- # #delete for information about when this deletion will become effective.
395
- def delete_docs_with_term(term)
396
- docs = term_docs_for(term)
397
- if (docs == nil) then return 0 end
398
- n = 0
399
- begin
400
- while (docs.next?)
401
- delete(docs.doc)
402
- n += 1
403
- end
404
- ensure
405
- docs.close()
406
- end
407
- return n
408
- end
409
-
410
- # Undeletes all documents currently marked as deleted in this index.
411
- def undelete_all()
412
- synchronize do
413
- acquire_write_lock() if @directory_owner
414
- do_undelete_all()
415
- @has_changes = true
416
- end
417
- end
418
-
419
- # Commit changes resulting from delete, undelete_all, or set_norm operations
420
- #
421
- # raises:: IOError
422
- def commit()
423
- synchronize do
424
- if @has_changes
425
- if @directory_owner
426
- @directory.synchronize do # in- & inter-process sync
427
- commit_lock = @directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
428
- commit_lock.while_locked do
429
- do_commit()
430
- @segment_infos.write(@directory)
431
- end
432
- end
433
- if (@write_lock != nil)
434
- @write_lock.release() # release write lock
435
- @write_lock = nil
436
- end
437
- else
438
- do_commit()
439
- end
440
- end
441
- @has_changes = false
442
- end
443
- end
444
-
445
- # Closes files associated with this index.
446
- # Also saves any new deletions to disk.
447
- # No other methods should be called after this has been called.
448
- def close()
449
- synchronize do
450
- commit()
451
- do_close()
452
- @directory.close() if @close_directory
453
- end
454
- end
455
-
456
- protected
457
-
458
- # Implements actual undelete_all() in subclass.
459
- def do_undelete_all()
460
- raise NotImplementedError
461
- end
462
-
463
- # Implements commit.
464
- def do_commit()
465
- raise NotImplementedError
466
- end
467
-
468
-
469
- # Implements close.
470
- def do_close()
471
- raise NotImplementedError
472
- end
473
-
474
- # Get a list of unique field names that exist in this index and have the
475
- # specified field option information.
476
- # fld_option:: specifies which field option should be available for the
477
- # returned fields
478
- # returns:: Collection of Strings indicating the names of the fields.
479
- # See IndexReader.FieldOption
480
- def get_field_names()
481
- raise NotImplementedError
482
- end
483
-
484
- # Returns +true+ iff the index in the named directory is
485
- # currently locked.
486
- # directory:: the directory to check for a lock
487
- # raises:: IOError if there is a problem with accessing the index
488
- def IndexReader.locked?(directory)
489
- return (directory.make_lock(IndexWriter::WRITE_LOCK_NAME).locked? or
490
- directory.make_lock(IndexWriter::COMMIT_LOCK_NAME).locked?)
491
- end
492
-
493
- # Forcibly unlocks the index in the named directory.
494
- #
495
- # Caution: this should only be used by failure recovery code,
496
- # when it is known that no other process nor thread is in fact
497
- # currently accessing this index.
498
- def IndexReader.unlock(directory)
499
- directory.make_lock(IndexWriter::WRITE_LOCK_NAME).release
500
- directory.make_lock(IndexWriter::COMMIT_LOCK_NAME).release
501
- end
502
- end
503
- end