ferret 0.9.6 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
@@ -1,33 +0,0 @@
1
- module Ferret
2
- module Index
3
- # Useful constants representing filenames and extensions used by Lucene
4
- class IndexFileNames
5
-
6
- # Name of the index segment file
7
- SEGMENTS = "segments"
8
-
9
- # Name of the index deletable file
10
- DELETABLE = "deletable"
11
-
12
- # This array contains all filename extensions used by Lucene's index files, with
13
- # one exception, namely the extension made up from +.f+ + a number.
14
- # Also note that two of Lucene's files (+deletable+ and
15
- # +segments+) don't have any filename extension.
16
- INDEX_EXTENSIONS = [
17
- "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
18
- "tvx", "tvd", "tvf", "tvp"
19
- ]
20
-
21
- # File extensions of old-style index files
22
- COMPOUND_EXTENSIONS = [
23
- "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
24
- ]
25
-
26
- # File extensions for term vector support
27
- VECTOR_EXTENSIONS = [
28
- "tvx", "tvd", "tvf"
29
- ]
30
-
31
- end
32
- end
33
- end
@@ -1,503 +0,0 @@
1
- require 'monitor'
2
-
3
- module Ferret::Index
4
- # IndexReader is an abstract class, providing an interface for accessing an
5
- # index. Search of an index is done entirely through this abstract interface,
6
- # so that any subclass which implements it is searchable.
7
- #
8
- # Concrete subclasses of IndexReader are usually constructed with a call to
9
- # one of the static <tt>open()</tt> methods, e.g. <tt>#open</tt>.
10
- #
11
- # For efficiency, in this API documents are often referred to via
12
- # _document numbers_, non-negative integers which each name a unique
13
- # document in the index. These document numbers are ephemeral, ie they may change
14
- # as documents are added to and deleted from an index. Clients should thus not
15
- # rely on a given document having the same number between sessions.
16
- #
17
- # An IndexReader can be opened on a directory for which an IndexWriter is
18
- # opened already, but it cannot be used to delete documents from the index then.
19
- class IndexReader
20
- include MonitorMixin
21
-
22
- # This array contains all filename extensions used by Lucene's index files, with
23
- # one exception, namely the extension made up from +.f+ + a number.
24
- # Also note that two of Lucene's files (+deletable+ and
25
- # +segments+) don't have any filename extension.
26
- FILENAME_EXTENSIONS = ["cfs",
27
- "fnm",
28
- "fdx",
29
- "fdt",
30
- "tii",
31
- "tis",
32
- "frq",
33
- "prx",
34
- "del",
35
- "tvx",
36
- "tvd",
37
- "tvf",
38
- "tvp"]
39
-
40
- attr_reader :directory
41
-
42
- class FieldOption < Ferret::Utils::Parameter
43
- # all fields
44
- ALL = FieldOption.new("ALL")
45
- # all indexed fields
46
- INDEXED = FieldOption.new("INDEXED")
47
- # all fields which are not indexed
48
- UNINDEXED = FieldOption.new("UNINDEXED")
49
- # all fields which are indexed with termvectors enabled
50
- INDEXED_WITH_TERM_VECTOR = FieldOption.new("INDEXED_WITH_TERM_VECTOR")
51
- # all fields which are indexed but don't have termvectors enabled
52
- INDEXED_NO_TERM_VECTOR = FieldOption.new("INDEXED_NO_TERM_VECTOR")
53
- # all fields where termvectors are enabled. Please note that only standard
54
- # termvector fields are returned
55
- TERM_VECTOR = FieldOption.new("TERM_VECTOR")
56
- # all fields with termvectors with positions enabled
57
- TERM_VECTOR_WITH_POSITION = FieldOption.new("TERM_VECTOR_WITH_POSITION")
58
- # all fields where termvectors with offset position are set
59
- TERM_VECTOR_WITH_OFFSET = FieldOption.new("TERM_VECTOR_WITH_OFFSET")
60
- # all fields where termvectors with offset and position values set
61
- TERM_VECTOR_WITH_POSITION_OFFSET =
62
- FieldOption.new("TERM_VECTOR_WITH_POSITION_OFFSET")
63
- end
64
-
65
- # To create an IndexReader use the IndexReader.open method. This method
66
- # should only be used by subclasses.
67
- #
68
- # directory:: Directory where IndexReader files reside.
69
- # segment_infos:: Used for write-lock
70
- # close_directory:: close the directory when the index reader is closed
71
- def initialize(directory, segment_infos = nil,
72
- close_directory = false, directory_owner = false)
73
- super()
74
- @directory = directory
75
- @close_directory = close_directory
76
- @segment_infos = segment_infos
77
- @directory_owner = directory_owner
78
-
79
- @has_changes = false
80
- @stale = false
81
- @write_lock = nil
82
-
83
- #ObjectSpace.define_finalizer(self, lambda { |id| @write_lock.release() if @write_lock})
84
- end
85
-
86
- # Returns an index reader to read the index in the directory
87
- #
88
- # directory:: This can either be a Directory object or you can pass
89
- # nil (RamDirectory is created) or a path (FSDirectory
90
- # is created). If you chose the second or third options,
91
- # you should leave close_directory as true and infos as
92
- # nil.
93
- # close_directory:: True if you want the IndexReader to close the
94
- # directory when the IndexReader is closed. You'll want
95
- # to set this to false if other objects are using the
96
- # same directory object.
97
- # infos:: Expert: This can be used to read a different version
98
- # of the index but should really be left alone.
99
- def IndexReader.open(directory, close_directory = true, infos = nil)
100
- if directory.nil?
101
- directory = Ferret::Store::RAMDirectory.new
102
- elsif directory.is_a?(String)
103
- directory = Ferret::Store::FSDirectory.new(directory, false)
104
- end
105
- directory.synchronize do # in- & inter-process sync
106
- commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
107
- commit_lock.while_locked() do
108
- if infos.nil?
109
- infos = SegmentInfos.new()
110
- infos.read(directory)
111
- end
112
- if (infos.size() == 1) # index is optimized
113
- return SegmentReader.get(infos[0], infos, close_directory)
114
- end
115
- readers = Array.new(infos.size)
116
- infos.size.times do |i|
117
- readers[i] = SegmentReader.get(infos[i])
118
- end
119
- return MultiReader.new(readers, directory, infos, close_directory)
120
- end
121
- end
122
- end
123
-
124
- # Reads version number from segments files. The version number counts the
125
- # number of changes of the index.
126
- #
127
- # directory:: where the index resides.
128
- # returns:: version number.
129
- # raises:: IOError if segments file cannot be read.
130
- def IndexReader.get_current_version(directory)
131
- return SegmentInfos.read_current_version(directory)
132
- end
133
-
134
- # Return an array of term vectors for the specified document. The array
135
- # contains a vector for each vectorized field in the document. Each vector
136
- # contains terms and frequencies for all terms in a given vectorized field.
137
- # If no such fields existed, the method returns nil. The term vectors that
138
- # are returned may either be of type TermFreqVector or of type
139
- # TermDocPosEnumVector if positions or offsets have been stored.
140
- #
141
- # doc_number:: document for which term vectors are returned
142
- # returns:: array of term vectors. May be nil if no term vectors have been
143
- # stored for the specified document.
144
- # raises:: IOError if index cannot be accessed
145
- #
146
- # See Field::TermVector
147
- def get_term_vectors(doc_number)
148
- raise NotImplementedError
149
- end
150
-
151
-
152
-
153
- # Return a term vector for the specified document and field. The returned
154
- # vector contains terms and frequencies for the terms in the specified
155
- # field of this document, if the field had the storeTermVector flag set. If
156
- # termvectors had been stored with positions or offsets, a
157
- # TermDocPosEnumVector is returned.
158
- #
159
- # doc_number:: document for which the term vector is returned
160
- # field:: field for which the term vector is returned.
161
- # returns:: term vector May be nil if field does not exist in the specified
162
- # document or term vector was not stored.
163
- # raises:: IOError if index cannot be accessed
164
- # See Field::TermVector
165
- def get_term_vector(doc_number, field)
166
- raise NotImplementedError
167
- end
168
-
169
-
170
- # Returns +true+ if an index exists at the specified directory. If the
171
- # directory does not exist or if there is no index in it, +false+ is returned.
172
- #
173
- # directory:: the directory to check for an index
174
- # returns:: +true+ if an index exists; +false+ otherwise
175
- # raises:: IOError if there is a problem with accessing the index
176
- def IndexReader.index_exists?(directory)
177
- return directory.exists?("segments")
178
- end
179
-
180
- # Returns the number of documents in this index.
181
- def num_docs()
182
- raise NotImplementedError
183
- end
184
-
185
- # Returns one greater than the largest possible document number.
186
- #
187
- # This may be used to, e.g., determine how big to allocate an array which
188
- # will have an element for every document number in an index.
189
- def max_doc()
190
- raise NotImplementedError
191
- end
192
-
193
- # Returns the stored fields of the +n+<sup>th</sup>
194
- # +Document+ in this index.
195
- def get_document(n)
196
- raise NotImplementedError
197
- end
198
-
199
- # Returns the first document with the term +term+. This is useful, for
200
- # example, if we are indexing rows from a database. We can store the id of
201
- # each row in a field in the index and use this method get the document by
202
- # the id. Hence, only one document is returned.
203
- #
204
- # term: The term we are searching for.
205
- def get_document_with_term(term)
206
- docs = term_docs_for(term)
207
- if (docs == nil) then return nil end
208
- document = nil
209
- begin
210
- document = get_document(docs.doc) if docs.next?
211
- ensure
212
- docs.close()
213
- end
214
- return document
215
- end
216
-
217
- # Returns true if document _n_ has been deleted
218
- def deleted?(n)
219
- raise NotImplementedError
220
- end
221
-
222
- # Returns true if any documents have been deleted
223
- def has_deletions?()
224
- raise NotImplementedError
225
- end
226
-
227
- # Returns true if there are norms stored for this field.
228
- def has_norms?(field)
229
- # backward compatible implementation.
230
- # SegmentReader has an efficient implementation.
231
- return (get_norms(field) != nil)
232
- end
233
-
234
- # Returns the byte-encoded normalization factor for the named field of
235
- # every document. This is used by the search code to score documents.
236
- #
237
- # See Field#boost
238
- def get_norms(field)
239
- raise NotImplementedError
240
- end
241
-
242
- # Read norms into a pre-allocated array. This is used as an optimization
243
- # of get_norms.
244
- #
245
- # See Field#boost
246
- def get_norms_into(field, bytes, offset)
247
- raise NotImplementedError
248
- end
249
-
250
- # Expert: Resets the normalization factor for the named field of the named
251
- # document. The norm represents the product of the field's Field#boost and
252
- # its Similarity#length_norm length normalization. Thus, to preserve the
253
- # length normalization values when resetting this, one should base the new
254
- # value upon the old.
255
- #
256
- # See #get_norms
257
- # See Similarity#decode_norm
258
- def set_norm(doc, field, value)
259
- synchronize do
260
- value = Similarity.encode_norm(value) if value.is_a? Float
261
- if(@directory_owner)
262
- acquire_write_lock()
263
- end
264
- do_set_norm(doc, field, value)
265
- @has_changes = true
266
- end
267
- end
268
-
269
- # Implements set_norm in subclass.
270
- def do_set_norm(doc, field, value)
271
- raise NotImplementedError
272
- end
273
-
274
- # Returns an enumeration of all the terms in the index.
275
- # Each term is greater than all that precede it in the enumeration.
276
- def terms()
277
- raise NotImplementedError
278
- end
279
-
280
- # Returns an enumeration of all terms after a given term.
281
- #
282
- # Each term is greater than all that precede it in the enumeration.
283
- def terms_from(t)
284
- raise NotImplementedError
285
- end
286
-
287
- # Returns the number of documents containing the term +t+.
288
- def doc_freq(t)
289
- raise NotImplementedError
290
- end
291
-
292
- # Returns an enumeration of all the documents which contain +term+. For each
293
- # document, the document number, the frequency of the term in that document
294
- # is also provided, for use in search scoring. Thus, this method implements
295
- # the mapping:
296
- #
297
- # Term => <doc_num, freq><sup>*</sup>
298
- #
299
- # The enumeration is ordered by document number. Each document number is
300
- # greater than all that precede it in the enumeration.
301
- def term_docs_for(term)
302
- term_docs = term_docs()
303
- term_docs.seek(term)
304
- return term_docs
305
- end
306
-
307
- # Returns an unpositioned TermDocEnum enumerator.
308
- def term_docs()
309
- raise NotImplementedError
310
- end
311
-
312
- # Returns an enumeration of all the documents which contain
313
- # +term+. For each document, in addition to the document number
314
- # and frequency of the term in that document, a list of all of the ordinal
315
- # positions of the term in the document is available. Thus, this method
316
- # implements the mapping:
317
- #
318
- # Term => <doc_num, freq, < pos<sub>1</sub>, pos<sub>2</sub>, ...
319
- # pos<sub>freq-1</sub> > > <sup>*</sup>
320
- #
321
- # This positional information facilitates phrase and proximity searching.
322
- # The enumeration is ordered by document number. Each document number is
323
- # greater than all that precede it in the enumeration.
324
- def term_positions_for(term)
325
- term_positions = term_positions()
326
- term_positions.seek(term)
327
- return term_positions
328
- end
329
-
330
- # Returns an unpositioned TermDocPosEnum enumerator.
331
- def term_positions()
332
- raise NotImplementedError
333
- end
334
-
335
- # Tries to acquire the WriteLock on this directory.
336
- #
337
- # This method is only valid if this IndexReader is directory owner.
338
- #
339
- # raises:: IOError If WriteLock cannot be acquired.
340
- def acquire_write_lock()
341
- if @stale
342
- raise IOError, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations"
343
- end
344
-
345
- if (@write_lock == nil)
346
- @write_lock = @directory.make_lock(IndexWriter::WRITE_LOCK_NAME)
347
- if not @write_lock.obtain(IndexWriter::WRITE_LOCK_TIMEOUT) # obtain write lock
348
- raise IOError, "Index locked for write: " + @write_lock
349
- end
350
-
351
- # we have to check whether index has changed since this reader was opened.
352
- # if so, this reader is no longer valid for deletion
353
- if (SegmentInfos.read_current_version(@directory) > @segment_infos.version())
354
- @stale = true
355
- @write_lock.release()
356
- @write_lock = nil
357
- raise IOError, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations"
358
- end
359
- end
360
- end
361
-
362
- # Returns true if the reader is reading from the latest version of the
363
- # index.
364
- def latest?()
365
- SegmentInfos.read_current_version(@directory) == @segment_infos.version()
366
- end
367
-
368
- # Deletes the document numbered +doc_num+. Once a document is deleted it
369
- # will not appear in TermDocEnum or TermPositions enumerations. Attempts to
370
- # read its field with the #get_document method will result in an error.
371
- # The presence of this document may still be reflected in the
372
- # #doc_freq statistic, though this will be corrected eventually as the
373
- # index is further modified.
374
- def delete(doc_num)
375
- synchronize do
376
- acquire_write_lock() if @directory_owner
377
- do_delete(doc_num)
378
- @has_changes = true
379
- end
380
- return 1
381
- end
382
-
383
- # Implements deletion of the document numbered +doc_num+.
384
- # Applications should call #delete or #delete_docs_with_term.
385
- def do_delete(doc_num)
386
- raise NotImplementedError
387
- end
388
-
389
- # Deletes all documents containing +term+.
390
- # This is useful if one uses a document field to hold a unique ID string for
391
- # the document. Then to delete such a document, one merely constructs a
392
- # term with the appropriate field and the unique ID string as its text and
393
- # passes it to this method. Returns the number of documents deleted. See
394
- # #delete for information about when this deletion will become effective.
395
- def delete_docs_with_term(term)
396
- docs = term_docs_for(term)
397
- if (docs == nil) then return 0 end
398
- n = 0
399
- begin
400
- while (docs.next?)
401
- delete(docs.doc)
402
- n += 1
403
- end
404
- ensure
405
- docs.close()
406
- end
407
- return n
408
- end
409
-
410
- # Undeletes all documents currently marked as deleted in this index.
411
- def undelete_all()
412
- synchronize do
413
- acquire_write_lock() if @directory_owner
414
- do_undelete_all()
415
- @has_changes = true
416
- end
417
- end
418
-
419
- # Commit changes resulting from delete, undelete_all, or set_norm operations
420
- #
421
- # raises:: IOError
422
- def commit()
423
- synchronize do
424
- if @has_changes
425
- if @directory_owner
426
- @directory.synchronize do # in- & inter-process sync
427
- commit_lock = @directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
428
- commit_lock.while_locked do
429
- do_commit()
430
- @segment_infos.write(@directory)
431
- end
432
- end
433
- if (@write_lock != nil)
434
- @write_lock.release() # release write lock
435
- @write_lock = nil
436
- end
437
- else
438
- do_commit()
439
- end
440
- end
441
- @has_changes = false
442
- end
443
- end
444
-
445
- # Closes files associated with this index.
446
- # Also saves any new deletions to disk.
447
- # No other methods should be called after this has been called.
448
- def close()
449
- synchronize do
450
- commit()
451
- do_close()
452
- @directory.close() if @close_directory
453
- end
454
- end
455
-
456
- protected
457
-
458
- # Implements actual undelete_all() in subclass.
459
- def do_undelete_all()
460
- raise NotImplementedError
461
- end
462
-
463
- # Implements commit.
464
- def do_commit()
465
- raise NotImplementedError
466
- end
467
-
468
-
469
- # Implements close.
470
- def do_close()
471
- raise NotImplementedError
472
- end
473
-
474
- # Get a list of unique field names that exist in this index and have the
475
- # specified field option information.
476
- # fld_option:: specifies which field option should be available for the
477
- # returned fields
478
- # returns:: Collection of Strings indicating the names of the fields.
479
- # See IndexReader.FieldOption
480
- def get_field_names()
481
- raise NotImplementedError
482
- end
483
-
484
- # Returns +true+ iff the index in the named directory is
485
- # currently locked.
486
- # directory:: the directory to check for a lock
487
- # raises:: IOError if there is a problem with accessing the index
488
- def IndexReader.locked?(directory)
489
- return (directory.make_lock(IndexWriter::WRITE_LOCK_NAME).locked? or
490
- directory.make_lock(IndexWriter::COMMIT_LOCK_NAME).locked?)
491
- end
492
-
493
- # Forcibly unlocks the index in the named directory.
494
- #
495
- # Caution: this should only be used by failure recovery code,
496
- # when it is known that no other process nor thread is in fact
497
- # currently accessing this index.
498
- def IndexReader.unlock(directory)
499
- directory.make_lock(IndexWriter::WRITE_LOCK_NAME).release
500
- directory.make_lock(IndexWriter::COMMIT_LOCK_NAME).release
501
- end
502
- end
503
- end