ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295) hide show
  1. data/MIT-LICENSE +1 -1
  2. data/README +12 -24
  3. data/Rakefile +38 -54
  4. data/TODO +14 -17
  5. data/ext/analysis.c +982 -823
  6. data/ext/analysis.h +133 -76
  7. data/ext/array.c +96 -58
  8. data/ext/array.h +40 -13
  9. data/ext/bitvector.c +476 -118
  10. data/ext/bitvector.h +264 -22
  11. data/ext/compound_io.c +217 -229
  12. data/ext/defines.h +49 -0
  13. data/ext/document.c +107 -317
  14. data/ext/document.h +31 -65
  15. data/ext/except.c +81 -36
  16. data/ext/except.h +117 -55
  17. data/ext/extconf.rb +2 -9
  18. data/ext/ferret.c +211 -104
  19. data/ext/ferret.h +22 -11
  20. data/ext/filter.c +97 -82
  21. data/ext/fs_store.c +348 -367
  22. data/ext/global.c +226 -188
  23. data/ext/global.h +44 -26
  24. data/ext/hash.c +474 -391
  25. data/ext/hash.h +441 -68
  26. data/ext/hashset.c +124 -96
  27. data/ext/hashset.h +169 -20
  28. data/ext/helper.c +56 -5
  29. data/ext/helper.h +7 -0
  30. data/ext/inc/lang.h +29 -49
  31. data/ext/inc/threading.h +31 -0
  32. data/ext/ind.c +288 -278
  33. data/ext/ind.h +68 -0
  34. data/ext/index.c +5688 -0
  35. data/ext/index.h +663 -616
  36. data/ext/lang.h +29 -49
  37. data/ext/libstemmer.c +3 -3
  38. data/ext/mem_pool.c +84 -0
  39. data/ext/mem_pool.h +35 -0
  40. data/ext/posh.c +1006 -0
  41. data/ext/posh.h +1007 -0
  42. data/ext/priorityqueue.c +117 -194
  43. data/ext/priorityqueue.h +135 -39
  44. data/ext/q_boolean.c +1305 -1108
  45. data/ext/q_const_score.c +106 -93
  46. data/ext/q_filtered_query.c +138 -135
  47. data/ext/q_fuzzy.c +206 -242
  48. data/ext/q_match_all.c +94 -80
  49. data/ext/q_multi_term.c +663 -0
  50. data/ext/q_parser.c +667 -593
  51. data/ext/q_phrase.c +992 -555
  52. data/ext/q_prefix.c +72 -61
  53. data/ext/q_range.c +235 -210
  54. data/ext/q_span.c +1480 -1166
  55. data/ext/q_term.c +273 -246
  56. data/ext/q_wildcard.c +127 -114
  57. data/ext/r_analysis.c +1720 -711
  58. data/ext/r_index.c +3049 -0
  59. data/ext/r_qparser.c +433 -146
  60. data/ext/r_search.c +2934 -1993
  61. data/ext/r_store.c +372 -143
  62. data/ext/r_utils.c +941 -0
  63. data/ext/ram_store.c +330 -326
  64. data/ext/search.c +1291 -668
  65. data/ext/search.h +403 -702
  66. data/ext/similarity.c +91 -113
  67. data/ext/similarity.h +45 -30
  68. data/ext/sort.c +721 -484
  69. data/ext/stopwords.c +361 -273
  70. data/ext/store.c +556 -58
  71. data/ext/store.h +706 -126
  72. data/ext/tags +3578 -2780
  73. data/ext/term_vectors.c +352 -0
  74. data/ext/threading.h +31 -0
  75. data/ext/win32.h +54 -0
  76. data/lib/ferret.rb +5 -17
  77. data/lib/ferret/document.rb +130 -2
  78. data/lib/ferret/index.rb +577 -26
  79. data/lib/ferret/number_tools.rb +157 -0
  80. data/lib/ferret_version.rb +3 -0
  81. data/test/test_helper.rb +5 -13
  82. data/test/unit/analysis/tc_analyzer.rb +513 -1
  83. data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
  84. data/test/unit/index/tc_index.rb +183 -240
  85. data/test/unit/index/tc_index_reader.rb +312 -479
  86. data/test/unit/index/tc_index_writer.rb +397 -13
  87. data/test/unit/index/th_doc.rb +269 -206
  88. data/test/unit/query_parser/tc_query_parser.rb +40 -33
  89. data/test/unit/search/tc_filter.rb +59 -71
  90. data/test/unit/search/tc_fuzzy_query.rb +24 -16
  91. data/test/unit/search/tc_index_searcher.rb +23 -201
  92. data/test/unit/search/tc_multi_searcher.rb +78 -226
  93. data/test/unit/search/tc_search_and_sort.rb +93 -81
  94. data/test/unit/search/tc_sort.rb +23 -23
  95. data/test/unit/search/tc_sort_field.rb +7 -7
  96. data/test/unit/search/tc_spans.rb +51 -47
  97. data/test/unit/search/tm_searcher.rb +339 -0
  98. data/test/unit/store/tc_fs_store.rb +1 -1
  99. data/test/unit/store/tm_store_lock.rb +3 -3
  100. data/test/unit/tc_document.rb +81 -0
  101. data/test/unit/ts_analysis.rb +1 -1
  102. data/test/unit/ts_utils.rb +1 -1
  103. data/test/unit/utils/tc_bit_vector.rb +288 -0
  104. data/test/unit/utils/tc_number_tools.rb +117 -0
  105. data/test/unit/utils/tc_priority_queue.rb +106 -0
  106. metadata +140 -301
  107. data/CHANGELOG +0 -9
  108. data/ext/dummy.exe +0 -0
  109. data/ext/field.c +0 -408
  110. data/ext/frtio.h +0 -13
  111. data/ext/inc/except.h +0 -90
  112. data/ext/index_io.c +0 -382
  113. data/ext/index_rw.c +0 -2658
  114. data/ext/lang.c +0 -41
  115. data/ext/nix_io.c +0 -134
  116. data/ext/q_multi_phrase.c +0 -380
  117. data/ext/r_doc.c +0 -582
  118. data/ext/r_index_io.c +0 -1021
  119. data/ext/r_term.c +0 -219
  120. data/ext/term.c +0 -820
  121. data/ext/termdocs.c +0 -611
  122. data/ext/vector.c +0 -637
  123. data/ext/w32_io.c +0 -150
  124. data/lib/ferret/analysis.rb +0 -11
  125. data/lib/ferret/analysis/analyzers.rb +0 -112
  126. data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
  127. data/lib/ferret/analysis/token.rb +0 -100
  128. data/lib/ferret/analysis/token_filters.rb +0 -86
  129. data/lib/ferret/analysis/token_stream.rb +0 -26
  130. data/lib/ferret/analysis/tokenizers.rb +0 -112
  131. data/lib/ferret/analysis/word_list_loader.rb +0 -27
  132. data/lib/ferret/document/document.rb +0 -152
  133. data/lib/ferret/document/field.rb +0 -312
  134. data/lib/ferret/index/compound_file_io.rb +0 -338
  135. data/lib/ferret/index/document_writer.rb +0 -289
  136. data/lib/ferret/index/field_infos.rb +0 -279
  137. data/lib/ferret/index/fields_io.rb +0 -181
  138. data/lib/ferret/index/index.rb +0 -675
  139. data/lib/ferret/index/index_file_names.rb +0 -33
  140. data/lib/ferret/index/index_reader.rb +0 -503
  141. data/lib/ferret/index/index_writer.rb +0 -534
  142. data/lib/ferret/index/multi_reader.rb +0 -377
  143. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
  144. data/lib/ferret/index/segment_infos.rb +0 -130
  145. data/lib/ferret/index/segment_merge_info.rb +0 -49
  146. data/lib/ferret/index/segment_merge_queue.rb +0 -16
  147. data/lib/ferret/index/segment_merger.rb +0 -358
  148. data/lib/ferret/index/segment_reader.rb +0 -412
  149. data/lib/ferret/index/segment_term_enum.rb +0 -169
  150. data/lib/ferret/index/segment_term_vector.rb +0 -58
  151. data/lib/ferret/index/term.rb +0 -53
  152. data/lib/ferret/index/term_buffer.rb +0 -83
  153. data/lib/ferret/index/term_doc_enum.rb +0 -291
  154. data/lib/ferret/index/term_enum.rb +0 -52
  155. data/lib/ferret/index/term_info.rb +0 -37
  156. data/lib/ferret/index/term_infos_io.rb +0 -321
  157. data/lib/ferret/index/term_vector_offset_info.rb +0 -20
  158. data/lib/ferret/index/term_vectors_io.rb +0 -553
  159. data/lib/ferret/query_parser.rb +0 -312
  160. data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
  161. data/lib/ferret/search.rb +0 -50
  162. data/lib/ferret/search/boolean_clause.rb +0 -100
  163. data/lib/ferret/search/boolean_query.rb +0 -299
  164. data/lib/ferret/search/boolean_scorer.rb +0 -294
  165. data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
  166. data/lib/ferret/search/conjunction_scorer.rb +0 -99
  167. data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
  168. data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
  169. data/lib/ferret/search/explanation.rb +0 -41
  170. data/lib/ferret/search/field_cache.rb +0 -215
  171. data/lib/ferret/search/field_doc.rb +0 -31
  172. data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
  173. data/lib/ferret/search/filter.rb +0 -11
  174. data/lib/ferret/search/filtered_query.rb +0 -130
  175. data/lib/ferret/search/filtered_term_enum.rb +0 -79
  176. data/lib/ferret/search/fuzzy_query.rb +0 -154
  177. data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
  178. data/lib/ferret/search/hit_collector.rb +0 -34
  179. data/lib/ferret/search/hit_queue.rb +0 -11
  180. data/lib/ferret/search/index_searcher.rb +0 -200
  181. data/lib/ferret/search/match_all_query.rb +0 -104
  182. data/lib/ferret/search/multi_phrase_query.rb +0 -216
  183. data/lib/ferret/search/multi_searcher.rb +0 -261
  184. data/lib/ferret/search/multi_term_query.rb +0 -65
  185. data/lib/ferret/search/non_matching_scorer.rb +0 -22
  186. data/lib/ferret/search/phrase_positions.rb +0 -55
  187. data/lib/ferret/search/phrase_query.rb +0 -214
  188. data/lib/ferret/search/phrase_scorer.rb +0 -152
  189. data/lib/ferret/search/prefix_query.rb +0 -54
  190. data/lib/ferret/search/query.rb +0 -140
  191. data/lib/ferret/search/query_filter.rb +0 -51
  192. data/lib/ferret/search/range_filter.rb +0 -103
  193. data/lib/ferret/search/range_query.rb +0 -139
  194. data/lib/ferret/search/req_excl_scorer.rb +0 -125
  195. data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
  196. data/lib/ferret/search/score_doc.rb +0 -38
  197. data/lib/ferret/search/score_doc_comparator.rb +0 -114
  198. data/lib/ferret/search/scorer.rb +0 -91
  199. data/lib/ferret/search/similarity.rb +0 -278
  200. data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
  201. data/lib/ferret/search/sort.rb +0 -112
  202. data/lib/ferret/search/sort_comparator.rb +0 -60
  203. data/lib/ferret/search/sort_field.rb +0 -91
  204. data/lib/ferret/search/spans.rb +0 -12
  205. data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
  206. data/lib/ferret/search/spans/span_first_query.rb +0 -79
  207. data/lib/ferret/search/spans/span_near_query.rb +0 -108
  208. data/lib/ferret/search/spans/span_not_query.rb +0 -130
  209. data/lib/ferret/search/spans/span_or_query.rb +0 -176
  210. data/lib/ferret/search/spans/span_query.rb +0 -25
  211. data/lib/ferret/search/spans/span_scorer.rb +0 -74
  212. data/lib/ferret/search/spans/span_term_query.rb +0 -105
  213. data/lib/ferret/search/spans/span_weight.rb +0 -84
  214. data/lib/ferret/search/spans/spans_enum.rb +0 -44
  215. data/lib/ferret/search/term_query.rb +0 -128
  216. data/lib/ferret/search/term_scorer.rb +0 -183
  217. data/lib/ferret/search/top_docs.rb +0 -36
  218. data/lib/ferret/search/top_field_docs.rb +0 -17
  219. data/lib/ferret/search/weight.rb +0 -54
  220. data/lib/ferret/search/wildcard_query.rb +0 -26
  221. data/lib/ferret/search/wildcard_term_enum.rb +0 -61
  222. data/lib/ferret/stemmers.rb +0 -1
  223. data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
  224. data/lib/ferret/store.rb +0 -5
  225. data/lib/ferret/store/buffered_index_io.rb +0 -190
  226. data/lib/ferret/store/directory.rb +0 -141
  227. data/lib/ferret/store/fs_store.rb +0 -381
  228. data/lib/ferret/store/index_io.rb +0 -245
  229. data/lib/ferret/store/ram_store.rb +0 -286
  230. data/lib/ferret/utils.rb +0 -8
  231. data/lib/ferret/utils/bit_vector.rb +0 -123
  232. data/lib/ferret/utils/date_tools.rb +0 -138
  233. data/lib/ferret/utils/number_tools.rb +0 -91
  234. data/lib/ferret/utils/parameter.rb +0 -41
  235. data/lib/ferret/utils/priority_queue.rb +0 -120
  236. data/lib/ferret/utils/string_helper.rb +0 -47
  237. data/lib/ferret/utils/thread_local.rb +0 -28
  238. data/lib/ferret/utils/weak_key_hash.rb +0 -60
  239. data/lib/rferret.rb +0 -37
  240. data/rake_utils/code_statistics.rb +0 -106
  241. data/test/benchmark/tb_ram_store.rb +0 -76
  242. data/test/benchmark/tb_rw_vint.rb +0 -26
  243. data/test/functional/thread_safety_index_test.rb +0 -81
  244. data/test/functional/thread_safety_test.rb +0 -137
  245. data/test/longrunning/tc_numbertools.rb +0 -60
  246. data/test/longrunning/tm_store.rb +0 -19
  247. data/test/unit/analysis/ctc_analyzer.rb +0 -532
  248. data/test/unit/analysis/data/wordfile +0 -6
  249. data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
  250. data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
  251. data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
  252. data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
  253. data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
  254. data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
  255. data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
  256. data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
  257. data/test/unit/analysis/rtc_stop_filter.rb +0 -14
  258. data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
  259. data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
  260. data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
  261. data/test/unit/analysis/tc_token.rb +0 -25
  262. data/test/unit/document/rtc_field.rb +0 -28
  263. data/test/unit/document/tc_document.rb +0 -47
  264. data/test/unit/document/tc_field.rb +0 -98
  265. data/test/unit/index/rtc_compound_file_io.rb +0 -107
  266. data/test/unit/index/rtc_field_infos.rb +0 -127
  267. data/test/unit/index/rtc_fields_io.rb +0 -167
  268. data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
  269. data/test/unit/index/rtc_segment_infos.rb +0 -74
  270. data/test/unit/index/rtc_segment_term_docs.rb +0 -17
  271. data/test/unit/index/rtc_segment_term_enum.rb +0 -60
  272. data/test/unit/index/rtc_segment_term_vector.rb +0 -71
  273. data/test/unit/index/rtc_term_buffer.rb +0 -57
  274. data/test/unit/index/rtc_term_info.rb +0 -19
  275. data/test/unit/index/rtc_term_infos_io.rb +0 -192
  276. data/test/unit/index/rtc_term_vectors_io.rb +0 -108
  277. data/test/unit/index/tc_term.rb +0 -27
  278. data/test/unit/index/tc_term_voi.rb +0 -18
  279. data/test/unit/search/rtc_similarity.rb +0 -37
  280. data/test/unit/search/rtc_sort_field.rb +0 -14
  281. data/test/unit/search/tc_multi_searcher2.rb +0 -126
  282. data/test/unit/store/rtc_fs_store.rb +0 -62
  283. data/test/unit/store/rtc_ram_store.rb +0 -15
  284. data/test/unit/store/rtm_store.rb +0 -150
  285. data/test/unit/store/rtm_store_lock.rb +0 -2
  286. data/test/unit/ts_document.rb +0 -2
  287. data/test/unit/utils/rtc_bit_vector.rb +0 -73
  288. data/test/unit/utils/rtc_date_tools.rb +0 -50
  289. data/test/unit/utils/rtc_number_tools.rb +0 -59
  290. data/test/unit/utils/rtc_parameter.rb +0 -40
  291. data/test/unit/utils/rtc_priority_queue.rb +0 -62
  292. data/test/unit/utils/rtc_string_helper.rb +0 -21
  293. data/test/unit/utils/rtc_thread.rb +0 -61
  294. data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
  295. data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/store.rb DELETED
@@ -1,5 +0,0 @@
1
- require 'ferret/store/directory'
2
- require 'ferret/store/index_io'
3
- require 'ferret/store/buffered_index_io'
4
- require 'ferret/store/fs_store'
5
- require 'ferret/store/ram_store'
@@ -1,190 +0,0 @@
1
- module Ferret::Store
2
- BUFFER_SIZE = 1024
3
- BUFFER = " " * BUFFER_SIZE
4
-
5
- # Base implementation class for a buffered IndexOutput.
6
- class BufferedIndexOutput < IndexOutput
7
-
8
- def initialize
9
- @buffer = BUFFER.clone
10
- @buffer_start = 0 # position in file of buffer
11
- @buffer_position = 0 # position in buffer
12
- end
13
-
14
- # Writes a single byte.
15
- def write_byte(b)
16
-
17
- # The following code offers a 5% speed improvement over the line
18
- # below. It relies on the fact that ruby will throw an error if we try
19
- # and modify a character that is out of range for the string.
20
- #begin
21
- # @buffer[@buffer_position] = b
22
- # @buffer_position += 1
23
- #rescue IndexError
24
- # flush
25
- # @buffer[@buffer_position] = b
26
- # @buffer_position += 1
27
- #end
28
-
29
- flush if @buffer_position >= BUFFER_SIZE
30
- @buffer[@buffer_position] = b
31
- @buffer_position += 1
32
- end
33
-
34
- # Writes an array of bytes.
35
- # buf:: the bytes to write
36
- # length:: the number of bytes to write
37
- def write_bytes(buf, length)
38
- length.times do |i|
39
- write_byte(buf[i])
40
- end
41
- end
42
-
43
- # Forces any buffered output to be written.
44
- def flush()
45
- flush_buffer(@buffer, @buffer_position)
46
- @buffer_start += @buffer_position
47
- @buffer_position = 0
48
- end
49
-
50
- # Closes this stream to further operations.
51
- def close()
52
- flush()
53
- end
54
-
55
- # Get the current position in the file, where the next write will occur.
56
- def pos()
57
- return @buffer_start + @buffer_position
58
- end
59
-
60
- # Set the current position in the file, where the next write will occur.
61
- def seek(pos)
62
- flush()
63
- @buffer_start = pos
64
- end
65
-
66
- # The number of bytes in the file.
67
- def length
68
- raise NotImplementedError
69
- end
70
-
71
- private
72
-
73
- # Expert: implements buffer write. Writes the first len bytes from the
74
- # buffer to the output.
75
- #
76
- # buf:: the bytes to write
77
- # len:: the number of bytes to write
78
- def flush_buffer(buf, len)
79
- raise NotImplementedError
80
- end
81
- end
82
-
83
- # Base implementation class for buffered IndexInput
84
- class BufferedIndexInput < IndexInput
85
- def initialize
86
- @buffer = nil
87
- @buffer_start = 0
88
- @buffer_length = 0
89
- @buffer_position = 0
90
- end
91
-
92
- # Read and return a single byte from the file
93
- def read_byte
94
- refill if (@buffer_position >= @buffer_length)
95
- byte = @buffer[@buffer_position]
96
- @buffer_position += 1
97
- return byte
98
- end
99
-
100
- # Read +len+ bytes into +buffer+ starting at position +offset+ in +buffer+
101
- #
102
- # buffer:: The string buffer to read the characters into.
103
- # offset:: The position in +buffer+ to start writing to.
104
- # len:: the number of characters to read
105
- # returns:: the buffer
106
- def read_bytes(buffer, offset, len)
107
- if (len < BUFFER_SIZE)
108
- offset.upto(offset+len-1) do |i| # read byte-by-byte
109
- buffer[i] = read_byte
110
- end
111
- else # read all-at-once
112
- start = pos()
113
- seek_internal(start)
114
- read_internal(buffer, offset, len)
115
-
116
- @buffer_start = start + len # adjust stream variables
117
- @buffer_position = 0
118
- @buffer_length = 0 # trigger refill on read
119
- end
120
- return buffer
121
- end
122
-
123
- # Get the current position in the file, where the next read will occur.
124
- def pos()
125
- return @buffer_start + @buffer_position
126
- end
127
-
128
- # Set the current position in the file, where the next read will occur.
129
- def seek(pos)
130
- if (pos >= @buffer_start and pos < (@buffer_start + @buffer_length))
131
- @buffer_position = pos - @buffer_start # seek within buffer
132
- else
133
- @buffer_start = pos
134
- @buffer_position = 0
135
- @buffer_length = 0 # trigger refill() on read()
136
- seek_internal(pos)
137
- end
138
- end
139
-
140
- # Creates a clone of the BufferedIndexReader. Reading from a
141
- # BufferedIndexInput should not change the state (read position) in the
142
- # clone and vice-versa.
143
- def initialize_copy(o)
144
- super
145
- @buffer = o.buffer.clone if o.buffer
146
- end
147
-
148
- attr_reader :buffer
149
- protected :buffer
150
-
151
- private
152
-
153
- # Expert: implements buffer refill. Reads bytes from the current position
154
- # in the input.
155
- # buf:: the array to read bytes into
156
- # offset:: the offset in the array to start storing bytes
157
- # len:: the number of bytes to read
158
- def read_internal(buf, offset, len)
159
- raise NotImplementedError
160
- end
161
-
162
- # Expert: implements seek. Sets current position in this file, where the
163
- # next read_internal will occur.
164
- # pos:: the position to set to
165
- def seek_internal(pos)
166
- raise NotImplementedError
167
- end
168
-
169
- # Refill the buffer from the file.
170
- def refill
171
- start = @buffer_start + @buffer_position
172
- last = start + BUFFER_SIZE
173
- if (last > length()) # don't read past EOF
174
- last = length()
175
- end
176
- @buffer_length = last - start
177
- if (@buffer_length <= 0)
178
- raise EOFError
179
- end
180
-
181
- if (@buffer == nil)
182
- @buffer = BUFFER.clone # allocate buffer lazily
183
- end
184
- read_internal(@buffer, 0, @buffer_length)
185
-
186
- @buffer_start = start
187
- @buffer_position = 0
188
- end
189
- end
190
- end
@@ -1,141 +0,0 @@
1
- module Ferret::Store
2
- # A Directory is an object which is used to access the index storage.
3
- # Ruby's IO API is not used so that we can use different storage
4
- # mechanisms to store the index. Some examples are;
5
- #
6
- # * File system based storage
7
- # * RAM based storage
8
- # * Database based storage
9
- #
10
- # NOTE: Once a file has been written and closed, it can no longer be
11
- # modified. To make any changes to the file it must be deleted and
12
- # rewritten. For this reason, the method to open a file for writing is
13
- # called _create_output_, while the method to open a file for reading is
14
- # called _open_input_ If there is a risk of simultaneous modifications of
15
- # the files then locks should be used. See Lock to find out how.
16
- class Directory
17
- LOCK_PREFIX = "ferret-"
18
-
19
- # returns an array of strings, one for each file in the directory
20
- def each # :yeilds: file_name
21
- raise NotImplementedError
22
- end
23
-
24
- # returns the number of files in the directory
25
- def file_count()
26
- i = 0
27
- each {|f| i += 1}
28
- return i
29
- end
30
-
31
- # Returns true if a file with the given name exists.
32
- def exists?(file)
33
- raise NotImplementedError
34
- end
35
-
36
- # Returns the time the named file was last modified.
37
- def modified(file)
38
- raise NotImplementedError
39
- end
40
-
41
- # Set the modified time of an existing file to now.
42
- def touch(file)
43
- raise NotImplementedError
44
- end
45
-
46
- # Removes an existing file in the directory.
47
- def delete(file)
48
- raise NotImplementedError
49
- end
50
-
51
- # Renames an existing file in the directory.
52
- # If a file already exists with the new name, then it is replaced.
53
- # This replacement should be atomic.
54
- def rename(from, to)
55
- raise NotImplementedError
56
- end
57
-
58
- # Returns the length of a file in the directory.
59
- def length(file)
60
- raise NotImplementedError
61
- end
62
-
63
- # Creates a new, empty file in the directory with the given name.
64
- # Returns a stream writing this file.
65
- def create_output(file_name)
66
- raise NotImplementedError
67
- end
68
-
69
- # Returns a stream reading an existing file.
70
- def open_input(file_name)
71
- raise NotImplementedError
72
- end
73
-
74
- # Construct a Lock.
75
- def make_lock(lock_name)
76
- raise NotImplementedError
77
- end
78
-
79
- # Closes the store.
80
- def close
81
- raise NotImplementedError
82
- end
83
-
84
- end
85
-
86
- # A Lock is used to lock a data source so that not more than one
87
- # output stream can access a data source at one time. It is possible
88
- # that locks could be disabled. For example a read only index stored
89
- # on a CDROM would have no need for a lock.
90
- #
91
- # You can use a lock in two ways. Firstly:
92
- #
93
- # write_lock = @directory.make_lock(LOCK_NAME)
94
- # write_lock.obtain(WRITE_LOCK_TIME_OUT)
95
- # ... # Do your file modifications # ...
96
- # write_lock.release()
97
- #
98
- # Alternatively you could use the while locked method. This ensures that
99
- # the lock will be released once processing has finished.
100
- #
101
- # write_lock = @directory.make_lock(LOCK_NAME)
102
- # write_lock.while_locked(WRITE_LOCK_TIME_OUT) do
103
- # ... # Do your file modifications # ...
104
- # end
105
- class Lock
106
- # Attempts made to obtain the lock before the application gives up. If
107
- # you want the process to wait longer to get the lock then just increase
108
- # the lock timeout
109
- MAX_ATTEMPTS = 5
110
-
111
- # Obtain the lock on the data source. If you expect to have to wait for
112
- # a while on a lock then you should set the lock_timeout to a large
113
- # number. This may be necessary if you are doing multiple large batch
114
- # updates on an index but the default 1 second should be fine in most
115
- # cases.
116
- def obtain(lock_timeout = 1)
117
- raise NotImplementedError
118
- end
119
-
120
- # Release the lock on the data source
121
- def release
122
- raise NotImplementedError
123
- end
124
-
125
- # Returns true if there is a lock on the data source
126
- def locked?
127
- raise NotImplementedError
128
- end
129
-
130
- # Obtains the lock, processes the block and ensures that the lock is
131
- # released when the block terminates. The lock timeout is in seconds.
132
- def while_locked(lock_timeout=1)
133
- obtain(lock_timeout)
134
- begin
135
- yield
136
- ensure
137
- release()
138
- end
139
- end
140
- end
141
- end
@@ -1,381 +0,0 @@
1
- module Ferret::Store
2
-
3
- require 'monitor'
4
- require 'fileutils'
5
- require 'digest/md5'
6
-
7
- # This is a filesystem implementation of Directory and will be the one
8
- # usually used for storing the index. This implementation stores each
9
- # separate file as a separate file on the operating system. This works fine
10
- # and is the most efficient solution for small to medium size indexes. For
11
- # very large indexes, there may be a problem with the operating system not
12
- # wanting to open to many files. One fix for this is to change the maximum
13
- # open files setting in your operating system. Alternatively you could use
14
- # a compound file instead.
15
- #
16
- # TODO:
17
- # * need a better way of setting properties. Currently you have to
18
- # change the constants to disable locking.
19
- class FSDirectory < Directory
20
- include MonitorMixin
21
-
22
- # This cache of directories ensures that there is a unique Directory
23
- # instance per path, so that synchronization on the Directory can be used to
24
- # synchronize access between readers and writers.
25
- @@Directories = Hash.new.extend(MonitorMixin)
26
-
27
-
28
- # Locks should be disabled it there is no need for them
29
- LOCKS_DISABLED = false
30
-
31
- # The lock dir is the directory where the file locks will be stored
32
- LOCK_DIR = nil
33
-
34
- # Create a new directory from the path.
35
- # path:: the path to the directory.
36
- # create:: if true, create, or erase any existing contents.
37
- def initialize(path, create)
38
- super()
39
- if create then FileUtils.mkdir_p(path) end
40
- if not File.directory?(path) then
41
- raise IOError, "There is no directory: #{path}. Use create = true to create one"
42
- end
43
- @dir = Dir.new(path)
44
- # put the lock_dir here as well if no default exists.
45
- if LOCK_DIR then
46
- @lock_dir = Dir.new(LOCK_DIR)
47
- else
48
- @lock_dir = Dir.new(path)
49
- end
50
- @ref_count = 0
51
- end
52
-
53
- class <<FSDirectory
54
- alias :allocate :new
55
- protected :allocate
56
- end
57
-
58
- # Returns the directory instance for the named location.
59
- #
60
- # Directories are cached, so that, for a given canonical path, the same
61
- # FSDirectory instance will always be returned. This permits
62
- # synchronization on directories.
63
- #
64
- # path:: the path to the directory.
65
- # create:: if true, create, or erase any existing contents.
66
- def FSDirectory.new(path, create = false)
67
- dir = nil
68
- @@Directories.synchronize do
69
- dir = @@Directories[path]
70
- if not dir then
71
- dir = FSDirectory.allocate(path, create)
72
- @@Directories[path] = dir
73
- end
74
- dir.refresh if create
75
- end
76
- dir.synchronize do
77
- dir.reference()
78
- end
79
- return dir
80
- end
81
-
82
- # Returns true if locks have been disabled
83
- def FSDirectory.locks_disabled?
84
- LOCKS_DISABLED
85
- end
86
-
87
- # Set the directory where all of the locks will be stored.
88
- # path:: the path to the directory where the locks will be stored.
89
- # An exception will be raised if the directory does not exist
90
- def lock_dir=(path)
91
- # close the old lock dir if it exists
92
- @lock_dir.close() if @lock_dir
93
- @lock_dir = Dir.new(path)
94
- end
95
-
96
- # Returns a Dir object of the directory where the lock is stored
97
- attr_reader :lock_dir
98
-
99
- # Remove all files and locks from this directory so we have a clean instance
100
- def refresh
101
- synchronize do
102
- # delete all the files
103
- refresh_dir
104
- each do |fname|
105
- FileUtils.rm_rf(dir_path(fname))
106
- end
107
- # clear all the locks
108
- refresh_lock_dir
109
- @lock_dir.each do |lock_fname|
110
- next if lock_fname == '.' or lock_fname == '..'
111
- FileUtils.rm_rf(@lock_dir.path + '/' + lock_fname)
112
- end
113
- end
114
- end
115
-
116
- #--
117
- # Directory implementation
118
- #++
119
-
120
- # Iterates through the file listing, skipping lock files if they exist
121
- def each()
122
- refresh_dir
123
- @dir.each do |file_name|
124
- # return all files except for the current and parent directories
125
- # and any lock files that exist in this directory
126
- next if ['.', '..'].include?(file_name)
127
- next if file_name =~ Regexp.new('^' + lock_prefix)
128
- yield file_name
129
- end
130
- end
131
-
132
- # Returns true if a file with the given name exists.
133
- def exists?(name)
134
- File.exists?(dir_path(name))
135
- end
136
-
137
- # Returns the time the named file was last modified.
138
- def modified(name)
139
- File.mtime(dir_path(name))
140
- end
141
-
142
- # Set the modified time of an existing file to now.
143
- def touch(name)
144
- # just open the file and close it. No need to do anything with it.
145
- FileUtils.touch(dir_path(name))
146
- end
147
-
148
- # Removes an existing file in the directory.
149
- def delete(name)
150
- begin
151
- File.delete(dir_path(name))
152
- rescue SystemCallError => e
153
- raise IOError, e.to_s
154
- end
155
- end
156
-
157
- # Renames an existing file in the directory.
158
- # If a file already exists with the new name, then it is replaced.
159
- # This replacement should be atomic.
160
- def rename(from, to)
161
- synchronize do
162
- begin
163
- File.rename(dir_path(from), dir_path(to))
164
- rescue
165
- # try again, this time forcing the delete
166
- FileUtils.rm_rf(dir_path(to))
167
- begin
168
- FileUtils.mv(dir_path(from), dir_path(to))
169
- rescue
170
- FileUtils.cp(dir_path(from), dir_path(to))
171
- FileUtils.rm_rf(dir_path(to))
172
- end
173
- end
174
- end
175
- end
176
-
177
-
178
- # Returns the length of a file in the directory.
179
- def length(name)
180
- File.size(dir_path(name))
181
- end
182
-
183
- # Creates a new, empty file in the directory with the given name.
184
- # Returns a stream writing this file.
185
- def create_output(name)
186
- FSIndexOutput.new(dir_path(name))
187
- end
188
-
189
- # Returns a stream reading an existing file.
190
- def open_input(name)
191
- FSIndexInput.new(dir_path(name))
192
- end
193
-
194
- # Construct a Lock.
195
- def make_lock(name)
196
- FSLock.new(@lock_dir.path + "/" + lock_prefix() + name + ".lck")
197
- end
198
-
199
- # Closes the store.
200
- def close()
201
- synchronize do
202
- @ref_count -= 1
203
- if (@ref_count <= 0) then
204
- @@Directories.synchronize do
205
- @@Directories.delete(@dir.path)
206
- close_internal
207
- end
208
- end
209
- end
210
- end
211
-
212
- def reference()
213
- @ref_count += 1
214
- end
215
-
216
- # See Lock for hints as to how to use locks.
217
- class FSLock < Lock
218
- # pass the name of the file that we are going to lock
219
- def initialize(lock_file)
220
- @lock_file = lock_file
221
- #@clean = FSLock.make_finalizer(lock_file)
222
- @clean = lambda { FileUtils.rm_rf(lock_file)}
223
- end
224
-
225
- def FSLock.make_finalizer(lock_file)
226
- lambda { FileUtils.rm_rf(lock_file)}
227
- end
228
-
229
- # obtain the lock on the data source
230
- def obtain(lock_timeout = 1)
231
- return true if FSDirectory.locks_disabled?
232
- MAX_ATTEMPTS.times do
233
- begin
234
- # create a file if none exists. If one already exists
235
- # then someone beat us to the lock so return false
236
- File.open(@lock_file, File::WRONLY|File::EXCL|File::CREAT) {|f|}
237
- ObjectSpace.define_finalizer(self, @clean)
238
- return true
239
- rescue SystemCallError
240
- # lock was not obtained so sleep for timeout then try again.
241
- sleep(lock_timeout)
242
- end
243
- end
244
- # lock could not be obtained so raise an exception
245
- raise "could not obtain lock: #{@lock_file}"
246
- end
247
-
248
- # Release the lock on the data source. Returns true if successful.
249
- def release
250
- return if FSDirectory.locks_disabled?
251
- begin
252
- FileUtils.rm_rf(@lock_file)
253
- ObjectSpace.undefine_finalizer(self)
254
- rescue SystemCallError
255
- # maybe we tried to release a lock that wasn't locked. This
256
- # isn't critical so just return false
257
- return false
258
- end
259
- return true
260
- end
261
-
262
- # returns true if there is a lock on the data source
263
- def locked?
264
- return false if FSDirectory.locks_disabled?
265
- File.exists?(@lock_file)
266
- end
267
- end
268
-
269
- # A file system output stream extending OutputStream to read from the file
270
- # system
271
- class FSIndexOutput < BufferedIndexOutput
272
- def initialize(path)
273
- super()
274
- @file = File.open(path, "wb")
275
- end
276
-
277
- def close
278
- super()
279
- @file.close
280
- end
281
-
282
- def seek(pos)
283
- super(pos)
284
- @file.seek(pos)
285
- end
286
-
287
- private
288
- def flush_buffer(b, size)
289
- @file.syswrite(b[0...size])
290
- end
291
- end
292
-
293
- # A file system input stream extending InputStream to read from the file system
294
- class FSIndexInput < BufferedIndexInput
295
- attr_accessor :is_clone
296
- attr_reader :length, :file
297
-
298
- def initialize(path)
299
- begin
300
- @file = File.open(path, "rb")
301
- rescue Errno::ENOENT => e
302
- raise StandardError.new(e.message)
303
- end
304
- @file.extend(MonitorMixin)
305
- #class <<@file
306
- # attr_accessor :ref_count
307
- #end
308
- #@file.ref_count = 1
309
- @length = File.size(path)
310
- @is_clone = false
311
- super()
312
- end
313
-
314
- def close
315
- #@file.ref_count -= 1
316
- #@file.close if @file.ref_count == 0
317
- @file.close if not @is_clone
318
- end
319
-
320
- # We need to record if this is a clone so we know when to close the file.
321
- # The file should only be closed when the original FSIndexInput is closed.
322
- def initialize_copy(o)
323
- super
324
- @is_clone = true
325
- end
326
-
327
- private
328
-
329
- def read_internal(b, offset, length)
330
- #@file.synchronize do
331
- position = pos()
332
- if position != @file.pos
333
- @file.seek(position)
334
- end
335
- bytes = @file.read(length)
336
- if bytes.nil?
337
- raise EOFError, "Read past EOF in #{@file.path}"
338
- end
339
- b[offset, bytes.length] = bytes
340
- #end
341
- end
342
-
343
- def seek_internal(pos)
344
- @file.seek(pos)
345
- end
346
-
347
- end
348
-
349
- private
350
-
351
- # Add the directory path to the file name for opening
352
- def dir_path(name)
353
- File.join(@dir.path, name)
354
- end
355
-
356
- # returns the lock prefix for this directory
357
- def lock_prefix
358
- LOCK_PREFIX
359
- end
360
-
361
- # Unfortunately, on Windows, Dir does not refresh when rewind is called
362
- # so any new files will be hidden. So we open the directory again.
363
- def refresh_dir()
364
- tmp = Dir.new(@dir.path)
365
- @dir.close
366
- @dir = tmp
367
- end
368
-
369
- def refresh_lock_dir()
370
- tmp = Dir.new(@lock_dir.path)
371
- @lock_dir.close
372
- @lock_dir = tmp
373
- end
374
-
375
- # This method is only used by the c extension to free the directory
376
- def close_internal
377
- end
378
-
379
- #end private
380
- end
381
- end