ferret 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. data/MIT-LICENSE +20 -0
  2. data/README +109 -0
  3. data/Rakefile +275 -0
  4. data/TODO +9 -0
  5. data/TUTORIAL +197 -0
  6. data/ext/extconf.rb +3 -0
  7. data/ext/ferret.c +23 -0
  8. data/ext/ferret.h +85 -0
  9. data/ext/index_io.c +543 -0
  10. data/ext/priority_queue.c +227 -0
  11. data/ext/ram_directory.c +316 -0
  12. data/ext/segment_merge_queue.c +41 -0
  13. data/ext/string_helper.c +42 -0
  14. data/ext/tags +240 -0
  15. data/ext/term.c +261 -0
  16. data/ext/term_buffer.c +299 -0
  17. data/ext/util.c +12 -0
  18. data/lib/ferret.rb +41 -0
  19. data/lib/ferret/analysis.rb +11 -0
  20. data/lib/ferret/analysis/analyzers.rb +93 -0
  21. data/lib/ferret/analysis/standard_tokenizer.rb +65 -0
  22. data/lib/ferret/analysis/token.rb +79 -0
  23. data/lib/ferret/analysis/token_filters.rb +86 -0
  24. data/lib/ferret/analysis/token_stream.rb +26 -0
  25. data/lib/ferret/analysis/tokenizers.rb +107 -0
  26. data/lib/ferret/analysis/word_list_loader.rb +27 -0
  27. data/lib/ferret/document.rb +2 -0
  28. data/lib/ferret/document/document.rb +152 -0
  29. data/lib/ferret/document/field.rb +304 -0
  30. data/lib/ferret/index.rb +26 -0
  31. data/lib/ferret/index/compound_file_io.rb +343 -0
  32. data/lib/ferret/index/document_writer.rb +288 -0
  33. data/lib/ferret/index/field_infos.rb +259 -0
  34. data/lib/ferret/index/fields_io.rb +175 -0
  35. data/lib/ferret/index/index.rb +228 -0
  36. data/lib/ferret/index/index_file_names.rb +33 -0
  37. data/lib/ferret/index/index_reader.rb +462 -0
  38. data/lib/ferret/index/index_writer.rb +488 -0
  39. data/lib/ferret/index/multi_reader.rb +363 -0
  40. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +105 -0
  41. data/lib/ferret/index/segment_infos.rb +130 -0
  42. data/lib/ferret/index/segment_merge_info.rb +47 -0
  43. data/lib/ferret/index/segment_merge_queue.rb +16 -0
  44. data/lib/ferret/index/segment_merger.rb +337 -0
  45. data/lib/ferret/index/segment_reader.rb +380 -0
  46. data/lib/ferret/index/segment_term_enum.rb +178 -0
  47. data/lib/ferret/index/segment_term_vector.rb +58 -0
  48. data/lib/ferret/index/term.rb +49 -0
  49. data/lib/ferret/index/term_buffer.rb +88 -0
  50. data/lib/ferret/index/term_doc_enum.rb +283 -0
  51. data/lib/ferret/index/term_enum.rb +52 -0
  52. data/lib/ferret/index/term_info.rb +41 -0
  53. data/lib/ferret/index/term_infos_io.rb +312 -0
  54. data/lib/ferret/index/term_vector_offset_info.rb +20 -0
  55. data/lib/ferret/index/term_vectors_io.rb +552 -0
  56. data/lib/ferret/query_parser.rb +274 -0
  57. data/lib/ferret/query_parser/query_parser.tab.rb +819 -0
  58. data/lib/ferret/search.rb +49 -0
  59. data/lib/ferret/search/boolean_clause.rb +100 -0
  60. data/lib/ferret/search/boolean_query.rb +303 -0
  61. data/lib/ferret/search/boolean_scorer.rb +294 -0
  62. data/lib/ferret/search/caching_wrapper_filter.rb +40 -0
  63. data/lib/ferret/search/conjunction_scorer.rb +99 -0
  64. data/lib/ferret/search/disjunction_sum_scorer.rb +203 -0
  65. data/lib/ferret/search/exact_phrase_scorer.rb +32 -0
  66. data/lib/ferret/search/explanation.rb +41 -0
  67. data/lib/ferret/search/field_cache.rb +216 -0
  68. data/lib/ferret/search/field_doc.rb +31 -0
  69. data/lib/ferret/search/field_sorted_hit_queue.rb +184 -0
  70. data/lib/ferret/search/filter.rb +11 -0
  71. data/lib/ferret/search/filtered_query.rb +130 -0
  72. data/lib/ferret/search/filtered_term_enum.rb +79 -0
  73. data/lib/ferret/search/fuzzy_query.rb +153 -0
  74. data/lib/ferret/search/fuzzy_term_enum.rb +244 -0
  75. data/lib/ferret/search/hit_collector.rb +34 -0
  76. data/lib/ferret/search/hit_queue.rb +11 -0
  77. data/lib/ferret/search/index_searcher.rb +173 -0
  78. data/lib/ferret/search/match_all_docs_query.rb +104 -0
  79. data/lib/ferret/search/multi_phrase_query.rb +204 -0
  80. data/lib/ferret/search/multi_term_query.rb +65 -0
  81. data/lib/ferret/search/non_matching_scorer.rb +22 -0
  82. data/lib/ferret/search/phrase_positions.rb +55 -0
  83. data/lib/ferret/search/phrase_query.rb +217 -0
  84. data/lib/ferret/search/phrase_scorer.rb +153 -0
  85. data/lib/ferret/search/prefix_query.rb +47 -0
  86. data/lib/ferret/search/query.rb +111 -0
  87. data/lib/ferret/search/query_filter.rb +51 -0
  88. data/lib/ferret/search/range_filter.rb +103 -0
  89. data/lib/ferret/search/range_query.rb +139 -0
  90. data/lib/ferret/search/req_excl_scorer.rb +125 -0
  91. data/lib/ferret/search/req_opt_sum_scorer.rb +70 -0
  92. data/lib/ferret/search/score_doc.rb +38 -0
  93. data/lib/ferret/search/score_doc_comparator.rb +114 -0
  94. data/lib/ferret/search/scorer.rb +91 -0
  95. data/lib/ferret/search/similarity.rb +278 -0
  96. data/lib/ferret/search/sloppy_phrase_scorer.rb +47 -0
  97. data/lib/ferret/search/sort.rb +105 -0
  98. data/lib/ferret/search/sort_comparator.rb +60 -0
  99. data/lib/ferret/search/sort_field.rb +87 -0
  100. data/lib/ferret/search/spans.rb +12 -0
  101. data/lib/ferret/search/spans/near_spans_enum.rb +304 -0
  102. data/lib/ferret/search/spans/span_first_query.rb +79 -0
  103. data/lib/ferret/search/spans/span_near_query.rb +108 -0
  104. data/lib/ferret/search/spans/span_not_query.rb +130 -0
  105. data/lib/ferret/search/spans/span_or_query.rb +176 -0
  106. data/lib/ferret/search/spans/span_query.rb +25 -0
  107. data/lib/ferret/search/spans/span_scorer.rb +74 -0
  108. data/lib/ferret/search/spans/span_term_query.rb +105 -0
  109. data/lib/ferret/search/spans/span_weight.rb +84 -0
  110. data/lib/ferret/search/spans/spans_enum.rb +44 -0
  111. data/lib/ferret/search/term_query.rb +128 -0
  112. data/lib/ferret/search/term_scorer.rb +181 -0
  113. data/lib/ferret/search/top_docs.rb +24 -0
  114. data/lib/ferret/search/top_field_docs.rb +17 -0
  115. data/lib/ferret/search/weight.rb +54 -0
  116. data/lib/ferret/search/wildcard_query.rb +26 -0
  117. data/lib/ferret/search/wildcard_term_enum.rb +61 -0
  118. data/lib/ferret/stemmers.rb +1 -0
  119. data/lib/ferret/stemmers/porter_stemmer.rb +218 -0
  120. data/lib/ferret/store.rb +5 -0
  121. data/lib/ferret/store/buffered_index_io.rb +191 -0
  122. data/lib/ferret/store/directory.rb +139 -0
  123. data/lib/ferret/store/fs_store.rb +338 -0
  124. data/lib/ferret/store/index_io.rb +259 -0
  125. data/lib/ferret/store/ram_store.rb +282 -0
  126. data/lib/ferret/utils.rb +7 -0
  127. data/lib/ferret/utils/bit_vector.rb +105 -0
  128. data/lib/ferret/utils/date_tools.rb +138 -0
  129. data/lib/ferret/utils/number_tools.rb +91 -0
  130. data/lib/ferret/utils/parameter.rb +41 -0
  131. data/lib/ferret/utils/priority_queue.rb +120 -0
  132. data/lib/ferret/utils/string_helper.rb +47 -0
  133. data/lib/ferret/utils/weak_key_hash.rb +51 -0
  134. data/rake_utils/code_statistics.rb +106 -0
  135. data/setup.rb +1551 -0
  136. data/test/benchmark/tb_ram_store.rb +76 -0
  137. data/test/benchmark/tb_rw_vint.rb +26 -0
  138. data/test/longrunning/tc_numbertools.rb +60 -0
  139. data/test/longrunning/tm_store.rb +19 -0
  140. data/test/test_all.rb +9 -0
  141. data/test/test_helper.rb +6 -0
  142. data/test/unit/analysis/tc_analyzer.rb +21 -0
  143. data/test/unit/analysis/tc_letter_tokenizer.rb +20 -0
  144. data/test/unit/analysis/tc_lower_case_filter.rb +20 -0
  145. data/test/unit/analysis/tc_lower_case_tokenizer.rb +27 -0
  146. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +39 -0
  147. data/test/unit/analysis/tc_porter_stem_filter.rb +16 -0
  148. data/test/unit/analysis/tc_standard_analyzer.rb +20 -0
  149. data/test/unit/analysis/tc_standard_tokenizer.rb +20 -0
  150. data/test/unit/analysis/tc_stop_analyzer.rb +20 -0
  151. data/test/unit/analysis/tc_stop_filter.rb +14 -0
  152. data/test/unit/analysis/tc_white_space_analyzer.rb +21 -0
  153. data/test/unit/analysis/tc_white_space_tokenizer.rb +20 -0
  154. data/test/unit/analysis/tc_word_list_loader.rb +32 -0
  155. data/test/unit/document/tc_document.rb +47 -0
  156. data/test/unit/document/tc_field.rb +80 -0
  157. data/test/unit/index/tc_compound_file_io.rb +107 -0
  158. data/test/unit/index/tc_field_infos.rb +119 -0
  159. data/test/unit/index/tc_fields_io.rb +167 -0
  160. data/test/unit/index/tc_index.rb +140 -0
  161. data/test/unit/index/tc_index_reader.rb +622 -0
  162. data/test/unit/index/tc_index_writer.rb +57 -0
  163. data/test/unit/index/tc_multiple_term_doc_pos_enum.rb +80 -0
  164. data/test/unit/index/tc_segment_infos.rb +74 -0
  165. data/test/unit/index/tc_segment_term_docs.rb +17 -0
  166. data/test/unit/index/tc_segment_term_enum.rb +60 -0
  167. data/test/unit/index/tc_segment_term_vector.rb +71 -0
  168. data/test/unit/index/tc_term.rb +22 -0
  169. data/test/unit/index/tc_term_buffer.rb +57 -0
  170. data/test/unit/index/tc_term_info.rb +19 -0
  171. data/test/unit/index/tc_term_infos_io.rb +192 -0
  172. data/test/unit/index/tc_term_vector_offset_info.rb +18 -0
  173. data/test/unit/index/tc_term_vectors_io.rb +108 -0
  174. data/test/unit/index/th_doc.rb +244 -0
  175. data/test/unit/query_parser/tc_query_parser.rb +84 -0
  176. data/test/unit/search/tc_filter.rb +113 -0
  177. data/test/unit/search/tc_fuzzy_query.rb +136 -0
  178. data/test/unit/search/tc_index_searcher.rb +188 -0
  179. data/test/unit/search/tc_search_and_sort.rb +98 -0
  180. data/test/unit/search/tc_similarity.rb +37 -0
  181. data/test/unit/search/tc_sort.rb +48 -0
  182. data/test/unit/search/tc_sort_field.rb +27 -0
  183. data/test/unit/search/tc_spans.rb +153 -0
  184. data/test/unit/store/tc_fs_store.rb +84 -0
  185. data/test/unit/store/tc_ram_store.rb +35 -0
  186. data/test/unit/store/tm_store.rb +180 -0
  187. data/test/unit/store/tm_store_lock.rb +68 -0
  188. data/test/unit/ts_analysis.rb +16 -0
  189. data/test/unit/ts_document.rb +4 -0
  190. data/test/unit/ts_index.rb +18 -0
  191. data/test/unit/ts_query_parser.rb +3 -0
  192. data/test/unit/ts_search.rb +10 -0
  193. data/test/unit/ts_store.rb +6 -0
  194. data/test/unit/ts_utils.rb +10 -0
  195. data/test/unit/utils/tc_bit_vector.rb +65 -0
  196. data/test/unit/utils/tc_date_tools.rb +50 -0
  197. data/test/unit/utils/tc_number_tools.rb +59 -0
  198. data/test/unit/utils/tc_parameter.rb +40 -0
  199. data/test/unit/utils/tc_priority_queue.rb +62 -0
  200. data/test/unit/utils/tc_string_helper.rb +21 -0
  201. data/test/unit/utils/tc_weak_key_hash.rb +25 -0
  202. metadata +251 -0
@@ -0,0 +1,5 @@
1
+ require 'ferret/store/directory'
2
+ require 'ferret/store/index_io'
3
+ require 'ferret/store/buffered_index_io'
4
+ require 'ferret/store/fs_store'
5
+ require 'ferret/store/ram_store'
@@ -0,0 +1,191 @@
1
+ module Ferret::Store
2
# Number of characters held by each in-memory stream buffer.
BUFFER_SIZE = 1024
# Blank template of BUFFER_SIZE characters. The buffered streams clone it to
# obtain their own working buffer, so it must not be frozen.
BUFFER = ' ' * BUFFER_SIZE
4
+
5
# Abstract base class for IndexOutput implementations that gather their
# writes in an in-memory buffer. Subclasses provide +flush_buffer+ to move
# the buffered bytes to the underlying storage, and +length+.
class BufferedIndexOutput < IndexOutput

  def initialize
    @buffer = BUFFER.clone  # private working copy of the blank template
    @buffer_start = 0       # file position corresponding to buffer index 0
    @buffer_position = 0    # next index in the buffer to be written
  end

  # Appends one byte to the buffer, flushing first if the buffer is full.
  #
  # b:: the byte to write
  def write_byte(b)
    # The original author measured a ~5% speed-up from assigning first and
    # flushing only when an IndexError is raised for an out-of-range index;
    # that variant is not used here, the explicit bounds check is.
    flush unless @buffer_position < BUFFER_SIZE
    @buffer[@buffer_position] = b
    @buffer_position += 1
  end

  # Writes the first +length+ elements of +buf+, one byte at a time.
  #
  # buf::    the bytes to write
  # length:: the number of bytes to write
  def write_bytes(buf, length)
    length.times { |idx| write_byte(buf[idx]) }
  end

  # Hands everything buffered so far to +flush_buffer+ and resets the
  # buffer so that it starts at the current file position.
  def flush
    flush_buffer(@buffer, @buffer_position)
    @buffer_start += @buffer_position
    @buffer_position = 0
  end

  # Closes this stream to further operations; pending output is flushed.
  def close
    flush
  end

  # Returns the position in the file at which the next write will occur.
  def pos
    @buffer_start + @buffer_position
  end

  # Flushes pending output, then moves the position at which the next write
  # will occur to +pos+.
  def seek(pos)
    flush
    @buffer_start = pos
  end

  # The number of bytes in the file. Must be implemented by subclasses.
  def length
    raise NotImplementedError
  end

  private

  # Expert: the actual buffer write. Implementations write the first +len+
  # bytes of +buf+ to the output.
  #
  # buf:: the bytes to write
  # len:: the number of bytes to write
  def flush_buffer(buf, len)
    raise NotImplementedError
  end
end
82
+
83
# Abstract base class for IndexInput implementations that read through an
# in-memory buffer. Subclasses provide +read_internal+ and +seek_internal+;
# +refill+ additionally calls +length+ to avoid reading past the end.
class BufferedIndexInput < IndexInput
  def initialize
    @buffer = nil         # allocated lazily by refill
    @buffer_start = 0     # file position corresponding to buffer index 0
    @buffer_length = 0    # number of valid bytes currently in the buffer
    @buffer_position = 0  # next index in the buffer to be read
  end

  # Reads and returns the next single byte, refilling the buffer from the
  # file when it has been used up.
  def read_byte
    refill unless @buffer_position < @buffer_length
    value = @buffer[@buffer_position]
    @buffer_position += 1
    value
  end

  # Reads +len+ bytes into +buffer+, starting at index +offset+ of +buffer+.
  #
  # buffer::  the string buffer to read the bytes into
  # offset::  the index in +buffer+ at which to start storing
  # len::     the number of bytes to read
  # returns:: the buffer
  def read_bytes(buffer, offset, len)
    if len >= BUFFER_SIZE
      # Large request: bypass the internal buffer and read all at once.
      start = pos()
      seek_internal(start)
      read_internal(buffer, offset, len)

      # Move the stream past the bytes just read and empty the internal
      # buffer so that the next read triggers a refill.
      @buffer_start = start + len
      @buffer_position = 0
      @buffer_length = 0
    else
      # Small request: go through the buffer byte by byte.
      len.times { |i| buffer[offset + i] = read_byte }
    end
    buffer
  end

  # Returns the position in the file at which the next read will occur.
  def pos
    @buffer_start + @buffer_position
  end

  # Sets the position in the file at which the next read will occur.
  def seek(pos)
    buffer_end = @buffer_start + @buffer_length
    if pos >= @buffer_start && pos < buffer_end
      # Target is already buffered; just move within the buffer.
      @buffer_position = pos - @buffer_start
    else
      @buffer_start = pos
      @buffer_position = 0
      @buffer_length = 0  # forces a refill on the next read
      seek_internal(pos)
    end
  end

  # Creates a clone of this BufferedIndexInput with its own copy of the
  # buffer, so that reading from one does not disturb the buffered data or
  # read position of the other.
  def clone
    copy = super
    copy.buffer = @buffer.clone if @buffer
    copy
  end

  attr_writer :buffer
  protected :buffer=

  private

  # Expert: the actual read. Implementations read +len+ bytes from the
  # current position in the input.
  #
  # buf::    the buffer to read bytes into
  # offset:: the index in +buf+ at which to start storing bytes
  # len::    the number of bytes to read
  def read_internal(buf, offset, len)
    raise NotImplementedError
  end

  # Expert: the actual seek. Implementations set the position at which the
  # next +read_internal+ will occur.
  #
  # pos:: the position to set to
  def seek_internal(pos)
    raise NotImplementedError
  end

  # Refills the buffer from the file, starting at the current position and
  # never reading past the end of the file. Raises EOFError when there is
  # nothing left to read.
  def refill
    start = @buffer_start + @buffer_position
    last = [start + BUFFER_SIZE, length].min
    @buffer_length = last - start
    raise EOFError if @buffer_length <= 0

    @buffer ||= BUFFER.clone  # allocate the buffer lazily
    read_internal(@buffer, 0, @buffer_length)

    @buffer_start = start
    @buffer_position = 0
  end
end
191
+ end
@@ -0,0 +1,139 @@
1
+ module Ferret::Store
2
# A Directory is the object through which the index storage is accessed.
# Ruby's IO API is deliberately not used directly, so that different
# storage mechanisms can hold the index, for example:
#
# * file system based storage
# * RAM based storage
# * database based storage
#
# NOTE: Once a file has been written and closed it can no longer be
# modified; to change it, it must be deleted and rewritten. This is why the
# method that opens a file for writing is called _create_output_, while the
# method that opens a file for reading is called _open_input_. If there is
# a risk of simultaneous modification of the files then locks should be
# used. See Lock to find out how.
#
# Every method except +file_count+ raises NotImplementedError here and is
# meant to be supplied by a concrete subclass.
class Directory
  # Yields the name of each file in the directory.
  def each # :yields: file_name
    raise NotImplementedError
  end

  # Returns the number of files in the directory, counted by iterating
  # with +each+.
  def file_count
    total = 0
    each { |_file_name| total += 1 }
    total
  end

  # Returns true if a file with the given name exists.
  def exists?(file)
    raise NotImplementedError
  end

  # Returns the time the named file was last modified.
  def modified(file)
    raise NotImplementedError
  end

  # Sets the modified time of an existing file to now.
  def touch(file)
    raise NotImplementedError
  end

  # Removes an existing file from the directory.
  def delete(file)
    raise NotImplementedError
  end

  # Renames an existing file in the directory. If a file with the new name
  # already exists it is replaced; the replacement should be atomic.
  def rename(from, to)
    raise NotImplementedError
  end

  # Returns the length of a file in the directory.
  def length(file)
    raise NotImplementedError
  end

  # Creates a new, empty file with the given name in the directory and
  # returns a stream that writes to it.
  def create_output(file_name)
    raise NotImplementedError
  end

  # Returns a stream that reads an existing file.
  def open_input(file_name)
    raise NotImplementedError
  end

  # Constructs a Lock.
  def make_lock(lock_name)
    raise NotImplementedError
  end

  # Closes the store.
  def close
    raise NotImplementedError
  end

end
83
+
84
# A Lock guards a data source so that no more than one output stream can
# access it at a time. Locks may be disabled; a read only index stored on
# a CDROM, for example, has no need for one.
#
# A lock can be used in two ways. Either explicitly:
#
#   write_lock = @directory.make_lock(LOCK_NAME)
#   write_lock.obtain(WRITE_LOCK_TIME_OUT)
#     ... # Do your file modifications # ...
#   write_lock.release()
#
# or through +while_locked+, which makes sure the lock is released once
# processing has finished:
#
#   write_lock = @directory.make_lock(LOCK_NAME)
#   write_lock.while_locked(WRITE_LOCK_TIME_OUT) do
#     ... # Do your file modifications # ...
#   end
#
# +obtain+, +release+ and +locked?+ raise NotImplementedError here and are
# meant to be supplied by a concrete subclass.
class Lock
  # Number of attempts made to obtain the lock before the application
  # gives up. To make the process wait longer for the lock, increase the
  # lock timeout.
  MAX_ATTEMPTS = 5

  # Obtains the lock on the data source. If long waits on the lock are
  # expected (e.g. several large batch updates on one index), pass a large
  # +lock_timeout+; the default of 1 second should be fine in most cases.
  def obtain(lock_timeout = 1)
    raise NotImplementedError
  end

  # Releases the lock on the data source.
  def release
    raise NotImplementedError
  end

  # Returns true if there is a lock on the data source.
  def locked?
    raise NotImplementedError
  end

  # Obtains the lock, runs the given block and releases the lock when the
  # block terminates, whether normally or by raising. The lock timeout is
  # in seconds. Returns the value of the block.
  def while_locked(lock_timeout=1)
    # obtain stays outside the begin/ensure on purpose: if it raises, the
    # lock was never taken and release must not be called.
    obtain(lock_timeout)
    begin
      yield
    ensure
      release
    end
  end
end
139
+ end
@@ -0,0 +1,338 @@
1
+ module Ferret::Store
2
+
3
+ require 'monitor'
4
+ require 'fileutils'
5
+ require 'digest/md5'
6
+
7
+ # This is a filesystem implementation of Directory and will be the one
8
+ # usually used for storing the index. This implementation stores each
9
+ # separate file as a separate file on the operating system. This works fine
10
+ # and is the most efficient solution for small to medium size indexes. For
11
+ # very large indexes, there may be a problem with the operating system not
12
+ # wanting to open to many files. One fix for this is to change the maximum
13
+ # open files setting in your operating system. Alternatively you could use
14
+ # a compound file instead.
15
+ #
16
+ # TODO:
17
+ # * need a better way of setting properties. Currently you have to
18
+ # change the constants to disable locking.
19
+ class FSDirectory < Directory
20
+ include MonitorMixin
21
+
22
+ # This cache of directories ensures that there is a unique Directory
23
+ # instance per path, so that synchronization on the Directory can be used to
24
+ # synchronize access between readers and writers.
25
+ @@Directories = Hash.new.extend(MonitorMixin)
26
+
27
+
28
+ # Locks should be disabled it there is no need for them
29
+ LOCKS_DISABLED = false
30
+
31
+ # The lock dir is the directory where the file locks will be stored
32
+ LOCK_DIR = nil
33
+
34
+ # Returns the directory instance for the named location.
35
+ #
36
+ # Directories are cached, so that, for a given canonical path, the same
37
+ # FSDirectory instance will always be returned. This permits
38
+ # synchronization on directories.
39
+ #
40
+ # path:: the path to the directory.
41
+ # create:: if true, create, or erase any existing contents.
42
+ def FSDirectory.get_directory(path, create=false)
43
+ dir = nil
44
+ @@Directories.synchronize do
45
+ dir = @@Directories[path]
46
+ if not dir then
47
+ dir = FSDirectory.new(path, create)
48
+ @@Directories[path] = dir
49
+ end
50
+ dir.refresh if create
51
+ end
52
+ dir.synchronize do
53
+ dir.reference()
54
+ end
55
+ return dir
56
+ end
57
+
58
+ # Returns true if locks have been disabled
59
+ def FSDirectory.locks_disabled?
60
+ LOCKS_DISABLED
61
+ end
62
+
63
+ # Set the directory where all of the locks will be stored.
64
+ # path:: the path to the directory where the locks will be stored.
65
+ # An exception will be raised if the directory does not exist
66
+ def lock_dir=(path)
67
+ # close the old lock dir if it exists
68
+ @lock_dir.close() if @lock_dir
69
+ @lock_dir = Dir.new(path)
70
+ end
71
+
72
+ # Returns a Dir object of the directory where the lock is stored
73
+ attr_reader :lock_dir
74
+
75
+ # Remove all files and locks from this directory so we have a clean instance
76
+ def refresh
77
+ synchronize do
78
+ # delete all the files
79
+ each do |fname|
80
+ File.delete(dir_path(fname))
81
+ end
82
+ # clear all the locks
83
+ refresh_lock_dir
84
+ @lock_dir.each do |lock_fname|
85
+ next if lock_fname == '.' or lock_fname == '..'
86
+ File.delete(@lock_dir.path + '/' + lock_fname)
87
+ end
88
+ end
89
+ end
90
+
91
+ #--
92
+ # Directory implementation
93
+ #++
94
+
95
+ # Iterates through the file listing, skipping lock files if they exist
96
+ def each()
97
+ refresh_dir
98
+ @dir.each do |file_name|
99
+ # return all files except for the current and parent directories
100
+ # and any lock files that exist in this directory
101
+ next if ['.', '..'].include?(file_name)
102
+ next if file_name =~ Regexp.new('^' + lock_prefix)
103
+ yield file_name
104
+ end
105
+ end
106
+
107
+ # Returns true if a file with the given name exists.
108
+ def exists?(name)
109
+ File.exists?(dir_path(name))
110
+ end
111
+
112
+ # Returns the time the named file was last modified.
113
+ def modified(name)
114
+ File.mtime(dir_path(name))
115
+ end
116
+
117
+ # Set the modified time of an existing file to now.
118
+ def touch(name)
119
+ # just open the file and close it. No need to do anything with it.
120
+ FileUtils.touch(dir_path(name))
121
+ end
122
+
123
+ # Removes an existing file in the directory.
124
+ def delete(name)
125
+ begin
126
+ File.delete(dir_path(name))
127
+ rescue SystemCallError => e
128
+ raise IOError, e.to_s
129
+ end
130
+ end
131
+
132
+ # Renames an existing file in the directory.
133
+ # If a file already exists with the new name, then it is replaced.
134
+ # This replacement should be atomic.
135
+ def rename(from, to)
136
+ File.rename(dir_path(from), dir_path(to))
137
+ end
138
+
139
+
140
+ # Returns the length of a file in the directory.
141
+ def length(name)
142
+ File.size(dir_path(name))
143
+ end
144
+
145
+ # Creates a new, empty file in the directory with the given name.
146
+ # Returns a stream writing this file.
147
+ def create_output(name)
148
+ FSIndexOutput.new(dir_path(name))
149
+ end
150
+
151
+ # Returns a stream reading an existing file.
152
+ def open_input(name)
153
+ FSIndexInput.new(dir_path(name))
154
+ end
155
+
156
+ # Construct a Lock.
157
+ def make_lock(name)
158
+ FSLock.new(@lock_dir.path + "/" + lock_prefix() + name)
159
+ end
160
+
161
+ # Closes the store.
162
+ def close()
163
+ @ref_count -= 1
164
+ if (@ref_count <=0) then
165
+ @@Directories.synchronize do
166
+ @@Directories.delete(@dir.path)
167
+ end
168
+ end
169
+ end
170
+
171
+ def reference()
172
+ @ref_count += 1
173
+ end
174
+
175
+ # See Lock for hints as to how to use locks.
176
+ class FSLock < Lock
177
+ # pass the name of the file that we are going to lock
178
+ def initialize(lock_file)
179
+ @lock_file = lock_file
180
+ end
181
+
182
+ # obtain the lock on the data source
183
+ def obtain(lock_timeout = 1)
184
+ return true if FSDirectory.locks_disabled?
185
+ MAX_ATTEMPTS.times do
186
+ begin
187
+ # create a file if none exists. If one already exists
188
+ # then someone beat us to the lock so return false
189
+ File.open(@lock_file, File::WRONLY|File::EXCL|File::CREAT) {|f|}
190
+ return true
191
+ rescue SystemCallError
192
+ # lock was not obtained so sleep for timeout then try again.
193
+ sleep(lock_timeout)
194
+ end
195
+ end
196
+ # lock could not be obtained so raise an exception
197
+ raise "could not obtain lock: " + @lock_file.to_s
198
+ end
199
+
200
+ # Release the lock on the data source. Returns true if successful.
201
+ def release
202
+ return if FSDirectory.locks_disabled?
203
+ begin
204
+ File.delete(@lock_file)
205
+ rescue SystemCallError
206
+ # maybe we tried to release a lock that wasn't locked. This
207
+ # isn't critical so just return false
208
+ return false
209
+ end
210
+ return true
211
+ end
212
+
213
+ # returns true if there is a lock on the data source
214
+ def locked?
215
+ return false if FSDirectory.locks_disabled?
216
+ File.exists?(@lock_file)
217
+ end
218
+ end
219
+
220
+ # A file system output stream extending OutputStream to read from the file system
221
+ class FSIndexOutput < BufferedIndexOutput
222
+ def initialize(path)
223
+ super()
224
+ @file = File.open(path, "wb")
225
+ end
226
+
227
+ def close
228
+ super()
229
+ @file.close
230
+ end
231
+
232
+ def seek(pos)
233
+ super(pos)
234
+ @file.seek(pos)
235
+ end
236
+
237
+ private
238
+ def flush_buffer(b, size)
239
+ @file.syswrite(b[0...size])
240
+ end
241
+ end
242
+
243
+ # A file system input stream extending InputStream to read from the file system
244
+ class FSIndexInput < BufferedIndexInput
245
+ attr_writer :is_clone
246
+ attr_reader :length
247
+ attr_reader :file
248
+
249
+ def initialize(path)
250
+ @file = File.open(path, "rb")
251
+ @file.extend(MonitorMixin)
252
+ @length = File.size(path)
253
+ @is_clone = false
254
+ super()
255
+ end
256
+
257
+ def close
258
+ @file.close if not @is_clone
259
+ end
260
+
261
+ # We need to record if this is a clone so we know when to close the file.
262
+ # The file should only be closed when the original FSIndexInput is closed.
263
+ def clone()
264
+ fsii = super
265
+ fsii.is_clone = true
266
+ fsii.file.seek(@file.pos)
267
+ return fsii
268
+ end
269
+
270
+ private
271
+
272
+ def read_internal(b, offset, length)
273
+ @file.synchronize do
274
+ position = pos()
275
+ if position != @file.pos
276
+ @file.seek(position)
277
+ end
278
+ bytes = @file.read(length)
279
+ if bytes.nil?
280
+ raise EOFError, "Read past EOF in #{@file.path}"
281
+ end
282
+ b[offset, bytes.length] = bytes
283
+ end
284
+ end
285
+
286
+ def seek_internal(pos)
287
+ @file.seek(pos)
288
+ end
289
+
290
+ end
291
+
292
+ private
293
+ # Create a new directory from the path.
294
+ # path:: the path to the directory.
295
+ # create:: if true, create, or erase any existing contents.
296
+ def initialize(path, create)
297
+ super()
298
+ if create then FileUtils.mkdir_p(path) end
299
+ if not File.directory?(path) then
300
+ raise "There is no directory: #{path}. Use create = true to create one"
301
+ end
302
+ @dir = Dir.new(path)
303
+ # put the lock_dir here as well if no default exists.
304
+ if LOCK_DIR then
305
+ @lock_dir = Dir.new(LOCK_DIR)
306
+ else
307
+ @lock_dir = Dir.new(path)
308
+ end
309
+ @ref_count = 0
310
+ end
311
+
312
+ # Add the directory path to the file name for opening
313
+ def dir_path(name)
314
+ File.join(@dir.path, name)
315
+ end
316
+
317
+ # returns the lock prefix for this directory
318
+ def lock_prefix
319
+ 'ferret-' + Digest::MD5.hexdigest(@dir.path)
320
+ end
321
+
322
+ # Unfortunately, on Windows, Dir does not refresh when rewind is called
323
+ # so any new files will be hidden. So we open the directory again.
324
+ def refresh_dir()
325
+ tmp = Dir.new(@dir.path)
326
+ @dir.close
327
+ @dir = tmp
328
+ end
329
+
330
+ def refresh_lock_dir()
331
+ tmp = Dir.new(@lock_dir.path)
332
+ @lock_dir.close
333
+ @lock_dir = tmp
334
+ end
335
+
336
+ #end private
337
+ end
338
+ end