ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/store.rb
DELETED
@@ -1,190 +0,0 @@
|
|
1
|
-
module Ferret::Store
|
2
|
-
BUFFER_SIZE = 1024
|
3
|
-
BUFFER = " " * BUFFER_SIZE
|
4
|
-
|
5
|
-
# Abstract IndexOutput that stages written bytes in an in-memory buffer of
# BUFFER_SIZE bytes. Subclasses supply +flush_buffer+ to move the staged
# bytes to their storage; +length+ is also left for subclasses.
class BufferedIndexOutput < IndexOutput

  # Starts with a private copy of the shared BUFFER template and both
  # offsets at zero.
  def initialize
    @buffer = BUFFER.clone
    @buffer_start = 0     # file offset at which the buffer begins
    @buffer_position = 0  # offset of the next write inside the buffer
  end

  # Writes a single byte, emptying the buffer first when it is full.
  #
  # The original author noted that an exception-driven variant (write first,
  # flush and retry on IndexError) measured about 5% faster; it is not
  # enabled here.
  #
  # b:: the byte to write
  def write_byte(b)
    if @buffer_position >= BUFFER_SIZE
      flush
    end
    @buffer[@buffer_position] = b
    @buffer_position += 1
  end

  # Writes the first +length+ bytes of +buf+, one byte at a time.
  #
  # buf::    the bytes to write
  # length:: the number of bytes to write
  def write_bytes(buf, length)
    length.times { |idx| write_byte(buf[idx]) }
  end

  # Hands the staged bytes to +flush_buffer+ and advances the buffer window
  # past them.
  def flush
    flush_buffer(@buffer, @buffer_position)
    @buffer_start += @buffer_position
    @buffer_position = 0
  end

  # Closes this stream to further operations; here that only flushes.
  def close
    flush
  end

  # The position in the file at which the next write will occur.
  def pos
    @buffer_start + @buffer_position
  end

  # Flushes pending bytes, then moves the write position to +pos+.
  def seek(pos)
    flush
    @buffer_start = pos
  end

  # The number of bytes in the file. Must be provided by subclasses.
  def length
    raise NotImplementedError
  end

  private

  # Expert: writes the first +len+ bytes of +buf+ to the output. Must be
  # provided by subclasses.
  #
  # buf:: the bytes to write
  # len:: the number of bytes to write
  def flush_buffer(buf, len)
    raise NotImplementedError
  end
end
|
82
|
-
|
83
|
-
# Abstract IndexInput that serves reads from an in-memory buffer which is
# refilled from the underlying storage on demand. Subclasses supply
# +read_internal+ and +seek_internal+; +refill+ also calls +length+, which
# is not defined in this class.
class BufferedIndexInput < IndexInput

  # The buffer itself is allocated lazily on the first refill.
  def initialize
    @buffer = nil
    @buffer_start = 0
    @buffer_length = 0
    @buffer_position = 0
  end

  # Reads and returns a single byte, refilling the buffer when it has been
  # used up.
  def read_byte
    refill unless @buffer_position < @buffer_length
    value = @buffer[@buffer_position]
    @buffer_position += 1
    value
  end

  # Reads +len+ bytes into +buffer+, starting at +offset+ in +buffer+.
  # Requests shorter than BUFFER_SIZE go through the internal buffer byte by
  # byte; longer ones are read directly and invalidate the internal buffer.
  #
  # buffer::  the string buffer to read the bytes into
  # offset::  the position in +buffer+ to start writing to
  # len::     the number of bytes to read
  # returns:: the buffer
  def read_bytes(buffer, offset, len)
    if len >= BUFFER_SIZE
      # read all at once, straight from the current file position
      start = pos
      seek_internal(start)
      read_internal(buffer, offset, len)

      @buffer_start = start + len
      @buffer_position = 0
      @buffer_length = 0 # forces a refill on the next read
    else
      len.times { |n| buffer[offset + n] = read_byte }
    end
    buffer
  end

  # The position in the file at which the next read will occur.
  def pos
    @buffer_start + @buffer_position
  end

  # Moves the read position to +pos+. A target inside the buffered window
  # only moves the in-buffer offset; anything else empties the buffer and
  # repositions the underlying input.
  def seek(pos)
    window_end = @buffer_start + @buffer_length
    if pos >= @buffer_start && pos < window_end
      @buffer_position = pos - @buffer_start
    else
      @buffer_start = pos
      @buffer_position = 0
      @buffer_length = 0 # forces a refill on the next read
      seek_internal(pos)
    end
  end

  # Gives a clone of a BufferedIndexInput its own copy of the buffer, so
  # that reading from one does not disturb the buffered data of the other.
  def initialize_copy(o)
    super
    source = o.buffer
    @buffer = source.clone if source
  end

  attr_reader :buffer
  protected :buffer

  private

  # Expert: reads bytes from the current position in the input. Must be
  # provided by subclasses.
  #
  # buf::    the buffer to read bytes into
  # offset:: the offset in +buf+ at which to start storing bytes
  # len::    the number of bytes to read
  def read_internal(buf, offset, len)
    raise NotImplementedError
  end

  # Expert: sets the position at which the next +read_internal+ will occur.
  # Must be provided by subclasses.
  #
  # pos:: the position to set to
  def seek_internal(pos)
    raise NotImplementedError
  end

  # Loads up to BUFFER_SIZE bytes from the current position into the buffer,
  # never reading past +length+. Raises EOFError when nothing is left.
  def refill
    start = @buffer_start + @buffer_position
    file_size = length
    stop = start + BUFFER_SIZE
    stop = file_size if stop > file_size # don't read past EOF

    @buffer_length = stop - start
    raise EOFError if @buffer_length <= 0

    @buffer ||= BUFFER.clone # allocate the buffer lazily
    read_internal(@buffer, 0, @buffer_length)

    @buffer_start = start
    @buffer_position = 0
  end
end
|
190
|
-
end
|
@@ -1,141 +0,0 @@
|
|
1
|
-
module Ferret::Store
|
2
|
-
# A Directory is the object through which index storage is accessed. Ruby's
# IO API is not used directly so that different storage mechanisms can hold
# the index, for example:
#
# * File system based storage
# * RAM based storage
# * Database based storage
#
# NOTE: Once a file has been written and closed it can no longer be
# modified. To change it, the file must be deleted and rewritten. For this
# reason the method that opens a file for writing is called
# _create_output_, while the method that opens a file for reading is called
# _open_input_. If there is a risk of simultaneous modification of the files
# then locks should be used. See Lock to find out how.
#
# Every method except +file_count+ is abstract here and raises
# NotImplementedError.
class Directory
  LOCK_PREFIX = "ferret-"

  # Yields the name of each file in the directory.
  def each # :yields: file_name
    raise NotImplementedError
  end

  # Returns the number of files in the directory, counted by walking +each+.
  def file_count
    total = 0
    each { total += 1 }
    total
  end

  # Returns true if a file with the given name exists.
  def exists?(file)
    raise NotImplementedError
  end

  # Returns the time the named file was last modified.
  def modified(file)
    raise NotImplementedError
  end

  # Sets the modified time of an existing file to now.
  def touch(file)
    raise NotImplementedError
  end

  # Removes an existing file from the directory.
  def delete(file)
    raise NotImplementedError
  end

  # Renames an existing file in the directory. If a file already exists
  # under the new name it is replaced. This replacement should be atomic.
  def rename(from, to)
    raise NotImplementedError
  end

  # Returns the length of a file in the directory.
  def length(file)
    raise NotImplementedError
  end

  # Creates a new, empty file in the directory with the given name and
  # returns a stream writing to it.
  def create_output(file_name)
    raise NotImplementedError
  end

  # Returns a stream reading an existing file.
  def open_input(file_name)
    raise NotImplementedError
  end

  # Constructs a Lock.
  def make_lock(lock_name)
    raise NotImplementedError
  end

  # Closes the store.
  def close
    raise NotImplementedError
  end

end
|
85
|
-
|
86
|
-
# A Lock guards a data source so that no more than one output stream can
# access it at a time. Locks may be disabled; a read-only index stored on a
# CD-ROM, for example, has no need for one.
#
# A lock can be used in two ways. Explicitly:
#
#   write_lock = @directory.make_lock(LOCK_NAME)
#   write_lock.obtain(WRITE_LOCK_TIME_OUT)
#     ... # Do your file modifications # ...
#   write_lock.release()
#
# Or through +while_locked+, which guarantees that the lock is released
# once processing has finished:
#
#   write_lock = @directory.make_lock(LOCK_NAME)
#   write_lock.while_locked(WRITE_LOCK_TIME_OUT) do
#     ... # Do your file modifications # ...
#   end
#
# +obtain+, +release+ and +locked?+ are abstract here and raise
# NotImplementedError.
class Lock
  # Number of attempts made to obtain the lock before giving up. To make
  # the process wait longer for the lock, increase the lock timeout.
  MAX_ATTEMPTS = 5

  # Obtains the lock on the data source. If a long wait is expected, pass a
  # larger +lock_timeout+; this may be needed for multiple large batch
  # updates on an index, but the default of 1 second should be fine in most
  # cases.
  def obtain(lock_timeout = 1)
    raise NotImplementedError
  end

  # Releases the lock on the data source.
  def release
    raise NotImplementedError
  end

  # Returns true if there is a lock on the data source.
  def locked?
    raise NotImplementedError
  end

  # Obtains the lock, runs the block and releases the lock when the block
  # terminates, whether normally or by an exception. The lock timeout is in
  # seconds. The lock is obtained outside the protected region, so a failed
  # +obtain+ does not trigger a +release+.
  def while_locked(lock_timeout = 1)
    obtain(lock_timeout)
    begin
      yield
    ensure
      release
    end
  end
end
|
141
|
-
end
|
@@ -1,381 +0,0 @@
|
|
1
|
-
module Ferret::Store
|
2
|
-
|
3
|
-
require 'monitor'
|
4
|
-
require 'fileutils'
|
5
|
-
require 'digest/md5'
|
6
|
-
|
7
|
-
# This is a filesystem implementation of Directory and will be the one
|
8
|
-
# usually used for storing the index. This implementation stores each
|
9
|
-
# separate file as a separate file on the operating system. This works fine
|
10
|
-
# and is the most efficient solution for small to medium size indexes. For
|
11
|
-
# very large indexes, there may be a problem with the operating system not
|
12
|
-
# wanting to open too many files. One fix for this is to change the maximum
|
13
|
-
# open files setting in your operating system. Alternatively you could use
|
14
|
-
# a compound file instead.
|
15
|
-
#
|
16
|
-
# TODO:
|
17
|
-
# * need a better way of setting properties. Currently you have to
|
18
|
-
# change the constants to disable locking.
|
19
|
-
class FSDirectory < Directory
|
20
|
-
include MonitorMixin
|
21
|
-
|
22
|
-
# This cache of directories ensures that there is a unique Directory
|
23
|
-
# instance per path, so that synchronization on the Directory can be used to
|
24
|
-
# synchronize access between readers and writers.
|
25
|
-
@@Directories = Hash.new.extend(MonitorMixin)
|
26
|
-
|
27
|
-
|
28
|
-
# Locks should be disabled if there is no need for them
|
29
|
-
LOCKS_DISABLED = false
|
30
|
-
|
31
|
-
# The lock dir is the directory where the file locks will be stored
|
32
|
-
LOCK_DIR = nil
|
33
|
-
|
34
|
-
# Opens the directory at +path+.
#
# path::   the path to the directory.
# create:: if true, the directory (and any missing parents) is created
#          first. Existing contents are not touched by this method.
#
# Raises IOError when +path+ is not a directory. Lock files live in
# LOCK_DIR when that constant is set, otherwise in +path+ itself.
def initialize(path, create)
  super()
  FileUtils.mkdir_p(path) if create
  unless File.directory?(path)
    raise IOError, "There is no directory: #{path}. Use create = true to create one"
  end
  @dir = Dir.new(path)
  @lock_dir = Dir.new(LOCK_DIR || path)
  @ref_count = 0
end
|
52
|
-
|
53
|
-
# Keeps the stock constructor reachable under the name +allocate+ (no longer
# public) so that a caching FSDirectory.new can be defined on top of it.
# This must run before FSDirectory.new is redefined.
class << FSDirectory
  alias_method :allocate, :new
  protected :allocate
end
|
57
|
-
|
58
|
-
# Returns the directory instance for the named location.
#
# Instances are cached per +path+ string, so the same FSDirectory is handed
# out for the same path. This permits synchronization on directories. Each
# call also bumps the instance's reference count.
#
# path::   the path to the directory.
# create:: if true, create the directory, or erase any existing contents.
def self.new(path, create = false)
  dir = nil
  @@Directories.synchronize do
    dir = (@@Directories[path] ||= FSDirectory.allocate(path, create))
    dir.refresh if create
  end
  dir.synchronize { dir.reference }
  dir
end
|
81
|
-
|
82
|
-
# Reports whether locking is switched off, i.e. the value of the
# LOCKS_DISABLED constant.
def self.locks_disabled?
  LOCKS_DISABLED
end
|
86
|
-
|
87
|
-
# Sets the directory in which all lock files will be stored. Any lock
# directory handle that is already open is closed first.
#
# path:: the path to the directory where the locks will be stored.
#        An exception is raised if the directory does not exist.
def lock_dir=(path)
  previous = @lock_dir
  previous.close if previous
  @lock_dir = Dir.new(path)
end
|
95
|
-
|
96
|
-
# Returns a Dir object of the directory where the lock is stored
|
97
|
-
attr_reader :lock_dir
|
98
|
-
|
99
|
-
# Removes every file and every lock belonging to this directory so that the
# instance starts out clean. Runs under this directory's monitor.
def refresh
  synchronize do
    # delete all the index files
    refresh_dir
    each { |fname| FileUtils.rm_rf(dir_path(fname)) }

    # clear everything in the lock directory except the dot entries
    refresh_lock_dir
    @lock_dir.each do |lock_fname|
      unless lock_fname == '.' || lock_fname == '..'
        FileUtils.rm_rf(@lock_dir.path + '/' + lock_fname)
      end
    end
  end
end
|
115
|
-
|
116
|
-
#--
|
117
|
-
# Directory implementation
|
118
|
-
#++
|
119
|
-
|
120
|
-
# Yields the name of every entry in the directory except '.', '..' and
# entries whose name starts with the lock prefix.
#
# The prefix is fetched once and compared literally. Building a Regexp from
# it for every entry treated characters such as '.' in the prefix as
# wildcards, and '^' matches at the start of any line rather than only at
# the start of the name.
def each # :yields: file_name
  refresh_dir
  prefix = lock_prefix
  @dir.each do |file_name|
    next if file_name == '.' || file_name == '..'
    next if file_name.start_with?(prefix)
    yield file_name
  end
end
|
131
|
-
|
132
|
-
# Returns true if a file with the given name exists in this directory.
#
# Uses File.exist?; the File.exists? alias was removed in Ruby 3.2.
def exists?(name)
  File.exist?(dir_path(name))
end
|
136
|
-
|
137
|
-
# Returns the modification time (File.mtime) of the named file.
def modified(name)
  full_path = dir_path(name)
  File.mtime(full_path)
end
|
141
|
-
|
142
|
-
# Sets the modified time of the named file to now by delegating to
# FileUtils.touch.
def touch(name)
  full_path = dir_path(name)
  FileUtils.touch(full_path)
end
|
147
|
-
|
148
|
-
# Removes an existing file from the directory. Operating-system level
# failures (SystemCallError) are re-raised as IOError carrying the same
# message.
def delete(name)
  File.delete(dir_path(name))
rescue SystemCallError => e
  raise IOError, e.to_s
end
|
156
|
-
|
157
|
-
# Renames an existing file in the directory. If a file already exists under
# the new name it is replaced. Runs under this directory's monitor.
#
# Three strategies are tried in turn:
# 1. File.rename;
# 2. delete the target, then FileUtils.mv;
# 3. copy the source over the target, then delete the source.
#
# The last step used to delete the target it had just written, which lost
# the copy and left the source in place.
def rename(from, to)
  synchronize do
    source = dir_path(from)
    target = dir_path(to)
    begin
      File.rename(source, target)
    rescue StandardError
      # try again, this time forcing the delete of the target
      FileUtils.rm_rf(target)
      begin
        FileUtils.mv(source, target)
      rescue StandardError
        FileUtils.cp(source, target)
        FileUtils.rm_rf(source)
      end
    end
  end
end
|
176
|
-
|
177
|
-
|
178
|
-
# Returns the size in bytes (File.size) of the named file.
def length(name)
  full_path = dir_path(name)
  File.size(full_path)
end
|
182
|
-
|
183
|
-
# Returns a new FSIndexOutput for the file with the given name inside this
# directory.
def create_output(name)
  full_path = dir_path(name)
  FSIndexOutput.new(full_path)
end
|
188
|
-
|
189
|
-
# Returns a new FSIndexInput for the file with the given name inside this
# directory.
def open_input(name)
  full_path = dir_path(name)
  FSIndexInput.new(full_path)
end
|
193
|
-
|
194
|
-
# Constructs an FSLock. Its lock file lives in the lock directory and is
# named <lock prefix><name>.lck.
def make_lock(name)
  base = @lock_dir.path + "/"
  FSLock.new(base + lock_prefix + name + ".lck")
end
|
198
|
-
|
199
|
-
# Closes the store. Each call drops one reference; once no references
# remain the instance is removed from the directory cache and
# +close_internal+ is run, both while holding the cache's monitor.
def close
  synchronize do
    @ref_count -= 1
    unless @ref_count > 0
      @@Directories.synchronize do
        @@Directories.delete(@dir.path)
        close_internal
      end
    end
  end
end
|
211
|
-
|
212
|
-
# Registers one more user of this directory by incrementing the reference
# count; returns the new count.
def reference
  @ref_count = @ref_count + 1
end
|
215
|
-
|
216
|
-
# A Lock represented by a file: the lock is held while the lock file
# exists. See Lock for hints as to how to use locks. Every operation is a
# no-op when FSDirectory.locks_disabled? is true.
class FSLock < Lock
  # lock_file:: the name of the file that represents the lock
  def initialize(lock_file)
    @lock_file = lock_file
    # Built by a class method on purpose: a proc created here would close
    # over +self+, keeping the lock reachable so its finalizer never runs.
    @clean = FSLock.make_finalizer(lock_file)
  end

  # Returns a finalizer that removes +lock_file+. Finalizers are called
  # with the id of the collected object, so the proc accepts and ignores it.
  def FSLock.make_finalizer(lock_file)
    proc { |_object_id| FileUtils.rm_rf(lock_file) }
  end

  # Obtains the lock on the data source by exclusively creating the lock
  # file. Makes up to MAX_ATTEMPTS attempts, sleeping +lock_timeout+ seconds
  # after each failed one. Returns true on success and raises a
  # RuntimeError when every attempt fails.
  def obtain(lock_timeout = 1)
    return true if FSDirectory.locks_disabled?
    MAX_ATTEMPTS.times do
      begin
        # EXCL|CREAT fails if the file already exists, which means someone
        # else beat us to the lock
        File.open(@lock_file, File::WRONLY|File::EXCL|File::CREAT) {|f|}
        ObjectSpace.define_finalizer(self, @clean)
        return true
      rescue SystemCallError
        # lock was not obtained so sleep for timeout then try again.
        sleep(lock_timeout)
      end
    end
    # lock could not be obtained so raise an exception
    raise "could not obtain lock: #{@lock_file}"
  end

  # Releases the lock on the data source by removing the lock file and
  # dropping the finalizer. Returns nil when locks are disabled, false when
  # a SystemCallError is raised, and true otherwise.
  def release
    return if FSDirectory.locks_disabled?
    begin
      FileUtils.rm_rf(@lock_file)
      ObjectSpace.undefine_finalizer(self)
    rescue SystemCallError
      # maybe we tried to release a lock that wasn't locked. This
      # isn't critical so just return false
      return false
    end
    return true
  end

  # Returns true if there is a lock on the data source, i.e. the lock file
  # exists. Uses File.exist?; File.exists? was removed in Ruby 3.2.
  def locked?
    return false if FSDirectory.locks_disabled?
    File.exist?(@lock_file)
  end
end
|
268
|
-
|
269
|
-
# A file system output stream extending OutputStream to write to the file
|
270
|
-
# system
|
271
|
-
# Buffered index output that writes to a file on disk.
class FSIndexOutput < BufferedIndexOutput
  # path:: full path of the file to create or truncate; opened in binary
  #        write mode
  def initialize(path)
    super()
    @file = File.open(path, "wb")
  end

  # Runs the superclass close (presumably flushing buffered data - confirm
  # against BufferedIndexOutput) and then closes the underlying file. The
  # file is closed even when the superclass close raises, so the handle is
  # never leaked. Returns nil.
  def close
    begin
      super()
    ensure
      @file.close
    end
    nil
  end

  # Moves both the buffered position (via the superclass) and the underlying
  # file position to +pos+.
  def seek(pos)
    super(pos)
    @file.seek(pos)
  end

  private
    # Writes the first +size+ bytes of the buffer +b+ straight to the file.
    def flush_buffer(b, size)
      @file.syswrite(b[0...size])
    end
end
|
292
|
-
|
293
|
-
# A file system input stream extending InputStream to read from the file system
|
294
|
-
# Buffered index input that reads from a file on disk.
class FSIndexInput < BufferedIndexInput
  attr_accessor :is_clone
  attr_reader :length, :file

  # path:: full path of the file to read; opened in binary read mode
  #
  # A missing file is reported as a StandardError carrying the message of
  # the underlying Errno::ENOENT.
  def initialize(path)
    @file =
      begin
        File.open(path, "rb")
      rescue Errno::ENOENT => missing
        raise StandardError, missing.message
      end
    @file.extend(MonitorMixin)
    @length = File.size(path)
    @is_clone = false
    super()
  end

  # Closes the underlying file, but only for the original input; clones
  # share the file and leave it open.
  def close
    @file.close unless @is_clone
  end

  # Copies are flagged as clones so that closing them does not close the
  # file, which belongs to the original FSIndexInput.
  def initialize_copy(o)
    super
    @is_clone = true
  end

  private

    # Reads +length+ bytes from the file at the current logical position
    # and stores them into +b+ starting at +offset+. Raises EOFError when
    # the file yields nothing.
    def read_internal(b, offset, length)
      wanted = pos()
      # the shared file handle may have been moved, so re-seek when needed
      @file.seek(wanted) unless wanted == @file.pos
      chunk = @file.read(length)
      raise EOFError, "Read past EOF in #{@file.path}" if chunk.nil?
      b[offset, chunk.length] = chunk
    end

    # Positions the underlying file at +pos+.
    def seek_internal(pos)
      @file.seek(pos)
    end
end
|
348
|
-
|
349
|
-
private
|
350
|
-
|
351
|
-
# Add the directory path to the file name for opening
|
352
|
-
# name:: file name relative to this directory
#
# Returns the name joined onto the directory's own path.
def dir_path(name)
  base = @dir.path
  File.join(base, name)
end
|
355
|
-
|
356
|
-
# returns the lock prefix for this directory
|
357
|
-
# Returns the prefix put in front of every lock file name for this
# directory (the LOCK_PREFIX constant).
def lock_prefix
  return LOCK_PREFIX
end
|
360
|
-
|
361
|
-
# Unfortunately, on Windows, Dir does not refresh when rewind is called
|
362
|
-
# so any new files will be hidden. So we open the directory again.
|
363
|
-
# Replaces @dir with a freshly opened Dir on the same path and closes the
# old handle. The new handle is opened before the old one is closed.
def refresh_dir
  path = @dir.path
  reopened = Dir.new(path)
  @dir.close
  @dir = reopened
end
|
368
|
-
|
369
|
-
# Replaces @lock_dir with a freshly opened Dir on the same path and closes
# the old handle. The new handle is opened before the old one is closed.
def refresh_lock_dir
  path = @lock_dir.path
  reopened = Dir.new(path)
  @lock_dir.close
  @lock_dir = reopened
end
|
374
|
-
|
375
|
-
# This method is only used by the c extension to free the directory
|
376
|
-
# Intentionally empty in the Ruby implementation; it is a hook that close
# calls once the last reference is released. Returns nil.
def close_internal
  nil
end
|
378
|
-
|
379
|
-
#end private
|
380
|
-
end
|
381
|
-
end
|