ferret 0.9.6 → 0.10.0
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/index/compound_file_io.rb
@@ -1,338 +0,0 @@
-require 'monitor'
-
-module Ferret::Index
-
-  # Class for accessing a compound stream.
-  # This class implements a directory, but is limited to only read operations.
-  # Directory methods that would normally modify data raise.
-  class CompoundFileReader < Ferret::Store::Directory
-
-    include MonitorMixin
-
-    attr_reader :directory, :file_name
-
-    # Creates a Compound File Reader which contains a single file and has
-    # pointers to the individual files within. When it is initialized, the
-    # compound file is set and the header is read so that it is ready to read
-    # the individual files within.
-    def initialize(dir, name)
-
-      super()
-
-      @directory = dir
-      @file_name = name
-      @entries = {}
-
-      success = false
-
-      begin
-        @stream = dir.open_input(name)
-
-        # read the directory and init files
-        count = @stream.read_vint()
-        entry = nil
-        count.times() do
-          offset = @stream.read_long()
-          id = @stream.read_string()
-
-          if (entry != nil)
-            # set length of the previous entry
-            entry.length = offset - entry.offset
-          end
-
-          entry = FileEntry.new(offset)
-          @entries[id] = entry
-        end
-
-        # set the length of the final entry
-        if (entry != nil)
-          entry.length = @stream.length() - entry.offset
-        end
-
-        success = true
-
-      ensure
-
-        if not success and (@stream != nil)
-          begin
-            @stream.close()
-          rescue IOError
-          end
-        end
-      end
-    end
-
-    def close()
-      synchronize do
-        if (@stream == nil): raise(IOError, "Already closed") end
-
-        @entries.clear()
-        @stream.close()
-        @stream = nil
-      end
-    end
-
-    def open_input(id)
-      synchronize do
-        if (@stream == nil)
-          raise(IOError, "Stream closed")
-        end
-
-        entry = @entries[id]
-        if (entry == nil)
-          raise(IOError, "No sub-file with id " + id + " found")
-        end
-        return CSIndexInput.new(@stream, entry.offset, entry.length)
-      end
-    end
-
-    # Returns an array of strings, one for each file in the directory.
-    def list()
-      return @entries.keys()
-    end
-
-    # Returns true iff a file with the given name exists.
-    def exists?(name)
-      return @entries.key?(name)
-    end
-
-    # Returns the time the named file was last modified.
-    def modified(name)
-      return @directory.modified(@file_name)
-    end
-
-    # Set the modified time of an existing file to now.
-    def touch(name)
-      @directory.touch(@file_name)
-    end
-
-    # Not implemented
-    def remove(name) raise(NotImplementedError) end
-
-    # Not implemented
-    def rename(from, to) raise(NotImplementedError) end
-
-    # Returns the length of a file in the directory.
-    def length(name)
-      e = @entries[name]
-      if (e == nil): raise(IOError, "File " + name + " does not exist") end
-      return e.length
-    end
-
-    # Not implemented
-    def create_output(name) raise(NotImplementedError) end
-
-    # Not implemented
-    def make_lock(name) raise(NotImplementedError) end
-
-    # Implementation of an IndexInput that reads from a portion of the
-    # compound file.
-    class CSIndexInput < Ferret::Store::BufferedIndexInput
-      attr_reader :length
-
-      def initialize(base, file_offset, length)
-        super()
-        @base = base
-        @base.extend(MonitorMixin)
-        @file_offset = file_offset
-        @length = length
-      end
-
-      # Closes the stream to further operations.
-      def close() end
-
-      private
-      # Expert: implements buffer refill. Reads bytes from the current
-      # position in the input.
-      #
-      # b::      the array to read bytes into
-      # offset:: the offset in the array to start storing bytes
-      # len::    the number of bytes to read
-      def read_internal(b, offset, len)
-        @base.synchronize() do
-          start = pos()
-          if(start + len > @length): raise(EOFError, "read past EOF") end
-          @base.seek(@file_offset + start)
-          @base.read_bytes(b, offset, len)
-        end
-      end
-
-      # Expert: implements seek. Sets current position in @file, where
-      # the next {@link #read_internal(byte[],int,int)} will occur.
-      def seek_internal(pos) end
-    end
-
-    private
-    # Base info
-    class FileEntry
-      attr_accessor :offset, :length
-      def initialize(offset)
-        @offset = offset
-      end
-    end
-
-  end
-
-  # Combines multiple files into a single compound file.
-  # The file format:
-  #
-  # * VInt fileCount
-  # * {Directory} fileCount entries with the following structure:
-  #   + long data_offset
-  #   + UTFString extension
-  # * {File Data} fileCount entries with the raw data of the corresponding file
-  #
-  # The fileCount integer indicates how many files are contained in this compound
-  # file. The {directory} that follows has that many entries. Each directory entry
-  # contains an encoding identifier, a long pointer to the start of this file's
-  # data section, and a UTF String with that file's extension.
-  class CompoundFileWriter
-
-    class StateError < Exception
-    end
-
-    attr_reader :directory, :file_name
-
-    # Create the compound stream in the specified file. The file name is the
-    # entire name (no extensions are added).
-    def initialize(dir, name)
-      @directory = dir
-      @file_name = name
-      @ids = Set.new
-      @file_entries = []
-      @merged = false
-    end
-
-    # Add a source stream. _file_name_ is the string by which the
-    # sub-stream will be known in the compound stream.
-    #
-    # Raises:: StateError if this writer is closed
-    # Raises:: ArgumentError if a file with the same name
-    #          has been added already
-    def add_file(file_name)
-      if @merged
-        raise(StateError, "Can't add extensions after merge has been called")
-      end
-
-      if not @ids.add?(file_name)
-        raise(ArgumentError, "File #{file_name} already added")
-      end
-
-      entry = FileEntry.new(file_name)
-      @file_entries << entry
-    end
-
-    # Merge files with the extensions added up to now.
-    # All files with these extensions are combined sequentially into the
-    # compound stream. After successful merge, the source files
-    # are deleted.
-    #
-    # Throws:: StateException if close() had been called before or
-    #          if no file has been added to this object
-    def close()
-
-      if @merged
-        raise(StateException, "Merge already performed")
-      end
-
-      if @file_entries.empty?
-        raise(StateException, "No entries to merge have been defined")
-      end
-
-      @merged = true
-
-      # open the compound stream
-      os = nil
-      begin
-        os = @directory.create_output(@file_name)
-
-        # Write the number of entries
-        os.write_vint(@file_entries.size)
-
-        # Write the directory with all offsets at 0.
-        # Remember the positions of directory entries so that we can
-        # adjust the offsets later
-        @file_entries.each do |fe|
-          fe.dir_offset = os.pos()
-          os.write_long(0) # for now
-          os.write_string(fe.file_name)
-        end
-
-        # Open the files and copy their data into the stream.
-        # Remember the locations of each file's data section.
-        @file_entries.each do |fe|
-          fe.data_offset = os.pos()
-          copy_file(fe, os)
-        end
-
-        # Write the data offsets into the directory of the compound stream
-        @file_entries.each do |fe|
-          os.seek(fe.dir_offset)
-          os.write_long(fe.data_offset)
-        end
-
-        # Close the output stream. Set the os to nil before trying to
-        # close so that if an exception occurs during the close, the
-        # finally clause below will not attempt to close the stream
-        # the second time.
-        tmp = os
-        os = nil
-        tmp.close()
-
-      ensure
-        if (os != nil)
-          begin
-            os.close()
-          rescue
-          end
-        end
-      end
-    end
-
-    private
-
-    # Internal class for holding a file
-    FileEntry = Struct.new(:file_name, :dir_offset, :data_offset)
-
-    # Copy the contents of the file with specified extension into the
-    # provided output stream. Use a buffer for moving data
-    # to reduce memory allocation.
-    def copy_file(source, os)
-      is = nil
-      begin
-        start_ptr = os.pos()
-
-        is = @directory.open_input(source.file_name)
-        remainder = length = is.length
-
-        buffer = Ferret::Store::BUFFER.clone
-        while (remainder > 0)
-          len = [remainder, Ferret::Store::BUFFER_SIZE].min
-          is.read_bytes(buffer, 0, len)
-          os.write_bytes(buffer, len)
-          remainder -= len
-        end
-
-        # Verify that remainder is 0
-        if (remainder != 0)
-          raise(IOError,
-                "Non-zero remainder length after copying: #{remainder} " +
-                "(id: #{source.file_name}, length: #{length}, buffer size: " +
-                " #{Ferret::Store::BUFFER_SIZE})")
-        end
-
-        # Verify that the output length diff is equal to original file
-        end_ptr = os.pos()
-        diff = end_ptr - start_ptr
-        if (diff != length)
-          raise(IOError,
-                "Difference in the output file offsets #{diff}" +
-                " does not match the original file length #{length}")
-        end
-
-      ensure
-        if (is != nil): is.close() end
-      end
-    end
-  end
-end
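The format comment in the removed CompoundFileWriter describes the on-disk layout: a file count, then a directory of offset/name pairs written with placeholder offsets, then the raw data sections, with the offsets patched in a second pass. The sketch below is illustrative only, not Ferret's implementation: it substitutes fixed-width big-endian integers and length-prefixed names for Ferret's VInt/long/UTF-string encodings, so it is not byte-compatible with a real compound file, but the write-then-patch scheme and the reader's "entry length = next offset - own offset" trick are the same.

    # Minimal sketch of the compound-file layout, assuming plain
    # fixed-width encodings (NOT byte-compatible with Ferret).
    def write_compound(path, files)
      File.open(path, "wb") do |os|
        os.write([files.size].pack("N"))          # file count
        dir_offsets = {}
        files.each_key do |name|                  # directory entries
          dir_offsets[name] = os.pos
          os.write([0].pack("Q>"))                # data offset, patched below
          os.write([name.bytesize].pack("N") + name)
        end
        data_offsets = {}
        files.each do |name, data|                # raw data sections
          data_offsets[name] = os.pos
          os.write(data)
        end
        dir_offsets.each do |name, dir_pos|       # second pass: fix offsets
          os.seek(dir_pos)
          os.write([data_offsets[name]].pack("Q>"))
        end
      end
    end

    def read_compound(path)
      File.open(path, "rb") do |is|
        count = is.read(4).unpack1("N")
        entries = Array.new(count) do
          offset = is.read(8).unpack1("Q>")
          name = is.read(is.read(4).unpack1("N"))
          [name, offset]
        end
        # as in CompoundFileReader: each entry's length is the next entry's
        # offset minus its own; the last entry runs to the end of the stream
        total = File.size(path)
        entries.each_with_index.map do |(name, offset), i|
          stop = (i + 1 < count) ? entries[i + 1][1] : total
          is.seek(offset)
          [name, is.read(stop - offset)]
        end.to_h
      end
    end

    write_compound("demo.cfs", "_1.frq" => "freq data", "_1.prx" => "prox data")
    p read_compound("demo.cfs")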
data/lib/ferret/index/document_writer.rb
@@ -1,289 +0,0 @@
-require 'ferret/search/similarity'
-
-module Ferret::Index
-
-  class DocumentWriter
-    # If non-nil, a message will be printed to this if max_field_length is
-    # reached.
-    attr_writer :info_stream
-
-    # directory::           The directory to write the document information to
-    # analyzer::            The analyzer to use for the document
-    # similarity::          The Similarity function writer.similarity
-    # max_field_length::    The maximum number of tokens a field may have
-    #                       writer.max_field_length
-    # term_index_interval:: The interval of terms in the index
-    #                       writer.max_field_length
-    def initialize(directory,
-                   analyzer,
-                   similarity,
-                   max_field_length,
-                   term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
-      @directory = directory
-      @analyzer = analyzer
-      @similarity = similarity
-      @max_field_length = max_field_length
-      @term_index_interval = term_index_interval
-
-      # Keys are Terms, values are Postings.
-      # Used to buffer a document before it is written to the index.
-      @posting_table = {}
-
-      @term_buffer = Term.new("", "")
-    end
-
-    def add_document(segment, doc)
-
-      # write field names
-      @field_infos = FieldInfos.new()
-      @field_infos << doc
-      @field_infos.write_to_dir(@directory, segment + ".fnm")
-
-      # write field values
-      fields_writer = FieldsWriter.new(@directory, segment, @field_infos)
-      begin
-        fields_writer.add_document(doc)
-      ensure
-        fields_writer.close()
-      end
-
-      # invert doc into posting_table
-      @posting_table.clear(); # clear posting_table
-      arr_size = @field_infos.size
-      @field_lengths = Array.new(arr_size, 0)        # init field_lengths
-      @field_positions = Array.new(arr_size, 0)      # init field_positions
-      @field_offsets = Array.new(arr_size, 0)        # init field_offsets
-      @field_boosts = Array.new(arr_size, doc.boost) # init field_boosts
-
-      invert_document(doc)
-
-      # sort posting_table into an array
-      postings = sort_posting_table()
-
-      # for (int i = 0; i < postings.length; i += 1)
-      #   Posting posting = postings[i]
-      #   print(posting.term)
-      #   print(" freq=" + posting.freq)
-      #   print(" pos=")
-      #   print(posting.positions[0])
-      #   for (int j = 1; j < posting.freq; j += 1)
-      #     print("," + posting.positions[j])
-      #   puts("")
-      # end
-
-      # write postings
-      write_postings(postings, segment)
-
-      # write norms of indexed fields
-      write_norms(segment)
-
-    end
-
-    private
-
-    # Tokenizes the fields of a document into Postings.
-    def invert_document(doc)
-
-      fields = doc.all_fields
-      fields.each do |field|
-        field_name = field.name
-        field_info = @field_infos[field_name]
-        field_number = field_info.number
-
-        length = @field_lengths[field_number]     # length of field
-        position = @field_positions[field_number] # position in field
-        position += @analyzer.pos_inc_gap(field_name) if length > 0
-        offset = @field_offsets[field_number]     # offset field
-
-        if field_info.indexed?
-          if not field.tokenized? # un-tokenized field
-            string_value = field.string_value
-            if field_info.store_offsets?
-              add_position(field_name,
-                           string_value,
-                           position,
-                           TermVectorOffsetInfo.new(offset,
-                                                    offset + string_value.length))
-              position += 1
-            else
-              add_position(field_name, string_value, position, nil)
-              position += 1
-            end
-            offset += string_value.length()
-            length += 1
-          else
-
-            reader = field.reader_value()
-
-            # Tokenize field and add to posting_table
-            stream = @analyzer.token_stream(field_name, reader)
-            begin
-              last_token = nil
-              while token = stream.next
-                position += (token.pos_inc - 1)
-
-                if(field_info.store_offsets?())
-                  add_position(field_name,
-                               token.text(),
-                               position,
-                               TermVectorOffsetInfo.new(
-                                 offset + token.start_offset(),
-                                 offset + token.end_offset()))
-                  position += 1
-                else
-                  add_position(field_name, token.text(), position, nil)
-                  position += 1
-                end
-
-                last_token = token
-                length += 1
-                if (length > @max_field_length)
-                  if @info_stream
-                    @info_stream.puts("max_field_length " + @max_field_length.to_s + " reached, ignoring following tokens")
-                  end
-                  break
-                end
-              end
-
-              if(last_token != nil)
-                offset += last_token.end_offset() + 1
-              end
-
-            ensure
-              stream.close()
-            end
-          end
-
-          @field_lengths[field_number] = length     # save field length
-          @field_positions[field_number] = position # save field position
-          @field_boosts[field_number] *= field.boost
-          @field_offsets[field_number] = offset
-        end
-      end
-    end
-
-
-    def add_position(field, text, position, tv_offset_info)
-      @term_buffer.set!(field, text)
-      #puts("Offset: " + tv_offset_info)
-      posting = @posting_table[@term_buffer]
-      if (posting != nil)                      # word seen before
-        freq = posting.freq
-        posting.positions[freq] = position     # add new position
-        posting.offsets[freq] = tv_offset_info # add new position
-
-        if (tv_offset_info != nil)
-          posting.offsets[freq] = tv_offset_info
-        end
-        posting.freq = freq + 1                # update frequency
-      else # word not seen before
-        term = Term.new(field, text)
-        @posting_table[term] = Posting.new(term, position, tv_offset_info)
-      end
-    end
-
-    def sort_posting_table()
-      # copy @posting_table into an array
-      return @posting_table.values.sort { |x,y| x.term <=> y.term }
-    end
-
-    def write_postings(postings, segment)
-
-      freq = nil
-      prox = nil
-      tis_writer = nil
-      tv_writer = nil
-      begin
-        #open files for inverse index storage
-        freq = @directory.create_output(segment + ".frq")
-        prox = @directory.create_output(segment + ".prx")
-        tis_writer = TermInfosWriter.new(@directory, segment, @field_infos,
-                                         @term_index_interval)
-        ti = TermInfo.new()
-        current_field = nil
-
-        postings.each do |posting|
-          # add an entry to the dictionary with pointers to prox and freq files
-          ti.set_values!(1, freq.pos(), prox.pos(), -1)
-          tis_writer.add(posting.term, ti)
-
-          # add an entry to the freq file
-          posting_freq = posting.freq
-          if (posting_freq == 1)          # optimize freq=1
-            freq.write_vint(1)            # set low bit of doc num.
-          else
-            freq.write_vint(0)            # the document number
-            freq.write_vint(posting_freq) # frequency in doc
-          end
-
-          last_position = 0 # write positions
-          posting.positions.each do |position|
-            prox.write_vint(position - last_position)
-            last_position = position
-          end
-          # check to see if we switched to a new field
-          term_field = posting.term.field
-          if (current_field != term_field)
-            # changing field - see if there is something to save
-            current_field = term_field
-            fi = @field_infos[current_field]
-            if (fi.store_term_vector?)
-              if tv_writer.nil?
-                tv_writer = TermVectorsWriter.new(@directory, segment, @field_infos)
-                tv_writer.open_document()
-              end
-              tv_writer.open_field(current_field)
-
-            elsif not tv_writer.nil?
-              tv_writer.close_field()
-            end
-          end
-          if not tv_writer.nil? and tv_writer.field_open?
-            tv_writer.add_term(posting.term.text, posting_freq, posting.positions, posting.offsets)
-          end
-        end
-        if not tv_writer.nil?
-          tv_writer.close_document()
-        end
-      ensure
-        # make an effort to close all streams we can but remember and re-raise
-        # the last exception encountered in this process
-        keep = nil
-        [freq, prox, tis_writer, tv_writer].compact.each do |obj|
-          begin
-            obj.close
-          rescue IOError => e
-            keep = e
-          end
-        end
-        raise keep if not keep.nil?
-      end
-    end
-
-    def write_norms(segment)
-      @field_infos.each_with_index do |fi, i|
-        if fi.indexed? and not fi.omit_norms?
-          norm = @field_boosts[i] * @similarity.length_norm(fi.name, @field_lengths[i])
-          norms = @directory.create_output(segment + ".f" + i.to_s)
-          begin
-            norms.write_byte(Ferret::Search::Similarity.encode_norm(norm))
-          ensure
-            norms.close()
-          end
-        end
-      end
-    end
-
-  end
-
-  class Posting # info about a Term in a doc
-    attr_accessor :term, :freq, :positions, :offsets
-
-    def initialize(t, position, offset)
-      @term = t
-      @freq = 1
-      @positions = [position]
-      @offsets = [offset]
-    end
-  end
-end
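The inversion step that DocumentWriter performed (and which moved into the C extension in 0.10.0) reduces to a small amount of Ruby. The sketch below is a simplified illustration, not the real implementation: it tokenizes with a bare regexp instead of an Analyzer token stream and ignores boosts, offsets, and position increments, but it shows the central structure — a posting table keyed by (field, term) that accumulates frequency and positions, sorted by term before the postings would be written.

    # Simplified sketch of invert_document/add_position: buffer one
    # Posting per distinct (field, term) pair, then sort by term as
    # sort_posting_table did. Tokenization is a plain regexp here; the
    # real writer used an Analyzer's TokenStream.
    Posting = Struct.new(:term, :freq, :positions)

    def invert(doc)
      posting_table = {}
      doc.each do |field, text|
        text.downcase.scan(/\w+/).each_with_index do |word, position|
          term = [field.to_s, word]
          if (posting = posting_table[term]) # term seen before in this doc
            posting.positions << position
            posting.freq += 1
          else                               # first occurrence
            posting_table[term] = Posting.new(term, 1, [position])
          end
        end
      end
      posting_table.values.sort_by(&:term)  # sort_posting_table equivalent
    end

    postings = invert(:title => "ferret search", :body => "fast search, ferret fast")
    postings.each do |p|
      puts "#{p.term.inspect} freq=#{p.freq} positions=#{p.positions.inspect}"
    end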