ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,338 +0,0 @@
|
|
1
|
-
require 'monitor'
|
2
|
-
|
3
|
-
module Ferret::Index
|
4
|
-
|
5
|
-
# Class for accessing a compound stream.
|
6
|
-
# This class implements a directory, but is limited to only read operations.
|
7
|
-
# Directory methods that would normally modify data raise.
|
8
|
-
class CompoundFileReader < Ferret::Store::Directory
|
9
|
-
|
10
|
-
include MonitorMixin
|
11
|
-
|
12
|
-
attr_reader :directory, :file_name
|
13
|
-
|
14
|
-
# Creates a Compound File Reader which contains a single file and has
|
15
|
-
# pointers to the individual files within. When it is initialized, the
|
16
|
-
# compound file is set and the header is read so that it is ready to read
|
17
|
-
# the individual files within.
|
18
|
-
def initialize(dir, name)
|
19
|
-
|
20
|
-
super()
|
21
|
-
|
22
|
-
@directory = dir
|
23
|
-
@file_name = name
|
24
|
-
@entries = {}
|
25
|
-
|
26
|
-
success = false
|
27
|
-
|
28
|
-
begin
|
29
|
-
@stream = dir.open_input(name)
|
30
|
-
|
31
|
-
# read the directory and init files
|
32
|
-
count = @stream.read_vint()
|
33
|
-
entry = nil
|
34
|
-
count.times() do
|
35
|
-
offset = @stream.read_long()
|
36
|
-
id = @stream.read_string()
|
37
|
-
|
38
|
-
if (entry != nil)
|
39
|
-
# set length of the previous entry
|
40
|
-
entry.length = offset - entry.offset
|
41
|
-
end
|
42
|
-
|
43
|
-
entry = FileEntry.new(offset)
|
44
|
-
@entries[id] = entry
|
45
|
-
end
|
46
|
-
|
47
|
-
# set the length of the final entry
|
48
|
-
if (entry != nil)
|
49
|
-
entry.length = @stream.length() - entry.offset
|
50
|
-
end
|
51
|
-
|
52
|
-
success = true
|
53
|
-
|
54
|
-
ensure
|
55
|
-
|
56
|
-
if not success and (@stream != nil)
|
57
|
-
begin
|
58
|
-
@stream.close()
|
59
|
-
rescue IOError
|
60
|
-
end
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
def close()
|
66
|
-
synchronize do
|
67
|
-
if (@stream == nil): raise(IOError, "Already closed") end
|
68
|
-
|
69
|
-
@entries.clear()
|
70
|
-
@stream.close()
|
71
|
-
@stream = nil
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
def open_input(id)
|
76
|
-
synchronize do
|
77
|
-
if (@stream == nil)
|
78
|
-
raise(IOError, "Stream closed")
|
79
|
-
end
|
80
|
-
|
81
|
-
entry = @entries[id]
|
82
|
-
if (entry == nil)
|
83
|
-
raise(IOError, "No sub-file with id " + id + " found")
|
84
|
-
end
|
85
|
-
return CSIndexInput.new(@stream, entry.offset, entry.length)
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
# Returns an array of strings, one for each file in the directory.
|
90
|
-
def list()
|
91
|
-
return @entries.keys()
|
92
|
-
end
|
93
|
-
|
94
|
-
# Returns true iff a file with the given name exists.
|
95
|
-
def exists?(name)
|
96
|
-
return @entries.key?(name)
|
97
|
-
end
|
98
|
-
|
99
|
-
# Returns the time the named file was last modified.
|
100
|
-
def modified(name)
|
101
|
-
return @directory.modified(@file_name)
|
102
|
-
end
|
103
|
-
|
104
|
-
# Set the modified time of an existing file to now.
|
105
|
-
def touch(name)
|
106
|
-
@directory.touch(@file_name)
|
107
|
-
end
|
108
|
-
|
109
|
-
# Not implemented
|
110
|
-
def remove(name) raise(NotImplementedError) end
|
111
|
-
|
112
|
-
# Not implemented
|
113
|
-
def rename(from, to) raise(NotImplementedError) end
|
114
|
-
|
115
|
-
# Returns the length of a file in the directory.
|
116
|
-
def length(name)
|
117
|
-
e = @entries[name]
|
118
|
-
if (e == nil): raise(IOError, "File " + name + " does not exist") end
|
119
|
-
return e.length
|
120
|
-
end
|
121
|
-
|
122
|
-
# Not implemented
|
123
|
-
def create_output(name) raise(NotImplementedError) end
|
124
|
-
|
125
|
-
# Not implemented
|
126
|
-
def make_lock(name) raise(NotImplementedError) end
|
127
|
-
|
128
|
-
# Implementation of an IndexInput that reads from a portion of the
|
129
|
-
# compound file.
|
130
|
-
class CSIndexInput < Ferret::Store::BufferedIndexInput
|
131
|
-
attr_reader :length
|
132
|
-
|
133
|
-
def initialize(base, file_offset, length)
|
134
|
-
super()
|
135
|
-
@base = base
|
136
|
-
@base.extend(MonitorMixin)
|
137
|
-
@file_offset = file_offset
|
138
|
-
@length = length
|
139
|
-
end
|
140
|
-
|
141
|
-
# Closes the stream to further operations.
|
142
|
-
def close() end
|
143
|
-
|
144
|
-
private
|
145
|
-
# Expert: implements buffer refill. Reads bytes from the current
|
146
|
-
# position in the input.
|
147
|
-
#
|
148
|
-
# b:: the array to read bytes into
|
149
|
-
# offset:: the offset in the array to start storing bytes
|
150
|
-
# len:: the number of bytes to read
|
151
|
-
def read_internal(b, offset, len)
|
152
|
-
@base.synchronize() do
|
153
|
-
start = pos()
|
154
|
-
if(start + len > @length): raise(EOFError, "read past EOF") end
|
155
|
-
@base.seek(@file_offset + start)
|
156
|
-
@base.read_bytes(b, offset, len)
|
157
|
-
end
|
158
|
-
end
|
159
|
-
|
160
|
-
# Expert: implements seek. Sets current position in @file, where
|
161
|
-
# the next {@link #read_internal(byte[],int,int)} will occur.
|
162
|
-
def seek_internal(pos) end
|
163
|
-
end
|
164
|
-
|
165
|
-
private
|
166
|
-
# Base info
|
167
|
-
class FileEntry
|
168
|
-
attr_accessor :offset, :length
|
169
|
-
def initialize(offset)
|
170
|
-
@offset = offset
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
end
|
175
|
-
|
176
|
-
# Combines multiple files into a single compound file.
|
177
|
-
# The file format:
|
178
|
-
#
|
179
|
-
# * VInt fileCount
|
180
|
-
# * {Directory} fileCount entries with the following structure:
|
181
|
-
# + long data_offset
|
182
|
-
# + UTFString extension
|
183
|
-
# * {File Data} fileCount entries with the raw data of the corresponding file
|
184
|
-
#
|
185
|
-
# The fileCount integer indicates how many files are contained in this compound
|
186
|
-
# file. The {directory} that follows has that many entries. Each directory entry
|
187
|
-
# contains an encoding identifier, a long pointer to the start of this file's
|
188
|
-
# data section, and a UTF String with that file's extension.
|
189
|
-
class CompoundFileWriter
|
190
|
-
|
191
|
-
class StateError < Exception
|
192
|
-
end
|
193
|
-
|
194
|
-
attr_reader :directory, :file_name
|
195
|
-
|
196
|
-
# Create the compound stream in the specified file. The file name is the
|
197
|
-
# entire name (no extensions are added).
|
198
|
-
def initialize(dir, name)
|
199
|
-
@directory = dir
|
200
|
-
@file_name = name
|
201
|
-
@ids = Set.new
|
202
|
-
@file_entries = []
|
203
|
-
@merged = false
|
204
|
-
end
|
205
|
-
|
206
|
-
# Add a source stream. _file_name_ is the string by which the
|
207
|
-
# sub-stream will be known in the compound stream.
|
208
|
-
#
|
209
|
-
# Raises:: StateError if this writer is closed
|
210
|
-
# Raises:: ArgumentError if a file with the same name
|
211
|
-
# has been added already
|
212
|
-
def add_file(file_name)
|
213
|
-
if @merged
|
214
|
-
raise(StateError, "Can't add extensions after merge has been called")
|
215
|
-
end
|
216
|
-
|
217
|
-
if not @ids.add?(file_name)
|
218
|
-
raise(ArgumentError, "File #{file_name} already added")
|
219
|
-
end
|
220
|
-
|
221
|
-
entry = FileEntry.new(file_name)
|
222
|
-
@file_entries << entry
|
223
|
-
end
|
224
|
-
|
225
|
-
# Merge files with the extensions added up to now.
|
226
|
-
# All files with these extensions are combined sequentially into the
|
227
|
-
# compound stream. After successful merge, the source files
|
228
|
-
# are deleted.
|
229
|
-
#
|
230
|
-
# Throws:: StateException if close() had been called before or
|
231
|
-
# if no file has been added to this object
|
232
|
-
def close()
|
233
|
-
|
234
|
-
if @merged
|
235
|
-
raise(StateException, "Merge already performed")
|
236
|
-
end
|
237
|
-
|
238
|
-
if @file_entries.empty?
|
239
|
-
raise(StateException, "No entries to merge have been defined")
|
240
|
-
end
|
241
|
-
|
242
|
-
@merged = true
|
243
|
-
|
244
|
-
# open the compound stream
|
245
|
-
os = nil
|
246
|
-
begin
|
247
|
-
os = @directory.create_output(@file_name)
|
248
|
-
|
249
|
-
# Write the number of entries
|
250
|
-
os.write_vint(@file_entries.size)
|
251
|
-
|
252
|
-
# Write the directory with all offsets at 0.
|
253
|
-
# Remember the positions of directory entries so that we can
|
254
|
-
# adjust the offsets later
|
255
|
-
@file_entries.each do |fe|
|
256
|
-
fe.dir_offset = os.pos()
|
257
|
-
os.write_long(0) # for now
|
258
|
-
os.write_string(fe.file_name)
|
259
|
-
end
|
260
|
-
|
261
|
-
# Open the files and copy their data into the stream.
|
262
|
-
# Remember the locations of each file's data section.
|
263
|
-
@file_entries.each do |fe|
|
264
|
-
fe.data_offset = os.pos()
|
265
|
-
copy_file(fe, os)
|
266
|
-
end
|
267
|
-
|
268
|
-
# Write the data offsets into the directory of the compound stream
|
269
|
-
@file_entries.each do |fe|
|
270
|
-
os.seek(fe.dir_offset)
|
271
|
-
os.write_long(fe.data_offset)
|
272
|
-
end
|
273
|
-
|
274
|
-
# Close the output stream. Set the os to nil before trying to
|
275
|
-
# close so that if an exception occurs during the close, the
|
276
|
-
# finally clause below will not attempt to close the stream
|
277
|
-
# the second time.
|
278
|
-
tmp = os
|
279
|
-
os = nil
|
280
|
-
tmp.close()
|
281
|
-
|
282
|
-
ensure
|
283
|
-
if (os != nil)
|
284
|
-
begin
|
285
|
-
os.close()
|
286
|
-
rescue
|
287
|
-
end
|
288
|
-
end
|
289
|
-
end
|
290
|
-
end
|
291
|
-
|
292
|
-
private
|
293
|
-
|
294
|
-
# Internal class for holding a file
|
295
|
-
FileEntry = Struct.new(:file_name, :dir_offset, :data_offset)
|
296
|
-
|
297
|
-
# Copy the contents of the file with specified extension into the
|
298
|
-
# provided output stream. Use a buffer for moving data
|
299
|
-
# to reduce memory allocation.
|
300
|
-
def copy_file(source, os)
|
301
|
-
is = nil
|
302
|
-
begin
|
303
|
-
start_ptr = os.pos()
|
304
|
-
|
305
|
-
is = @directory.open_input(source.file_name)
|
306
|
-
remainder = length = is.length
|
307
|
-
|
308
|
-
buffer = Ferret::Store::BUFFER.clone
|
309
|
-
while (remainder > 0)
|
310
|
-
len = [remainder, Ferret::Store::BUFFER_SIZE].min
|
311
|
-
is.read_bytes(buffer, 0, len)
|
312
|
-
os.write_bytes(buffer, len)
|
313
|
-
remainder -= len
|
314
|
-
end
|
315
|
-
|
316
|
-
# Verify that remainder is 0
|
317
|
-
if (remainder != 0)
|
318
|
-
raise(IOError,
|
319
|
-
"Non-zero remainder length after copying: #{remainder} " +
|
320
|
-
"(id: #{source.file_name}, length: #{length}, buffer size: " +
|
321
|
-
" #{Ferret::Store::BUFFER_SIZE})")
|
322
|
-
end
|
323
|
-
|
324
|
-
# Verify that the output length diff is equal to original file
|
325
|
-
end_ptr = os.pos()
|
326
|
-
diff = end_ptr - start_ptr
|
327
|
-
if (diff != length)
|
328
|
-
raise(IOError,
|
329
|
-
"Difference in the output file offsets #{diff}" +
|
330
|
-
" does not match the original file length #{length}")
|
331
|
-
end
|
332
|
-
|
333
|
-
ensure
|
334
|
-
if (is != nil): is.close() end
|
335
|
-
end
|
336
|
-
end
|
337
|
-
end
|
338
|
-
end
|
@@ -1,289 +0,0 @@
|
|
1
|
-
require 'ferret/search/similarity'
|
2
|
-
|
3
|
-
module Ferret::Index
|
4
|
-
|
5
|
-
class DocumentWriter
|
6
|
-
# If non-nil, a message will be printed to this if max_field_length is
|
7
|
-
# reached.
|
8
|
-
attr_writer :info_stream
|
9
|
-
|
10
|
-
# directory:: The directory to write the document information to
|
11
|
-
# analyzer:: The analyzer to use for the document
|
12
|
-
# similarity:: The Similarity function writer.similarity
|
13
|
-
# max_field_length:: The maximum number of tokens a field may have
|
14
|
-
# writer.max_field_length
|
15
|
-
# term_index_interval:: The interval of terms in the index
|
16
|
-
# writer.max_field_length
|
17
|
-
def initialize(directory,
|
18
|
-
analyzer,
|
19
|
-
similarity,
|
20
|
-
max_field_length,
|
21
|
-
term_index_interval = IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
|
22
|
-
@directory = directory
|
23
|
-
@analyzer = analyzer
|
24
|
-
@similarity = similarity
|
25
|
-
@max_field_length = max_field_length
|
26
|
-
@term_index_interval = term_index_interval
|
27
|
-
|
28
|
-
# Keys are Terms, values are Postings.
|
29
|
-
# Used to buffer a document before it is written to the index.
|
30
|
-
@posting_table = {}
|
31
|
-
|
32
|
-
@term_buffer = Term.new("", "")
|
33
|
-
end
|
34
|
-
|
35
|
-
def add_document(segment, doc)
|
36
|
-
|
37
|
-
# write field names
|
38
|
-
@field_infos = FieldInfos.new()
|
39
|
-
@field_infos << doc
|
40
|
-
@field_infos.write_to_dir(@directory, segment + ".fnm")
|
41
|
-
|
42
|
-
# write field values
|
43
|
-
fields_writer = FieldsWriter.new(@directory, segment, @field_infos)
|
44
|
-
begin
|
45
|
-
fields_writer.add_document(doc)
|
46
|
-
ensure
|
47
|
-
fields_writer.close()
|
48
|
-
end
|
49
|
-
|
50
|
-
# invert doc into posting_table
|
51
|
-
@posting_table.clear(); # clear posting_table
|
52
|
-
arr_size = @field_infos.size
|
53
|
-
@field_lengths = Array.new(arr_size, 0) # init field_lengths
|
54
|
-
@field_positions = Array.new(arr_size, 0) # init field_positions
|
55
|
-
@field_offsets = Array.new(arr_size, 0) # init field_offsets
|
56
|
-
@field_boosts = Array.new(arr_size, doc.boost) # init field_boosts
|
57
|
-
|
58
|
-
invert_document(doc)
|
59
|
-
|
60
|
-
# sort posting_table into an array
|
61
|
-
postings = sort_posting_table()
|
62
|
-
|
63
|
-
# for (int i = 0; i < postings.length; i += 1)
|
64
|
-
# Posting posting = postings[i]
|
65
|
-
# print(posting.term)
|
66
|
-
# print(" freq=" + posting.freq)
|
67
|
-
# print(" pos=")
|
68
|
-
# print(posting.positions[0])
|
69
|
-
# for (int j = 1; j < posting.freq; j += 1)
|
70
|
-
# print("," + posting.positions[j])
|
71
|
-
# puts("")
|
72
|
-
# end
|
73
|
-
|
74
|
-
# write postings
|
75
|
-
write_postings(postings, segment)
|
76
|
-
|
77
|
-
# write norms of indexed fields
|
78
|
-
write_norms(segment)
|
79
|
-
|
80
|
-
end
|
81
|
-
|
82
|
-
private
|
83
|
-
|
84
|
-
# Tokenizes the fields of a document into Postings.
|
85
|
-
def invert_document(doc)
|
86
|
-
|
87
|
-
fields = doc.all_fields
|
88
|
-
fields.each do |field|
|
89
|
-
field_name = field.name
|
90
|
-
field_info = @field_infos[field_name]
|
91
|
-
field_number = field_info.number
|
92
|
-
|
93
|
-
length = @field_lengths[field_number] # length of field
|
94
|
-
position = @field_positions[field_number] # position in field
|
95
|
-
position += @analyzer.pos_inc_gap(field_name) if length > 0
|
96
|
-
offset = @field_offsets[field_number] # offset field
|
97
|
-
|
98
|
-
if field_info.indexed?
|
99
|
-
if not field.tokenized? # un-tokenized field
|
100
|
-
string_value = field.string_value
|
101
|
-
if field_info.store_offsets?
|
102
|
-
add_position(field_name,
|
103
|
-
string_value,
|
104
|
-
position,
|
105
|
-
TermVectorOffsetInfo.new(offset,
|
106
|
-
offset + string_value.length))
|
107
|
-
position += 1
|
108
|
-
else
|
109
|
-
add_position(field_name, string_value, position, nil)
|
110
|
-
position += 1
|
111
|
-
end
|
112
|
-
offset += string_value.length()
|
113
|
-
length += 1
|
114
|
-
else
|
115
|
-
|
116
|
-
reader = field.reader_value()
|
117
|
-
|
118
|
-
# Tokenize field and add to posting_table
|
119
|
-
stream = @analyzer.token_stream(field_name, reader)
|
120
|
-
begin
|
121
|
-
last_token = nil
|
122
|
-
while token = stream.next
|
123
|
-
position += (token.pos_inc - 1)
|
124
|
-
|
125
|
-
if(field_info.store_offsets?())
|
126
|
-
add_position(field_name,
|
127
|
-
token.text(),
|
128
|
-
position,
|
129
|
-
TermVectorOffsetInfo.new(
|
130
|
-
offset + token.start_offset(),
|
131
|
-
offset + token.end_offset()))
|
132
|
-
position += 1
|
133
|
-
else
|
134
|
-
add_position(field_name, token.text(), position, nil)
|
135
|
-
position += 1
|
136
|
-
end
|
137
|
-
|
138
|
-
last_token = token
|
139
|
-
length += 1
|
140
|
-
if (length > @max_field_length)
|
141
|
-
if @info_stream
|
142
|
-
@info_stream.puts("max_field_length " + @max_field_length.to_s + " reached, ignoring following tokens")
|
143
|
-
end
|
144
|
-
break
|
145
|
-
end
|
146
|
-
end
|
147
|
-
|
148
|
-
if(last_token != nil)
|
149
|
-
offset += last_token.end_offset() + 1
|
150
|
-
end
|
151
|
-
|
152
|
-
ensure
|
153
|
-
stream.close()
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
@field_lengths[field_number] = length # save field length
|
158
|
-
@field_positions[field_number] = position # save field position
|
159
|
-
@field_boosts[field_number] *= field.boost
|
160
|
-
@field_offsets[field_number] = offset
|
161
|
-
end
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
|
-
|
166
|
-
def add_position(field, text, position, tv_offset_info)
|
167
|
-
@term_buffer.set!(field, text)
|
168
|
-
#puts("Offset: " + tv_offset_info)
|
169
|
-
posting = @posting_table[@term_buffer]
|
170
|
-
if (posting != nil) # word seen before
|
171
|
-
freq = posting.freq
|
172
|
-
posting.positions[freq] = position # add new position
|
173
|
-
posting.offsets[freq] = tv_offset_info # add new position
|
174
|
-
|
175
|
-
if (tv_offset_info != nil)
|
176
|
-
posting.offsets[freq] = tv_offset_info
|
177
|
-
end
|
178
|
-
posting.freq = freq + 1 # update frequency
|
179
|
-
else # word not seen before
|
180
|
-
term = Term.new(field, text)
|
181
|
-
@posting_table[term] = Posting.new(term, position, tv_offset_info)
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
|
-
def sort_posting_table()
|
186
|
-
# copy @posting_table into an array
|
187
|
-
return @posting_table.values.sort { |x,y| x.term <=> y.term }
|
188
|
-
end
|
189
|
-
|
190
|
-
def write_postings(postings, segment)
|
191
|
-
|
192
|
-
freq = nil
|
193
|
-
prox = nil
|
194
|
-
tis_writer = nil
|
195
|
-
tv_writer = nil
|
196
|
-
begin
|
197
|
-
#open files for inverse index storage
|
198
|
-
freq = @directory.create_output(segment + ".frq")
|
199
|
-
prox = @directory.create_output(segment + ".prx")
|
200
|
-
tis_writer = TermInfosWriter.new(@directory, segment, @field_infos,
|
201
|
-
@term_index_interval)
|
202
|
-
ti = TermInfo.new()
|
203
|
-
current_field = nil
|
204
|
-
|
205
|
-
postings.each do |posting|
|
206
|
-
# add an entry to the dictionary with pointers to prox and freq files
|
207
|
-
ti.set_values!(1, freq.pos(), prox.pos(), -1)
|
208
|
-
tis_writer.add(posting.term, ti)
|
209
|
-
|
210
|
-
# add an entry to the freq file
|
211
|
-
posting_freq = posting.freq
|
212
|
-
if (posting_freq == 1) # optimize freq=1
|
213
|
-
freq.write_vint(1) # set low bit of doc num.
|
214
|
-
else
|
215
|
-
freq.write_vint(0) # the document number
|
216
|
-
freq.write_vint(posting_freq) # frequency in doc
|
217
|
-
end
|
218
|
-
|
219
|
-
last_position = 0 # write positions
|
220
|
-
posting.positions.each do |position|
|
221
|
-
prox.write_vint(position - last_position)
|
222
|
-
last_position = position
|
223
|
-
end
|
224
|
-
# check to see if we switched to a new field
|
225
|
-
term_field = posting.term.field
|
226
|
-
if (current_field != term_field)
|
227
|
-
# changing field - see if there is something to save
|
228
|
-
current_field = term_field
|
229
|
-
fi = @field_infos[current_field]
|
230
|
-
if (fi.store_term_vector?)
|
231
|
-
if tv_writer.nil?
|
232
|
-
tv_writer = TermVectorsWriter.new(@directory, segment, @field_infos)
|
233
|
-
tv_writer.open_document()
|
234
|
-
end
|
235
|
-
tv_writer.open_field(current_field)
|
236
|
-
|
237
|
-
elsif not tv_writer.nil?
|
238
|
-
tv_writer.close_field()
|
239
|
-
end
|
240
|
-
end
|
241
|
-
if not tv_writer.nil? and tv_writer.field_open?
|
242
|
-
tv_writer.add_term(posting.term.text, posting_freq, posting.positions, posting.offsets)
|
243
|
-
end
|
244
|
-
end
|
245
|
-
if not tv_writer.nil?
|
246
|
-
tv_writer.close_document()
|
247
|
-
end
|
248
|
-
ensure
|
249
|
-
# make an effort to close all streams we can but remember and re-raise
|
250
|
-
# the last exception encountered in this process
|
251
|
-
keep = nil
|
252
|
-
[freq, prox, tis_writer, tv_writer].compact.each do |obj|
|
253
|
-
begin
|
254
|
-
obj.close
|
255
|
-
rescue IOError => e
|
256
|
-
keep = e
|
257
|
-
end
|
258
|
-
end
|
259
|
-
raise keep if not keep.nil?
|
260
|
-
end
|
261
|
-
end
|
262
|
-
|
263
|
-
def write_norms(segment)
|
264
|
-
@field_infos.each_with_index do |fi, i|
|
265
|
-
if fi.indexed? and not fi.omit_norms?
|
266
|
-
norm = @field_boosts[i] * @similarity.length_norm(fi.name, @field_lengths[i])
|
267
|
-
norms = @directory.create_output(segment + ".f" + i.to_s)
|
268
|
-
begin
|
269
|
-
norms.write_byte(Ferret::Search::Similarity.encode_norm(norm))
|
270
|
-
ensure
|
271
|
-
norms.close()
|
272
|
-
end
|
273
|
-
end
|
274
|
-
end
|
275
|
-
end
|
276
|
-
|
277
|
-
end
|
278
|
-
|
279
|
-
class Posting # info about a Term in a doc
|
280
|
-
attr_accessor :term, :freq, :positions, :offsets
|
281
|
-
|
282
|
-
def initialize(t, position, offset)
|
283
|
-
@term = t
|
284
|
-
@freq = 1
|
285
|
-
@positions = [position]
|
286
|
-
@offsets = [offset]
|
287
|
-
end
|
288
|
-
end
|
289
|
-
end
|