ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
@@ -1,33 +0,0 @@
|
|
1
|
-
module Ferret
|
2
|
-
module Index
|
3
|
-
# Useful constants representing filenames and extensions used by lucene
|
4
|
-
class IndexFileNames
|
5
|
-
|
6
|
-
# Name of the index segment file
|
7
|
-
SEGMENTS = "segments"
|
8
|
-
|
9
|
-
# Name of the index deletable file
|
10
|
-
DELETABLE = "deletable"
|
11
|
-
|
12
|
-
# This array contains all filename extensions used by Lucene's index files, with
|
13
|
-
# one exception, namely the extension made up from +.f+ + a number.
|
14
|
-
# Also note that two of Lucene's files (+deletable+ and
|
15
|
-
# +segments+) don't have any filename extension.
|
16
|
-
INDEX_EXTENSIONS = [
|
17
|
-
"cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
|
18
|
-
"tvx", "tvd", "tvf", "tvp"
|
19
|
-
]
|
20
|
-
|
21
|
-
# File extensions of old-style index files
|
22
|
-
COMPOUND_EXTENSIONS = [
|
23
|
-
"fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
|
24
|
-
]
|
25
|
-
|
26
|
-
# File extensions for term vector support
|
27
|
-
VECTOR_EXTENSIONS = [
|
28
|
-
"tvx", "tvd", "tvf"
|
29
|
-
]
|
30
|
-
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
@@ -1,503 +0,0 @@
|
|
1
|
-
require 'monitor'
|
2
|
-
|
3
|
-
module Ferret::Index
|
4
|
-
# IndexReader is an abstract class, providing an interface for accessing an
|
5
|
-
# index. Search of an index is done entirely through this abstract interface,
|
6
|
-
# so that any class which implements it is searchable.
|
7
|
-
#
|
8
|
-
# Concrete subclasses of IndexReader are usually constructed with a call to
|
9
|
-
# one of the static <tt>open()</tt> methods, e.g. <tt>#open</tt>.
|
10
|
-
#
|
11
|
-
# For efficiency, in this API documents are often referred to via
|
12
|
-
# _document numbers_, non-negative integers which each name a unique
|
13
|
-
# document in the index. These document numbers are ephemeral, ie they may change
|
14
|
-
# as documents are added to and deleted from an index. Clients should thus not
|
15
|
-
# rely on a given document having the same number between sessions.
|
16
|
-
#
|
17
|
-
# An IndexReader can be opened on a directory for which an IndexWriter is
|
18
|
-
# opened already, but it cannot be used to delete documents from the index then.
|
19
|
-
class IndexReader
|
20
|
-
include MonitorMixin
|
21
|
-
|
22
|
-
# This array contains all filename extensions used by Lucene's index files, with
|
23
|
-
# one exception, namely the extension made up from +.f+ + a number.
|
24
|
-
# Also note that two of Lucene's files (+deletable+ and
|
25
|
-
# +segments+) don't have any filename extension.
|
26
|
-
FILENAME_EXTENSIONS = ["cfs",
|
27
|
-
"fnm",
|
28
|
-
"fdx",
|
29
|
-
"fdt",
|
30
|
-
"tii",
|
31
|
-
"tis",
|
32
|
-
"frq",
|
33
|
-
"prx",
|
34
|
-
"del",
|
35
|
-
"tvx",
|
36
|
-
"tvd",
|
37
|
-
"tvf",
|
38
|
-
"tvp"]
|
39
|
-
|
40
|
-
attr_reader :directory
|
41
|
-
|
42
|
-
class FieldOption < Ferret::Utils::Parameter
|
43
|
-
# all fields
|
44
|
-
ALL = FieldOption.new("ALL")
|
45
|
-
# all indexed fields
|
46
|
-
INDEXED = FieldOption.new("INDEXED")
|
47
|
-
# all fields which are not indexed
|
48
|
-
UNINDEXED = FieldOption.new("UNINDEXED")
|
49
|
-
# all fields which are indexed with termvectors enabled
|
50
|
-
INDEXED_WITH_TERM_VECTOR = FieldOption.new("INDEXED_WITH_TERM_VECTOR")
|
51
|
-
# all fields which are indexed but don't have termvectors enabled
|
52
|
-
INDEXED_NO_TERM_VECTOR = FieldOption.new("INDEXED_NO_TERM_VECTOR")
|
53
|
-
# all fields where termvectors are enabled. Please note that only standard
|
54
|
-
# termvector fields are returned
|
55
|
-
TERM_VECTOR = FieldOption.new("TERM_VECTOR")
|
56
|
-
# all fields with termvectors with positions enabled
|
57
|
-
TERM_VECTOR_WITH_POSITION = FieldOption.new("TERM_VECTOR_WITH_POSITION")
|
58
|
-
# all fields where termvectors with offset position are set
|
59
|
-
TERM_VECTOR_WITH_OFFSET = FieldOption.new("TERM_VECTOR_WITH_OFFSET")
|
60
|
-
# all fields where termvectors with offset and position values set
|
61
|
-
TERM_VECTOR_WITH_POSITION_OFFSET =
|
62
|
-
FieldOption.new("TERM_VECTOR_WITH_POSITION_OFFSET")
|
63
|
-
end
|
64
|
-
|
65
|
-
# To create an IndexReader use the IndexReader.open method. This method
|
66
|
-
# should only be used by subclasses.
|
67
|
-
#
|
68
|
-
# directory:: Directory where IndexReader files reside.
|
69
|
-
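# segment_infos:: SegmentInfos for the index; its version is compared when the write lock is acquired and it is written back on commit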
|
70
|
-
# close_directory:: close the directory when the index reader is closed
|
71
|
-
def initialize(directory, segment_infos = nil,
|
72
|
-
close_directory = false, directory_owner = false)
|
73
|
-
super()
|
74
|
-
@directory = directory
|
75
|
-
@close_directory = close_directory
|
76
|
-
@segment_infos = segment_infos
|
77
|
-
@directory_owner = directory_owner
|
78
|
-
|
79
|
-
@has_changes = false
|
80
|
-
@stale = false
|
81
|
-
@write_lock = nil
|
82
|
-
|
83
|
-
#ObjectSpace.define_finalizer(self, lambda { |id| @write_lock.release() if @write_lock})
|
84
|
-
end
|
85
|
-
|
86
|
-
# Returns an index reader to read the index in the directory
|
87
|
-
#
|
88
|
-
# directory:: This can either be a Directory object or you can pass
|
89
|
-
# nil (RamDirectory is created) or a path (FSDirectory
|
90
|
-
# is created). If you chose the second or third options,
|
91
|
-
# you should leave close_directory as true and infos as
|
92
|
-
# nil.
|
93
|
-
# close_directory:: True if you want the IndexReader to close the
|
94
|
-
# directory when the IndexReader is closed. You'll want
|
95
|
-
# to set this to false if other objects are using the
|
96
|
-
# same directory object.
|
97
|
-
# infos:: Expert: This can be used to read a different version
|
98
|
-
# of the index but should really be left alone.
|
99
|
-
def IndexReader.open(directory, close_directory = true, infos = nil)
|
100
|
-
if directory.nil?
|
101
|
-
directory = Ferret::Store::RAMDirectory.new
|
102
|
-
elsif directory.is_a?(String)
|
103
|
-
directory = Ferret::Store::FSDirectory.new(directory, false)
|
104
|
-
end
|
105
|
-
directory.synchronize do # in- & inter-process sync
|
106
|
-
commit_lock = directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
|
107
|
-
commit_lock.while_locked() do
|
108
|
-
if infos.nil?
|
109
|
-
infos = SegmentInfos.new()
|
110
|
-
infos.read(directory)
|
111
|
-
end
|
112
|
-
if (infos.size() == 1) # index is optimized
|
113
|
-
return SegmentReader.get(infos[0], infos, close_directory)
|
114
|
-
end
|
115
|
-
readers = Array.new(infos.size)
|
116
|
-
infos.size.times do |i|
|
117
|
-
readers[i] = SegmentReader.get(infos[i])
|
118
|
-
end
|
119
|
-
return MultiReader.new(readers, directory, infos, close_directory)
|
120
|
-
end
|
121
|
-
end
|
122
|
-
end
|
123
|
-
|
124
|
-
# Reads version number from segments files. The version number counts the
|
125
|
-
# number of changes of the index.
|
126
|
-
#
|
127
|
-
# directory:: where the index resides.
|
128
|
-
# returns:: version number.
|
129
|
-
# raises:: IOError if segments file cannot be read.
|
130
|
-
def IndexReader.get_current_version(directory)
|
131
|
-
return SegmentInfos.read_current_version(directory)
|
132
|
-
end
|
133
|
-
|
134
|
-
# Return an array of term vectors for the specified document. The array
|
135
|
-
# contains a vector for each vectorized field in the document. Each vector
|
136
|
-
# contains terms and frequencies for all terms in a given vectorized field.
|
137
|
-
# If no such fields existed, the method returns nil. The term vectors that
|
138
|
-
# are returned may either be of type TermFreqVector or of type
|
139
|
-
# TermDocPosEnumVector if positions or offsets have been stored.
|
140
|
-
#
|
141
|
-
# doc_number:: document for which term vectors are returned
|
142
|
-
# returns:: array of term vectors. May be nil if no term vectors have been
|
143
|
-
# stored for the specified document.
|
144
|
-
# raises:: IOError if index cannot be accessed
|
145
|
-
#
|
146
|
-
# See Field::TermVector
|
147
|
-
def get_term_vectors(doc_number)
|
148
|
-
raise NotImplementedError
|
149
|
-
end
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
# Return a term vector for the specified document and field. The returned
|
154
|
-
# vector contains terms and frequencies for the terms in the specified
|
155
|
-
# field of this document, if the field had the storeTermVector flag set. If
|
156
|
-
# termvectors had been stored with positions or offsets, a
|
157
|
-
# TermDocPosEnumVector is returned.
|
158
|
-
#
|
159
|
-
# doc_number:: document for which the term vector is returned
|
160
|
-
# field:: field for which the term vector is returned.
|
161
|
-
# returns:: term vector May be nil if field does not exist in the specified
|
162
|
-
# document or term vector was not stored.
|
163
|
-
# raises:: IOError if index cannot be accessed
|
164
|
-
# See Field::TermVector
|
165
|
-
def get_term_vector(doc_number, field)
|
166
|
-
raise NotImplementedError
|
167
|
-
end
|
168
|
-
|
169
|
-
|
170
|
-
# Returns +true+ if an index exists at the specified directory. If the
|
171
|
-
# directory does not exist or if there is no index in it, +false+ is returned.
|
172
|
-
#
|
173
|
-
# directory:: the directory to check for an index
|
174
|
-
# returns:: +true+ if an index exists; +false+ otherwise
|
175
|
-
# raises:: IOError if there is a problem with accessing the index
|
176
|
-
def IndexReader.index_exists?(directory)
|
177
|
-
return directory.exists?("segments")
|
178
|
-
end
|
179
|
-
|
180
|
-
# Returns the number of documents in this index.
|
181
|
-
def num_docs()
|
182
|
-
raise NotImplementedError
|
183
|
-
end
|
184
|
-
|
185
|
-
# Returns one greater than the largest possible document number.
|
186
|
-
#
|
187
|
-
# This may be used to, e.g., determine how big to allocate an array which
|
188
|
-
# will have an element for every document number in an index.
|
189
|
-
def max_doc()
|
190
|
-
raise NotImplementedError
|
191
|
-
end
|
192
|
-
|
193
|
-
# Returns the stored fields of the +n+<sup>th</sup>
|
194
|
-
# +Document+ in this index.
|
195
|
-
def get_document(n)
|
196
|
-
raise NotImplementedError
|
197
|
-
end
|
198
|
-
|
199
|
-
# Returns the first document with the term +term+. This is useful, for
|
200
|
-
# example, if we are indexing rows from a database. We can store the id of
|
201
|
-
# each row in a field in the index and use this method to get the document by
|
202
|
-
# the id. Hence, only one document is returned.
|
203
|
-
#
|
204
|
-
# term: The term we are searching for.
|
205
|
-
def get_document_with_term(term)
|
206
|
-
docs = term_docs_for(term)
|
207
|
-
if (docs == nil) then return nil end
|
208
|
-
document = nil
|
209
|
-
begin
|
210
|
-
document = get_document(docs.doc) if docs.next?
|
211
|
-
ensure
|
212
|
-
docs.close()
|
213
|
-
end
|
214
|
-
return document
|
215
|
-
end
|
216
|
-
|
217
|
-
# Returns true if document _n_ has been deleted
|
218
|
-
def deleted?(n)
|
219
|
-
raise NotImplementedError
|
220
|
-
end
|
221
|
-
|
222
|
-
# Returns true if any documents have been deleted
|
223
|
-
def has_deletions?()
|
224
|
-
raise NotImplementedError
|
225
|
-
end
|
226
|
-
|
227
|
-
# Returns true if there are norms stored for this field.
|
228
|
-
def has_norms?(field)
|
229
|
-
# backward compatible implementation.
|
230
|
-
# SegmentReader has an efficient implementation.
|
231
|
-
return (get_norms(field) != nil)
|
232
|
-
end
|
233
|
-
|
234
|
-
# Returns the byte-encoded normalization factor for the named field of
|
235
|
-
# every document. This is used by the search code to score documents.
|
236
|
-
#
|
237
|
-
# See Field#boost
|
238
|
-
def get_norms(field)
|
239
|
-
raise NotImplementedError
|
240
|
-
end
|
241
|
-
|
242
|
-
# Read norms into a pre-allocated array. This is used as an optimization
|
243
|
-
# of get_norms.
|
244
|
-
#
|
245
|
-
# See Field#boost
|
246
|
-
def get_norms_into(field, bytes, offset)
|
247
|
-
raise NotImplementedError
|
248
|
-
end
|
249
|
-
|
250
|
-
# Expert: Resets the normalization factor for the named field of the named
|
251
|
-
# document. The norm represents the product of the field's Field#boost and
|
252
|
-
# its Similarity#length_norm length normalization. Thus, to preserve the
|
253
|
-
# length normalization values when resetting this, one should base the new
|
254
|
-
# value upon the old.
|
255
|
-
#
|
256
|
-
# See #get_norms
|
257
|
-
# See Similarity#decode_norm
|
258
|
-
def set_norm(doc, field, value)
|
259
|
-
synchronize do
|
260
|
-
value = Similarity.encode_norm(value) if value.is_a? Float
|
261
|
-
if(@directory_owner)
|
262
|
-
acquire_write_lock()
|
263
|
-
end
|
264
|
-
do_set_norm(doc, field, value)
|
265
|
-
@has_changes = true
|
266
|
-
end
|
267
|
-
end
|
268
|
-
|
269
|
-
# Implements set_norm in subclass.
|
270
|
-
def do_set_norm(doc, field, value)
|
271
|
-
raise NotImplementedError
|
272
|
-
end
|
273
|
-
|
274
|
-
# Returns an enumeration of all the terms in the index.
|
275
|
-
# Each term is greater than all that precede it in the enumeration.
|
276
|
-
def terms()
|
277
|
-
raise NotImplementedError
|
278
|
-
end
|
279
|
-
|
280
|
-
# Returns an enumeration of all terms after a given term.
|
281
|
-
#
|
282
|
-
# Each term is greater than all that precede it in the enumeration.
|
283
|
-
def terms_from(t)
|
284
|
-
raise NotImplementedError
|
285
|
-
end
|
286
|
-
|
287
|
-
# Returns the number of documents containing the term +t+.
|
288
|
-
def doc_freq(t)
|
289
|
-
raise NotImplementedError
|
290
|
-
end
|
291
|
-
|
292
|
-
# Returns an enumeration of all the documents which contain +term+. For each
|
293
|
-
# document, the document number, the frequency of the term in that document
|
294
|
-
# is also provided, for use in search scoring. Thus, this method implements
|
295
|
-
# the mapping:
|
296
|
-
#
|
297
|
-
# Term => <doc_num, freq><sup>*</sup>
|
298
|
-
#
|
299
|
-
# The enumeration is ordered by document number. Each document number is
|
300
|
-
# greater than all that precede it in the enumeration.
|
301
|
-
def term_docs_for(term)
|
302
|
-
term_docs = term_docs()
|
303
|
-
term_docs.seek(term)
|
304
|
-
return term_docs
|
305
|
-
end
|
306
|
-
|
307
|
-
# Returns an unpositioned TermDocEnum enumerator.
|
308
|
-
def term_docs()
|
309
|
-
raise NotImplementedError
|
310
|
-
end
|
311
|
-
|
312
|
-
# Returns an enumeration of all the documents which contain
|
313
|
-
# +term+. For each document, in addition to the document number
|
314
|
-
# and frequency of the term in that document, a list of all of the ordinal
|
315
|
-
# positions of the term in the document is available. Thus, this method
|
316
|
-
# implements the mapping:
|
317
|
-
#
|
318
|
-
# Term => <doc_num, freq, < pos<sub>1</sub>, pos<sub>2</sub>, ...
|
319
|
-
# pos<sub>freq-1</sub> > > <sup>*</sup>
|
320
|
-
#
|
321
|
-
# This positional information facilitates phrase and proximity searching.
|
322
|
-
# The enumeration is ordered by document number. Each document number is
|
323
|
-
# greater than all that precede it in the enumeration.
|
324
|
-
def term_positions_for(term)
|
325
|
-
term_positions = term_positions()
|
326
|
-
term_positions.seek(term)
|
327
|
-
return term_positions
|
328
|
-
end
|
329
|
-
|
330
|
-
# Returns an unpositioned TermDocPosEnum enumerator.
|
331
|
-
def term_positions()
|
332
|
-
raise NotImplementedError
|
333
|
-
end
|
334
|
-
|
335
|
-
# Tries to acquire the WriteLock on this directory.
|
336
|
-
#
|
337
|
-
# This method is only valid if this IndexReader is directory owner.
|
338
|
-
#
|
339
|
-
# raises:: IOError If WriteLock cannot be acquired.
|
340
|
-
def acquire_write_lock()
|
341
|
-
if @stale
|
342
|
-
raise IOError, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations"
|
343
|
-
end
|
344
|
-
|
345
|
-
if (@write_lock == nil)
|
346
|
-
@write_lock = @directory.make_lock(IndexWriter::WRITE_LOCK_NAME)
|
347
|
-
if not @write_lock.obtain(IndexWriter::WRITE_LOCK_TIMEOUT) # obtain write lock
|
348
|
-
raise IOError, "Index locked for write: " + @write_lock
|
349
|
-
end
|
350
|
-
|
351
|
-
# we have to check whether index has changed since this reader was opened.
|
352
|
-
# if so, this reader is no longer valid for deletion
|
353
|
-
if (SegmentInfos.read_current_version(@directory) > @segment_infos.version())
|
354
|
-
@stale = true
|
355
|
-
@write_lock.release()
|
356
|
-
@write_lock = nil
|
357
|
-
raise IOError, "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations"
|
358
|
-
end
|
359
|
-
end
|
360
|
-
end
|
361
|
-
|
362
|
-
# Returns true if the reader is reading from the latest version of the
|
363
|
-
# index.
|
364
|
-
def latest?()
|
365
|
-
SegmentInfos.read_current_version(@directory) == @segment_infos.version()
|
366
|
-
end
|
367
|
-
|
368
|
-
# Deletes the document numbered +doc_num+. Once a document is deleted it
|
369
|
-
# will not appear in TermDocEnum or TermDocPosEnum enumerations. Attempts to
|
370
|
-
# read its fields with the #get_document method will result in an error.
|
371
|
-
# The presence of this document may still be reflected in the
|
372
|
-
# #doc_freq statistic, though this will be corrected eventually as the
|
373
|
-
# index is further modified.
|
374
|
-
def delete(doc_num)
|
375
|
-
synchronize do
|
376
|
-
acquire_write_lock() if @directory_owner
|
377
|
-
do_delete(doc_num)
|
378
|
-
@has_changes = true
|
379
|
-
end
|
380
|
-
return 1
|
381
|
-
end
|
382
|
-
|
383
|
-
# Implements deletion of the document numbered +doc_num+.
|
384
|
-
# Applications should call #delete or #delete_docs_with_term.
|
385
|
-
def do_delete(doc_num)
|
386
|
-
raise NotImplementedError
|
387
|
-
end
|
388
|
-
|
389
|
-
# Deletes all documents containing +term+.
|
390
|
-
# This is useful if one uses a document field to hold a unique ID string for
|
391
|
-
# the document. Then to delete such a document, one merely constructs a
|
392
|
-
# term with the appropriate field and the unique ID string as its text and
|
393
|
-
# passes it to this method. Returns the number of documents deleted. See
|
394
|
-
# #delete for information about when this deletion will become effective.
|
395
|
-
def delete_docs_with_term(term)
|
396
|
-
docs = term_docs_for(term)
|
397
|
-
if (docs == nil) then return 0 end
|
398
|
-
n = 0
|
399
|
-
begin
|
400
|
-
while (docs.next?)
|
401
|
-
delete(docs.doc)
|
402
|
-
n += 1
|
403
|
-
end
|
404
|
-
ensure
|
405
|
-
docs.close()
|
406
|
-
end
|
407
|
-
return n
|
408
|
-
end
|
409
|
-
|
410
|
-
# Undeletes all documents currently marked as deleted in this index.
|
411
|
-
def undelete_all()
|
412
|
-
synchronize do
|
413
|
-
acquire_write_lock() if @directory_owner
|
414
|
-
do_undelete_all()
|
415
|
-
@has_changes = true
|
416
|
-
end
|
417
|
-
end
|
418
|
-
|
419
|
-
# Commit changes resulting from delete, undelete_all, or set_norm operations
|
420
|
-
#
|
421
|
-
# raises:: IOError
|
422
|
-
def commit()
|
423
|
-
synchronize do
|
424
|
-
if @has_changes
|
425
|
-
if @directory_owner
|
426
|
-
@directory.synchronize do # in- & inter-process sync
|
427
|
-
commit_lock = @directory.make_lock(IndexWriter::COMMIT_LOCK_NAME)
|
428
|
-
commit_lock.while_locked do
|
429
|
-
do_commit()
|
430
|
-
@segment_infos.write(@directory)
|
431
|
-
end
|
432
|
-
end
|
433
|
-
if (@write_lock != nil)
|
434
|
-
@write_lock.release() # release write lock
|
435
|
-
@write_lock = nil
|
436
|
-
end
|
437
|
-
else
|
438
|
-
do_commit()
|
439
|
-
end
|
440
|
-
end
|
441
|
-
@has_changes = false
|
442
|
-
end
|
443
|
-
end
|
444
|
-
|
445
|
-
# Closes files associated with this index.
|
446
|
-
# Also saves any new deletions to disk.
|
447
|
-
# No other methods should be called after this has been called.
|
448
|
-
def close()
|
449
|
-
synchronize do
|
450
|
-
commit()
|
451
|
-
do_close()
|
452
|
-
@directory.close() if @close_directory
|
453
|
-
end
|
454
|
-
end
|
455
|
-
|
456
|
-
protected
|
457
|
-
|
458
|
-
# Implements actual undelete_all() in subclass.
|
459
|
-
def do_undelete_all()
|
460
|
-
raise NotImplementedError
|
461
|
-
end
|
462
|
-
|
463
|
-
# Implements commit.
|
464
|
-
def do_commit()
|
465
|
-
raise NotImplementedError
|
466
|
-
end
|
467
|
-
|
468
|
-
|
469
|
-
# Implements close.
|
470
|
-
def do_close()
|
471
|
-
raise NotImplementedError
|
472
|
-
end
|
473
|
-
|
474
|
-
# Get a list of unique field names that exist in this index and have the
|
475
|
-
# specified field option information.
|
476
|
-
# fld_option:: specifies which field option should be available for the
|
477
|
-
# returned fields
|
478
|
-
# returns:: Collection of Strings indicating the names of the fields.
|
479
|
-
# See IndexReader.FieldOption
|
480
|
-
def get_field_names()
|
481
|
-
raise NotImplementedError
|
482
|
-
end
|
483
|
-
|
484
|
-
# Returns +true+ iff the index in the named directory is
|
485
|
-
# currently locked.
|
486
|
-
# directory:: the directory to check for a lock
|
487
|
-
# raises:: IOError if there is a problem with accessing the index
|
488
|
-
def IndexReader.locked?(directory)
|
489
|
-
return (directory.make_lock(IndexWriter::WRITE_LOCK_NAME).locked? or
|
490
|
-
directory.make_lock(IndexWriter::COMMIT_LOCK_NAME).locked?)
|
491
|
-
end
|
492
|
-
|
493
|
-
# Forcibly unlocks the index in the named directory.
|
494
|
-
#
|
495
|
-
# Caution: this should only be used by failure recovery code,
|
496
|
-
# when it is known that no other process nor thread is in fact
|
497
|
-
# currently accessing this index.
|
498
|
-
def IndexReader.unlock(directory)
|
499
|
-
directory.make_lock(IndexWriter::WRITE_LOCK_NAME).release
|
500
|
-
directory.make_lock(IndexWriter::COMMIT_LOCK_NAME).release
|
501
|
-
end
|
502
|
-
end
|
503
|
-
end
|