ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/lib/ferret/index/index.rb
DELETED
@@ -1,675 +0,0 @@
|
|
1
|
-
require 'monitor'
|
2
|
-
|
3
|
-
module Ferret::Index
|
4
|
-
# This is a simplified interface to the index. See the TUTORIAL for more
|
5
|
-
# information on how to use this class.
|
6
|
-
class Index
|
7
|
-
include MonitorMixin
|
8
|
-
|
9
|
-
include Ferret::Store
|
10
|
-
include Ferret::Search
|
11
|
-
include Ferret::Document
|
12
|
-
|
13
|
-
# If you create an Index without any options, it'll simply create an index
|
14
|
-
# in memory. But this class is highly configurable and every option that
|
15
|
-
# you can supply to IndexWriter and QueryParser, you can also set here.
|
16
|
-
#
|
17
|
-
# === Options
|
18
|
-
#
|
19
|
-
# path:: A string representing the path to the index
|
20
|
-
# directory. If you are creating the index for the
|
21
|
-
# first time the directory will be created if it's
|
22
|
-
# missing. You should not choose a directory which
|
23
|
-
# contains other files.
|
24
|
-
# create_if_missing:: Create the index if no index is found in the
|
25
|
-
# specified directory. Otherwise, use the existing
|
26
|
-
# index. This defaults to true and has no effect on
|
27
|
-
# in memory indexes.
|
28
|
-
# create:: Creates the index, even if one already exists.
|
29
|
-
# That means any existing index will be deleted.
|
30
|
-
# This option defaults to false and has no effect
|
31
|
-
# for in memory indexes. It is probably better to
|
32
|
-
# use the create_if_missing option.
|
33
|
-
# default_field:: This specifies the default field that will be
|
34
|
-
# used when you add a simple string to the index
|
35
|
-
# using #add_document or <<. This will also be used
|
36
|
-
# for default_search_field unless you set it
|
37
|
-
# explicitly. The default for this value is the
|
38
|
-
# string "id".
|
39
|
-
# id_field:: This field is used as the field to search when doing
|
40
|
-
# searches on a term. For example, if you do a
|
41
|
-
# lookup by term "cat", ie index["cat"], this will
|
42
|
-
# be the field that is searched. This will default
|
43
|
-
# to default_field if not set.
|
44
|
-
# default_search_field:: This specifies the field or fields that will be
|
45
|
-
# searched by the query parser. You can use a
|
46
|
-
# string to specify one field, eg, "title". Or you
|
47
|
-
# can specify multiple fields with a String -
|
48
|
-
# "title|content" - or with an Array - ["title",
|
49
|
-
# "content"]. This defaults to the value passed in
|
50
|
-
# for default_field. If default_field is nil then
|
51
|
-
# the default is "*" which signifies all fields in
|
52
|
-
# the index.
|
53
|
-
# analyzer:: Sets the default analyzer for the index. This is
|
54
|
-
# used by both the IndexWriter and the QueryParser
|
55
|
-
# to tokenize the input. The default is the
|
56
|
-
# StandardAnalyzer.
|
57
|
-
# dir:: This is a Ferret::Store::Directory object. This
|
58
|
-
# can be useful if you have an already existing
|
59
|
-
# in-memory index which you'd like to read with
|
60
|
-
# this class. If you want to create a new index,
|
61
|
-
# you are better off passing in a path.
|
62
|
-
# close_dir:: This specifies whether you want this class to
|
63
|
-
# close the index directory when this class is
|
64
|
-
# closed. This only has any meaning when you pass
|
65
|
-
# in a directory object in the *dir* option, in
|
66
|
-
# which case it defaults to false. Otherwise it is
|
67
|
-
# always true.
|
68
|
-
# occur_default:: Set to either BooleanClause::Occur::SHOULD
|
69
|
-
# (default) or BooleanClause::Occur::MUST to
|
70
|
-
# specify the default Occur operator.
|
71
|
-
# wild_lower:: Set to false if you don't want the terms in fuzzy
|
72
|
-
# and wild queries to be set to lower case. You
|
73
|
-
# should do this if your analyzer doesn't downcase.
|
74
|
-
# The default is true.
|
75
|
-
# default_slop:: Set the default slop for phrase queries. This
|
76
|
-
# defaults to 0.
|
77
|
-
# key:: Expert: This should only be used if you really
|
78
|
-
# know what you are doing. Basically you can set a
|
79
|
-
# field or an array of fields to be the key for the
|
80
|
-
# index. So if you add a document with the same key
|
81
|
-
# as an existing document, the existing document will
|
82
|
-
# be replaced by the new object. This will slow
|
83
|
-
# down indexing so it should not be used if
|
84
|
-
# performance is a concern. You must make sure that
|
85
|
-
# your key/keys are either untokenized or that they
|
86
|
-
# are not broken up by the analyzer.
|
87
|
-
# use_compound_file:: Uses a compound file to store the index. This
|
88
|
-
# prevents an error being raised for having too
|
89
|
-
# many files open at the same time. The default is
|
90
|
-
# true but performance is better if this is set to
|
91
|
-
# false.
|
92
|
-
# handle_parse_errors:: Set this to true if you want the QueryParser to
|
93
|
-
# degrade gracefully on errors. If the query parser
|
94
|
-
# fails to parse this query, it will try to parse
|
95
|
-
# it as a straight boolean query on the default
|
96
|
-
# field ignoring all query punctuation. If this
|
97
|
-
# fails, it will return an empty TermQuery. If you
|
98
|
-
# use this and you need to know why your query
|
99
|
-
# isn't working you can use the Query#to_s method
|
100
|
-
# on the query returned to see what is happening to
|
101
|
-
# your query. This defaults to true. If you set it
|
102
|
-
# to false a QueryParseException is raised on a
|
103
|
-
# query parse error.
|
104
|
-
# auto_flush:: Set this option to true if you want the index
|
105
|
-
# automatically flushed every time you do a write
|
106
|
-
# (includes delete) to the index. This is useful if
|
107
|
-
# you have multiple processes accessing the index
|
108
|
-
# and you don't want lock errors. This is set to
|
109
|
-
# false by default.
|
110
|
-
#
|
111
|
-
# Some examples;
|
112
|
-
#
|
113
|
-
# index = Index::Index.new(:analyzer => WhiteSpaceAnalyzer.new())
|
114
|
-
#
|
115
|
-
# index = Index::Index.new(:path => '/path/to/index',
|
116
|
-
# :create_if_missing => false,
|
117
|
-
# :auto_flush => true)
|
118
|
-
#
|
119
|
-
# index = Index::Index.new(:dir => directory,
|
120
|
-
# :close_dir => false,
|
121
|
-
# :default_slop => 2,
|
122
|
-
# :handle_parse_errors => false)
|
123
|
-
#
|
124
|
-
def initialize(options = {})
|
125
|
-
super()
|
126
|
-
|
127
|
-
options[:default_field] &&= options[:default_field].to_s
|
128
|
-
options[:create_if_missing] = true if options[:create_if_missing].nil?
|
129
|
-
@key = [options[:key]].flatten.map {|k| k.to_s} if options[:key]
|
130
|
-
|
131
|
-
if options[:path]
|
132
|
-
begin
|
133
|
-
@dir = FSDirectory.new(options[:path], options[:create])
|
134
|
-
rescue IOError => io
|
135
|
-
@dir = FSDirectory.new(options[:path], options[:create_if_missing])
|
136
|
-
end
|
137
|
-
options[:close_dir] = true
|
138
|
-
elsif options[:dir]
|
139
|
-
@dir = options[:dir]
|
140
|
-
else
|
141
|
-
options[:create] = true # this should always be true for a new RAMDir
|
142
|
-
@dir = RAMDirectory.new
|
143
|
-
end
|
144
|
-
|
145
|
-
@dir.synchronize do
|
146
|
-
@options = options
|
147
|
-
@writer = IndexWriter.new(@dir, options) # create the index if need be
|
148
|
-
options[:analyzer] = @analyzer = @writer.analyzer
|
149
|
-
@writer.close
|
150
|
-
@writer = nil
|
151
|
-
@has_writes = false
|
152
|
-
@reader = nil
|
153
|
-
@options.delete(:create) # only want to create the first time if at all
|
154
|
-
@close_dir = @options.delete(:close_dir) || false # we'll hold this here
|
155
|
-
@auto_flush = @options[:auto_flush] || false
|
156
|
-
@default_search_field = (@options[:default_search_field] || \
|
157
|
-
@options[:default_field] || "*")
|
158
|
-
if (@options[:id_field].nil? and
|
159
|
-
@options[:default_field].nil? and
|
160
|
-
@key and @key.size == 1)
|
161
|
-
@default_field = @key[0]
|
162
|
-
@id_field = @key[0]
|
163
|
-
else
|
164
|
-
@default_field =
|
165
|
-
(@options[:default_field] || @options[:id_field] || "id").to_s
|
166
|
-
@id_field =
|
167
|
-
(@options[:id_field] || @options[:default_field] || "id").to_s
|
168
|
-
end
|
169
|
-
@options[:handle_parse_errors] = true if @options[:handle_parse_errors].nil?
|
170
|
-
@open = true
|
171
|
-
@qp = nil
|
172
|
-
end
|
173
|
-
end
|
174
|
-
|
175
|
-
# Closes this index by closing its associated reader and writer objects.
|
176
|
-
def close
|
177
|
-
@dir.synchronize do
|
178
|
-
if not @open
|
179
|
-
raise "tried to close an already closed directory"
|
180
|
-
end
|
181
|
-
@reader.close() if @reader
|
182
|
-
@writer.close() if @writer
|
183
|
-
@dir.close()
|
184
|
-
|
185
|
-
@open = false
|
186
|
-
end
|
187
|
-
end
|
188
|
-
|
189
|
-
# Get the reader for this index.
|
190
|
-
# NOTE:: This will close the writer from this index.
|
191
|
-
def reader
|
192
|
-
ensure_reader_open()
|
193
|
-
return @reader
|
194
|
-
end
|
195
|
-
|
196
|
-
# Get the searcher for this index.
|
197
|
-
# NOTE:: This will close the writer from this index.
|
198
|
-
def searcher
|
199
|
-
ensure_searcher_open()
|
200
|
-
return @searcher
|
201
|
-
end
|
202
|
-
|
203
|
-
# Get the writer for this index.
|
204
|
-
# NOTE:: This will close the reader from this index.
|
205
|
-
def writer
|
206
|
-
ensure_writer_open()
|
207
|
-
return @writer
|
208
|
-
end
|
209
|
-
protected :reader, :writer, :searcher
|
210
|
-
|
211
|
-
# Adds a document to this index, using the provided analyzer instead of
|
212
|
-
# the local analyzer if provided. If the document contains more than
|
213
|
-
# IndexWriter::MAX_FIELD_LENGTH terms for a given field, the remainder are
|
214
|
-
# discarded.
|
215
|
-
#
|
216
|
-
# There are three ways to add a document to the index.
|
217
|
-
# To add a document you can simply add a string or an array of strings.
|
218
|
-
# This will store all the strings in the "id" field
|
219
|
-
# (unless you specify the default_field when you create the index).
|
220
|
-
#
|
221
|
-
# index << "This is a new document to be indexed"
|
222
|
-
# index << ["And here", "is another", "new document", "to be indexed"]
|
223
|
-
#
|
224
|
-
# But these are pretty simple documents. If this is all you want to index
|
225
|
-
# you could probably just use SimpleSearch. So let's give our documents
|
226
|
-
# some fields;
|
227
|
-
#
|
228
|
-
# index << {:title => "Programming Ruby", :content => "blah blah blah"}
|
229
|
-
# index << {:title => "Programming Ruby", :content => "yada yada yada"}
|
230
|
-
#
|
231
|
-
# Or if you are indexing data stored in a database, you'll probably want
|
232
|
-
# to store the id;
|
233
|
-
#
|
234
|
-
# index << {:id => row.id, :title => row.title, :date => row.date}
|
235
|
-
#
|
236
|
-
# The methods above will store all of the input data as well as tokenizing
|
237
|
-
# and indexing it. Sometimes we won't want to tokenize (divide the string
|
238
|
-
# into tokens) the data. For example, we might want to leave the title as
|
239
|
-
# a complete string and only allow searches for that complete string.
|
240
|
-
# Sometimes we won't want to store the data as it's already stored in the
|
241
|
-
# database so it'll be a waste to store it in the index. Or perhaps we are
|
242
|
-
# doing without a database and using Ferret to store all of our data, in
|
243
|
-
# which case we might not want to index it. For example, if we are storing
|
244
|
-
# images in the index, we won't want to index them. All of this can be
|
245
|
-
# done using Ferret's Ferret::Document module. eg;
|
246
|
-
#
|
247
|
-
# include Ferret::Document
|
248
|
-
# doc = Document.new
|
249
|
-
# doc << Field.new("id", row.id, Field::Store::NO, Field::Index::UNTOKENIZED)
|
250
|
-
# doc << Field.new("title", row.title, Field::Store::YES, Field::Index::UNTOKENIZED)
|
251
|
-
# doc << Field.new("data", row.data, Field::Store::YES, Field::Index::TOKENIZED)
|
252
|
-
# doc << Field.new("image", row.image, Field::Store::YES, Field::Index::NO)
|
253
|
-
# index << doc
|
254
|
-
#
|
255
|
-
# You can also compress the data that you are storing or store term
|
256
|
-
# vectors with the data. Read more about this in Ferret::Document::Field.
|
257
|
-
def add_document(doc, analyzer = nil)
|
258
|
-
@dir.synchronize do
|
259
|
-
fdoc = nil
|
260
|
-
if doc.is_a?(String)
|
261
|
-
fdoc = Document.new
|
262
|
-
fdoc << Field.new(@default_field, doc,
|
263
|
-
Field::Store::YES, Field::Index::TOKENIZED)
|
264
|
-
elsif doc.is_a?(Array)
|
265
|
-
fdoc = Document.new
|
266
|
-
doc.each() do |field|
|
267
|
-
fdoc << Field.new(@default_field, field,
|
268
|
-
Field::Store::YES, Field::Index::TOKENIZED)
|
269
|
-
end
|
270
|
-
elsif doc.is_a?(Hash)
|
271
|
-
fdoc = Document.new
|
272
|
-
doc.each_pair() do |field, text|
|
273
|
-
if @key and @key.index(field.to_s)
|
274
|
-
fdoc << Field.new(field.to_s, text.to_s,
|
275
|
-
Field::Store::YES, Field::Index::UNTOKENIZED)
|
276
|
-
else
|
277
|
-
fdoc << Field.new(field.to_s, text.to_s,
|
278
|
-
Field::Store::YES, Field::Index::TOKENIZED)
|
279
|
-
end
|
280
|
-
end
|
281
|
-
elsif doc.is_a?(Document)
|
282
|
-
fdoc = doc
|
283
|
-
else
|
284
|
-
raise ArgumentError, "Unknown document type #{doc.class}"
|
285
|
-
end
|
286
|
-
|
287
|
-
# delete existing documents with the same key
|
288
|
-
if @key
|
289
|
-
query = @key.inject(BooleanQuery.new()) do |bq, field|
|
290
|
-
bq.add_query(TermQuery.new(Term.new(field, fdoc[field])),
|
291
|
-
BooleanClause::Occur::MUST)
|
292
|
-
end
|
293
|
-
query_delete(query)
|
294
|
-
end
|
295
|
-
|
296
|
-
ensure_writer_open()
|
297
|
-
@has_writes = true
|
298
|
-
@writer.add_document(fdoc, analyzer || @writer.analyzer)
|
299
|
-
flush() if @auto_flush
|
300
|
-
end
|
301
|
-
end
|
302
|
-
alias :<< :add_document
|
303
|
-
|
304
|
-
# The main search method for the index. You need to create a query to
|
305
|
-
# pass to this method. You can also pass a hash with one or more of the
|
306
|
-
# following; {filter, num_docs, first_doc, sort}
|
307
|
-
#
|
308
|
-
# query:: The query to run on the index
|
309
|
-
# filter:: Filters docs from the search result
|
310
|
-
# first_doc:: The index in the results of the first doc retrieved.
|
311
|
-
# Default is 0
|
312
|
-
# num_docs:: The number of results returned. Default is 10
|
313
|
-
# sort:: An array of SortFields describing how to sort the results.
|
314
|
-
def search(query, options = {})
|
315
|
-
@dir.synchronize do
|
316
|
-
return do_search(query, options)
|
317
|
-
end
|
318
|
-
end
|
319
|
-
|
320
|
-
# See Index#search
|
321
|
-
#
|
322
|
-
# This method yields the doc and score for each hit.
|
323
|
-
# eg.
|
324
|
-
# index.search_each(query) do |doc, score|
|
325
|
-
# puts "hit document number #{doc} with a score of #{score}"
|
326
|
-
# end
|
327
|
-
#
|
328
|
-
# returns:: The total number of hits.
|
329
|
-
def search_each(query, options = {}) # :yield: doc, score
|
330
|
-
@dir.synchronize do
|
331
|
-
hits = do_search(query, options)
|
332
|
-
hits.score_docs.each do |score_doc|
|
333
|
-
yield score_doc.doc, score_doc.score
|
334
|
-
end
|
335
|
-
return hits.total_hits
|
336
|
-
end
|
337
|
-
end
|
338
|
-
|
339
|
-
# Retrieve the document referenced by the document number +id+, if id is
|
340
|
-
# an integer or the first document with term +id+ if +id+ is a term.
|
341
|
-
#
|
342
|
-
# id:: The number of the document to retrieve, or the term used as the id
|
343
|
-
# for the document we wish to retrieve
|
344
|
-
def doc(id)
|
345
|
-
@dir.synchronize do
|
346
|
-
ensure_reader_open()
|
347
|
-
if id.kind_of?(String) or id.kind_of?(Symbol)
|
348
|
-
t = Term.new(@id_field, id.to_s)
|
349
|
-
return @reader.get_document_with_term(t)
|
350
|
-
elsif id.is_a?(Term)
|
351
|
-
return @reader.get_document_with_term(id)
|
352
|
-
else
|
353
|
-
return @reader.get_document(id)
|
354
|
-
end
|
355
|
-
end
|
356
|
-
end
|
357
|
-
alias :[] :doc
|
358
|
-
|
359
|
-
# Delete the document referenced by the document number +id+ if +id+ is an
|
360
|
-
# integer or all of the documents which have the term +id+ if +id+ is a
|
361
|
-
# term.
|
362
|
-
#
|
363
|
-
# id:: The number of the document to delete
|
364
|
-
def delete(id)
|
365
|
-
@dir.synchronize do
|
366
|
-
cnt = 0
|
367
|
-
ensure_reader_open()
|
368
|
-
if id.is_a?(String)
|
369
|
-
t = Term.new(@id_field, id.to_s)
|
370
|
-
cnt = @reader.delete_docs_with_term(t)
|
371
|
-
elsif id.is_a?(Term)
|
372
|
-
cnt = @reader.delete_docs_with_term(id)
|
373
|
-
elsif id.is_a?(Integer)
|
374
|
-
cnt = @reader.delete(id)
|
375
|
-
else
|
376
|
-
raise ArgumentError, "Cannot delete for id of type #{id.class}"
|
377
|
-
end
|
378
|
-
flush() if @auto_flush
|
379
|
-
return cnt
|
380
|
-
end
|
381
|
-
end
|
382
|
-
|
383
|
-
# Delete all documents returned by the query.
|
384
|
-
#
|
385
|
-
# query:: The query to find documents you wish to delete. Can either be a
|
386
|
-
# string (in which case it is parsed by the standard query parser)
|
387
|
-
# or an actual query object.
|
388
|
-
def query_delete(query)
|
389
|
-
@dir.synchronize do
|
390
|
-
ensure_searcher_open()
|
391
|
-
query = process_query(query)
|
392
|
-
@searcher.search_each(query) do |doc, score|
|
393
|
-
@reader.delete(doc)
|
394
|
-
end
|
395
|
-
flush() if @auto_flush
|
396
|
-
end
|
397
|
-
end
|
398
|
-
|
399
|
-
# Returns true if document +n+ has been deleted
|
400
|
-
def deleted?(n)
|
401
|
-
@dir.synchronize do
|
402
|
-
ensure_reader_open()
|
403
|
-
return @reader.deleted?(n)
|
404
|
-
end
|
405
|
-
end
|
406
|
-
|
407
|
-
# Update the document referenced by the document number +id+ if +id+ is an
|
408
|
-
# integer or all of the documents which have the term +id+ if +id+ is a
|
409
|
-
# term.
|
410
|
-
#
|
411
|
-
# id:: The number of the document to update. Can also be a string
|
412
|
-
# representing the value in the +id+ field or a term to match.
|
413
|
-
# new_val:: The values we are updating. This can be a string in which case
|
414
|
-
# the default field is updated, or it can be a hash, in which
|
415
|
-
# case, all fields in the hash are updated. You can also pass a
|
416
|
-
# full Document object, which will completely replace the
|
417
|
-
# documents you remove.
|
418
|
-
# Updates the document(s) identified by +id+ with +new_val+.
#
# id::      a String (matched against the @id_field field through a query),
#           a Term (matched with a TermQuery) or an Integer document number.
# new_val:: a Hash of field => content, a full Document that replaces the
#           stored one, or any other value, which is written (via to_s) to
#           the default field.
#
# Raises ArgumentError when +id+ is none of the supported types.
def update(id, new_val)
  @dir.synchronize do
    case id
    when String
      query_update("#{@id_field}:#{id}", new_val)
    when Term
      query_update(TermQuery.new(id), new_val)
    when Integer
      ensure_reader_open
      replacement = doc(id)
      case new_val
      when Hash
        new_val.each_pair { |field, value| replacement[field] = value.to_s }
      when Ferret::Document::Document
        replacement = new_val
      else
        replacement[@options[:default_field]] = new_val.to_s
      end
      # Delete through the reader first; opening the writer afterwards
      # closes that reader.
      @reader.delete(id)
      ensure_writer_open
      @writer.add_document(replacement)
    else
      raise ArgumentError, "Cannot update for id of type #{id.class}"
    end
    flush if @auto_flush
  end
end
|
443
|
-
|
444
|
-
# Update all the documents returned by the query.
|
445
|
-
#
|
446
|
-
# query:: The query to find documents you wish to update. Can either be
|
447
|
-
# a string (in which case it is parsed by the standard query
|
448
|
-
# parser) or an actual query object.
|
449
|
-
# new_val:: The values we are updating. This can be a string in which case
|
450
|
-
# the default field is updated, or it can be a hash, in which
|
451
|
-
# case, all fields in the hash are updated. You can also pass a
|
452
|
-
# full Document object, which will completely replace the
|
453
|
-
# documents you remove. You should be careful when passing a
|
454
|
-
# whole document to be sure that your query will return one and
|
455
|
-
# only one result.
|
456
|
-
# Applies +new_val+ to every document matched by +query+.
#
# query::   a query string (parsed by process_query) or a query object.
# new_val:: a Hash of field => content, a full Document that replaces each
#           hit, or any other value, which is written (via to_s) to the
#           default field.
def query_update(query, new_val)
  @dir.synchronize do
    ensure_searcher_open
    pending = []
    parsed = process_query(query)
    @searcher.search_each(parsed) do |doc_num, _score|
      updated = doc(doc_num)
      case new_val
      when Hash
        new_val.each_pair { |field, value| updated[field] = value.to_s }
      when Document
        updated = new_val
      else
        updated[@options[:default_field]] = new_val.to_s
      end
      pending << updated
      @reader.delete(doc_num)
    end
    # The updated documents are queued and only written once all deletes
    # are done, because opening the writer closes the reader.
    ensure_writer_open
    pending.each { |updated| @writer.add_document(updated) }
    flush if @auto_flush
  end
end
|
480
|
-
|
481
|
-
# Returns true if any documents have been deleted since the index was last
|
482
|
-
# flushed.
|
483
|
-
# Passes on the reader's answer to has_deletions?, after making sure a
# reader is open.
def has_deletions?
  answer = nil
  @dir.synchronize do
    ensure_reader_open
    answer = @reader.has_deletions?
  end
  answer
end
|
489
|
-
|
490
|
-
# Returns true if any documents have been added to the index since the
|
491
|
-
# last flush.
|
492
|
-
# Exposes the @has_writes flag (flush resets it to false).
def has_writes?
  @has_writes
end
|
495
|
-
|
496
|
-
# Flushes all writes to the index. This will not optimize the index but it
|
497
|
-
# will make sure that all writes are written to it.
|
498
|
-
#
|
499
|
-
# NOTE: this is not necessary if you are only using this class. All writes
|
500
|
-
# will automatically flush when you perform an operation that reads the
|
501
|
-
# index.
|
502
|
-
# Closes whichever of the reader and writer is open and resets the cached
# reader, writer, searcher and the has_writes flag.
def flush
  @dir.synchronize do
    # Reader is closed before the writer, matching the established order.
    [@reader, @writer].each { |handle| handle.close if handle }
    @reader = @writer = @searcher = nil
    @has_writes = false
  end
end
|
512
|
-
|
513
|
-
# optimizes the index. This should only be called when the index will no
|
514
|
-
# longer be updated very often, but will be read a lot.
|
515
|
-
# Runs the writer's optimize step and then flushes, so the reader and
# writer are closed afterwards.
def optimize
  @dir.synchronize do
    ensure_writer_open
    @writer.optimize
    flush
  end
end
|
522
|
-
|
523
|
-
# returns the number of documents in the index
|
524
|
-
# Number of documents in the index, as reported by the reader's num_docs.
def size
  count = nil
  @dir.synchronize do
    ensure_reader_open
    count = @reader.num_docs
  end
  count
end
|
530
|
-
|
531
|
-
# Merges all segments from an index or an array of indexes into this
|
532
|
-
# index. You can pass a single Index::Index, Index::Reader,
|
533
|
-
# Store::Directory or an array of any single one of these.
|
534
|
-
#
|
535
|
-
# This may be used to parallelize batch indexing. A large document
|
536
|
-
# collection can be broken into sub-collections. Each sub-collection can
|
537
|
-
# be indexed in parallel, on a different thread, process or machine and
|
538
|
-
# perhaps all in memory. The complete index can then be created by
|
539
|
-
# merging sub-collection indexes with this method.
|
540
|
-
#
|
541
|
-
# After this completes, the index is optimized.
|
542
|
-
# Merges other indexes into this one.
#
# indexes:: a single Index, IndexReader or Store::Directory, or an array
#           of them. Only the first element is inspected to decide how the
#           whole list is handled.
#
# Raises ArgumentError when the first element is of an unsupported type.
def add_indexes(indexes)
  @dir.synchronize do
    sources = [indexes].flatten # always work on a flat array
    return if sources.empty?    # nothing to merge

    # Index objects are merged through their readers.
    sources = sources.map { |index| index.reader } if sources[0].is_a?(Index)

    head = sources[0]
    if head.is_a?(IndexReader)
      ensure_reader_open
      sources.delete(@reader) # never merge with our own reader
      ensure_writer_open
      @writer.add_indexes_readers(sources)
    elsif head.is_a?(Ferret::Store::Directory)
      sources.delete(@dir) # never merge with our own directory
      ensure_writer_open
      @writer.add_indexes(sources)
    else
      raise ArgumentError, "Unknown index type when trying to merge indexes"
    end
  end
end
|
565
|
-
|
566
|
-
# This is a simple utility method for saving an in memory or RAM index to
|
567
|
-
# the file system. The same thing can be achieved by using the
|
568
|
-
# Index::Index#add_indexes method and you will have more options when
|
569
|
-
# creating the new index, however this is a simple way to turn a RAM index
|
570
|
-
# into a file system index.
|
571
|
-
#
|
572
|
-
# directory:: This can either be a Store::Directory object or a string
|
573
|
-
# representing the path to the directory where you would
|
574
|
-
# like to store the index.
|
575
|
-
#
|
576
|
-
# create:: True if you'd like to create the directory if it doesn't
|
577
|
-
# exist or copy over an existing directory. False if you'd
|
578
|
-
# like to merge with the existing directory. This defaults to
|
579
|
-
# true.
|
580
|
-
# Re-homes this index in +directory+ and merges the previous directory's
# contents into it.
#
# directory:: a Store::Directory object, or a String path from which an
#             FSDirectory is created.
# create::    passed to FSDirectory.new when +directory+ is a String.
#             Defaults to true.
#
# Raises ArgumentError when +directory+ is neither a String nor a
# Store::Directory. Previously such an argument left @dir untouched and the
# index was merged into itself.
def persist(directory, create = true)
  synchronize do
    # Validate before flushing so a bad argument has no side effects.
    unless directory.is_a?(String) || directory.is_a?(Ferret::Store::Directory)
      raise ArgumentError, "Cannot persist to directory of type #{directory.class}"
    end

    flush
    old_dir = @dir
    if directory.is_a?(String)
      @dir = FSDirectory.new(directory, create)
      # We created this directory object ourselves, so flag it for closing.
      @options[:close_dir] = true
    else
      @dir = directory
    end
    ensure_writer_open
    # Persisting onto the directory already in use must not merge the index
    # with itself (add_indexes guards against the same thing).
    @writer.add_indexes([old_dir]) unless old_dir.equal?(@dir)
  end
end
|
594
|
-
|
595
|
-
# Builds a string with one line per non-deleted document, using each
# document's own to_s.
#
# NOTE(review): document numbers are walked from 0 up to size, which is the
# reader's num_docs. If num_docs excludes deleted documents, the highest
# document numbers would be skipped once deletions exist. Confirm.
def to_s
  buf = ""
  size.times do |doc_num|
    next if deleted?(doc_num)
    buf << self[doc_num].to_s << "\n"
  end
  buf
end
|
602
|
-
|
603
|
-
# Returns an Explanation that describes how +doc+ scored against
|
604
|
-
# +query+.
|
605
|
-
#
|
606
|
-
# This is intended to be used in developing Similarity implementations,
|
607
|
-
# and, for good performance, should not be displayed with every hit.
|
608
|
-
# Computing an explanation is as expensive as executing the query over the
|
609
|
-
# entire index.
|
610
|
-
# Asks the searcher to explain how document +doc+ scored against +query+.
#
# query:: a query string (parsed by process_query) or a query object.
# doc::   the document number to explain.
def explain(query, doc)
  explanation = nil
  synchronize do
    ensure_searcher_open
    parsed = process_query(query)
    explanation = @searcher.explain(parsed, doc)
  end
  explanation
end
|
618
|
-
|
619
|
-
protected
|
620
|
-
# Guarantees that @writer is set, opening an IndexWriter on @dir if needed.
# An open reader is closed first, and the searcher built on it is dropped.
#
# Raises RuntimeError when the index has been closed.
def ensure_writer_open
  raise "tried to use a closed index" unless @open
  return if @writer

  if @reader
    @reader.close
    @reader = @searcher = nil
  end
  @writer = IndexWriter.new(@dir, @options)
end
|
630
|
-
|
631
|
-
# returns the new reader if one is opened
|
632
|
-
# Guarantees that @reader is open and current.
#
# * With no reader, any open writer is closed and a reader is opened.
# * With a reader whose latest? is false, a fresh reader replaces it.
# * Otherwise nothing changes.
#
# Whenever a reader is (re)opened, @searcher is reset to nil. A searcher
# wraps the reader it was built from, so keeping it would leave searches
# running against the replaced reader: a later ensure_searcher_open would
# see "no reopen needed" and a non-nil searcher, and keep the stale one.
#
# Returns the newly opened reader, or false if the existing reader is still
# the latest. Raises RuntimeError when the index has been closed.
#
# NOTE(review): the replaced stale reader is not closed here, as before;
# confirm whether it needs an explicit close.
def ensure_reader_open
  raise "tried to use a closed index" unless @open

  if @reader
    return false if @reader.latest?
  elsif @writer
    @writer.close
    @writer = nil
  end

  @searcher = nil
  @reader = IndexReader.open(@dir, false)
end
|
647
|
-
|
648
|
-
# Guarantees that @searcher exists and wraps the current reader. A new
# IndexSearcher is built when the reader was just (re)opened or when no
# searcher exists yet.
#
# Raises RuntimeError when the index has been closed.
def ensure_searcher_open
  raise "tried to use a closed index" unless @open

  reopened = ensure_reader_open
  @searcher = IndexSearcher.new(@reader) if reopened || !@searcher
end
|
654
|
-
|
655
|
-
private
|
656
|
-
# Runs +query+ against the searcher and hands back whatever it returns.
#
# query::   a query string (parsed by process_query) or a query object.
# options:: passed through unchanged to the searcher's search call.
def do_search(query, options)
  ensure_searcher_open
  parsed = process_query(query)
  @searcher.search(parsed, options)
end
|
662
|
-
|
663
|
-
# Turns a query string into a query object; anything that is not a String
# is returned untouched.
#
# The query parser is created on first use and cached in @qp.
def process_query(query)
  return query unless query.is_a?(String)

  @qp = Ferret::QueryParser.new(@default_search_field, @options) if @qp.nil?
  # The field list is refreshed on every call, in case a new field has been
  # added to the index since the last parse.
  @qp.fields = @reader.get_field_names.to_a
  @qp.parse(query)
end
|
674
|
-
end
|
675
|
-
end
|