ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/ext/index.h
CHANGED
@@ -1,35 +1,40 @@
|
|
1
1
|
#ifndef FRT_INDEX_H
|
2
2
|
#define FRT_INDEX_H
|
3
3
|
|
4
|
-
#include <limits.h>
|
5
4
|
#include "global.h"
|
6
|
-
#include "array.h"
|
7
|
-
#include "bitvector.h"
|
8
|
-
#include "hashset.h"
|
9
|
-
#include "priorityqueue.h"
|
10
|
-
#include "hash.h"
|
11
|
-
#include "store.h"
|
12
5
|
#include "document.h"
|
13
6
|
#include "analysis.h"
|
7
|
+
#include "hash.h"
|
8
|
+
#include "hashset.h"
|
9
|
+
#include "store.h"
|
10
|
+
#include "mem_pool.h"
|
14
11
|
#include "similarity.h"
|
12
|
+
#include "bitvector.h"
|
13
|
+
#include "priorityqueue.h"
|
15
14
|
|
15
|
+
typedef struct IndexReader IndexReader;
|
16
|
+
typedef struct MultiReader MultiReader;
|
16
17
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
int min_merge_docs;
|
23
|
-
int max_merge_docs;
|
24
|
-
int max_field_length;
|
25
|
-
int term_index_interval;
|
26
|
-
} FerretConfig;
|
18
|
+
/****************************************************************************
|
19
|
+
*
|
20
|
+
* Config
|
21
|
+
*
|
22
|
+
****************************************************************************/
|
27
23
|
|
28
|
-
|
24
|
+
typedef struct Config
|
25
|
+
{
|
26
|
+
int chunk_size;
|
27
|
+
int max_buffer_memory;
|
28
|
+
int index_interval;
|
29
|
+
int skip_interval;
|
30
|
+
int merge_factor;
|
31
|
+
int max_buffered_docs;
|
32
|
+
int max_merge_docs;
|
33
|
+
int max_field_length;
|
34
|
+
bool use_compound_file;
|
35
|
+
} Config;
|
29
36
|
|
30
|
-
|
31
|
-
typedef struct IndexWriter IndexWriter;
|
32
|
-
typedef struct SegmentReader SegmentReader;
|
37
|
+
extern const Config default_config;
|
33
38
|
|
34
39
|
/***************************************************************************
|
35
40
|
*
|
@@ -38,20 +43,18 @@ typedef struct SegmentReader SegmentReader;
|
|
38
43
|
***************************************************************************/
|
39
44
|
|
40
45
|
typedef struct CacheObject {
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
46
|
+
HashTable *ref_tab1;
|
47
|
+
HashTable *ref_tab2;
|
48
|
+
void *ref1;
|
49
|
+
void *ref2;
|
50
|
+
void *obj;
|
51
|
+
void (*destroy)(void *p);
|
47
52
|
} CacheObject;
|
48
53
|
|
49
|
-
void cache_destroy(CacheObject *co);
|
50
|
-
CacheObject *co_create(
|
51
|
-
|
52
|
-
|
53
|
-
int co_eq(const void *key1, const void *key2);
|
54
|
-
HshTable *co_hsh_create();
|
54
|
+
extern void cache_destroy(CacheObject *co);
|
55
|
+
extern CacheObject *co_create(HashTable *ref_tab1, HashTable *ref_tab2,
|
56
|
+
void *ref1, void *ref2, void (*destroy)(void *p), void *obj);
|
57
|
+
extern HashTable *co_hash_create();
|
55
58
|
|
56
59
|
/****************************************************************************
|
57
60
|
*
|
@@ -59,384 +62,310 @@ HshTable *co_hsh_create();
|
|
59
62
|
*
|
60
63
|
****************************************************************************/
|
61
64
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
bool store_pos : 1;
|
69
|
-
bool omit_norms : 1;
|
70
|
-
} FieldInfo;
|
71
|
-
|
72
|
-
FieldInfo *fi_create(char *name,
|
73
|
-
int number,
|
74
|
-
bool is_indexed,
|
75
|
-
bool store_tv,
|
76
|
-
bool store_pos,
|
77
|
-
bool store_offset,
|
78
|
-
bool omit_norms);
|
79
|
-
void fi_destroy(FieldInfo *fi);
|
80
|
-
|
81
|
-
/****************************************************************************
|
82
|
-
*
|
83
|
-
* FieldInfos
|
84
|
-
*
|
85
|
-
****************************************************************************/
|
86
|
-
|
87
|
-
typedef struct FieldInfos {
|
88
|
-
HashEntry **by_name;
|
89
|
-
FieldInfo **by_number;
|
90
|
-
int fcnt;
|
91
|
-
} FieldInfos;
|
65
|
+
enum StoreValues
|
66
|
+
{
|
67
|
+
STORE_NO = 0,
|
68
|
+
STORE_YES = 1,
|
69
|
+
STORE_COMPRESS = 2
|
70
|
+
};
|
92
71
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
bool store_pos,
|
102
|
-
bool omit_norms);
|
103
|
-
|
104
|
-
void fis_add_fields(FieldInfos *fis,
|
105
|
-
HashSet *field_names,
|
106
|
-
bool is_indexed,
|
107
|
-
bool store_tv,
|
108
|
-
bool store_offset,
|
109
|
-
bool store_pos,
|
110
|
-
bool omit_norms);
|
111
|
-
bool fis_has_vectors(FieldInfos *fis);
|
112
|
-
void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext);
|
113
|
-
FieldInfos *fis_read(FieldInfos *fis, InStream *is);
|
114
|
-
FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc);
|
115
|
-
ullong fis_get_number(FieldInfos *fis, char *name);
|
116
|
-
FieldInfo *fis_get_fi(FieldInfos *fis, char *name);
|
117
|
-
bool fis_reorder_required(FieldInfos *fis, Document *doc);
|
72
|
+
enum IndexValues
|
73
|
+
{
|
74
|
+
INDEX_NO = 0,
|
75
|
+
INDEX_YES = 1,
|
76
|
+
INDEX_UNTOKENIZED = 3,
|
77
|
+
INDEX_YES_OMIT_NORMS = 5,
|
78
|
+
INDEX_UNTOKENIZED_OMIT_NORMS = 7
|
79
|
+
};
|
118
80
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
81
|
+
enum TermVectorValues
|
82
|
+
{
|
83
|
+
TERM_VECTOR_NO = 0,
|
84
|
+
TERM_VECTOR_YES = 1,
|
85
|
+
TERM_VECTOR_WITH_POSITIONS = 3,
|
86
|
+
TERM_VECTOR_WITH_OFFSETS = 5,
|
87
|
+
TERM_VECTOR_WITH_POSITIONS_OFFSETS = 7
|
88
|
+
};
|
124
89
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
90
|
+
#define FI_IS_STORED_BM 0x001
|
91
|
+
#define FI_IS_COMPRESSED_BM 0x002
|
92
|
+
#define FI_IS_INDEXED_BM 0x004
|
93
|
+
#define FI_IS_TOKENIZED_BM 0x008
|
94
|
+
#define FI_OMIT_NORMS_BM 0x010
|
95
|
+
#define FI_STORE_TERM_VECTOR_BM 0x020
|
96
|
+
#define FI_STORE_POSITIONS_BM 0x040
|
97
|
+
#define FI_STORE_OFFSETS_BM 0x080
|
98
|
+
|
99
|
+
typedef struct FieldInfo
|
100
|
+
{
|
101
|
+
char *name;
|
102
|
+
float boost;
|
103
|
+
unsigned int bits;
|
104
|
+
int number;
|
105
|
+
int ref_cnt;
|
106
|
+
} FieldInfo;
|
129
107
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
108
|
+
extern FieldInfo *fi_new(const char *name,
|
109
|
+
enum StoreValues store,
|
110
|
+
enum IndexValues index,
|
111
|
+
enum TermVectorValues term_vector);
|
112
|
+
extern char *fi_to_s(FieldInfo *fi);
|
113
|
+
extern void fi_deref(FieldInfo *fi);
|
114
|
+
|
115
|
+
#define fi_is_stored(fi) (((fi)->bits & FI_IS_STORED_BM) != 0)
|
116
|
+
#define fi_is_compressed(fi) (((fi)->bits & FI_IS_COMPRESSED_BM) != 0)
|
117
|
+
#define fi_is_indexed(fi) (((fi)->bits & FI_IS_INDEXED_BM) != 0)
|
118
|
+
#define fi_is_tokenized(fi) (((fi)->bits & FI_IS_TOKENIZED_BM) != 0)
|
119
|
+
#define fi_omit_norms(fi) (((fi)->bits & FI_OMIT_NORMS_BM) != 0)
|
120
|
+
#define fi_store_term_vector(fi) (((fi)->bits & FI_STORE_TERM_VECTOR_BM) != 0)
|
121
|
+
#define fi_store_positions(fi) (((fi)->bits & FI_STORE_POSITIONS_BM) != 0)
|
122
|
+
#define fi_store_offsets(fi) (((fi)->bits & FI_STORE_OFFSETS_BM) != 0)
|
123
|
+
#define fi_has_norms(fi)\
|
124
|
+
(((fi)->bits & (FI_OMIT_NORMS_BM|FI_IS_INDEXED_BM)) == FI_IS_INDEXED_BM)
|
138
125
|
|
139
126
|
/****************************************************************************
|
140
127
|
*
|
141
|
-
*
|
128
|
+
* FieldInfos
|
142
129
|
*
|
143
130
|
****************************************************************************/
|
144
131
|
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
132
|
+
#define FIELD_INFOS_INIT_CAPA 4
|
133
|
+
typedef struct FieldInfos
|
134
|
+
{
|
135
|
+
int store;
|
136
|
+
int index;
|
137
|
+
int term_vector;
|
138
|
+
int size;
|
139
|
+
int capa;
|
140
|
+
FieldInfo **fields;
|
141
|
+
HashTable *field_dict;
|
142
|
+
int ref_cnt;
|
143
|
+
} FieldInfos;
|
151
144
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
145
|
+
extern FieldInfos *fis_new(int store, int index, int term_vector);
|
146
|
+
extern FieldInfo *fis_add_field(FieldInfos *fis, FieldInfo *fi);
|
147
|
+
extern FieldInfo *fis_get_field(FieldInfos *fis, const char *name);
|
148
|
+
extern int fis_get_field_num(FieldInfos *fis, const char *name);
|
149
|
+
extern FieldInfo *fis_get_or_add_field(FieldInfos *fis, const char *name);
|
150
|
+
extern void fis_write(FieldInfos *fis, Store *store);
|
151
|
+
extern FieldInfos *fis_read(Store *store);
|
152
|
+
extern char *fis_to_s(FieldInfos *fis);
|
153
|
+
extern void fis_deref(FieldInfos *fis);
|
158
154
|
|
159
155
|
/****************************************************************************
|
160
156
|
*
|
161
|
-
*
|
157
|
+
* SegmentInfo
|
162
158
|
*
|
163
159
|
****************************************************************************/
|
164
160
|
|
165
|
-
|
166
|
-
struct TermEnum {
|
167
|
-
void *data;
|
168
|
-
TermBuffer *(*next)(TermEnum *te);
|
169
|
-
void (*close)(TermEnum *te);
|
170
|
-
TermEnum *(*clone)(TermEnum *te);
|
171
|
-
TermBuffer *tb_curr;
|
172
|
-
TermBuffer *tb_prev;
|
173
|
-
TermInfo *ti_curr;
|
174
|
-
};
|
175
|
-
|
176
|
-
TermBuffer *te_skip_to(struct TermEnum *te, Term *t);
|
177
|
-
|
178
|
-
Term *te_get_term(struct TermEnum *te);
|
179
|
-
TermInfo *te_get_ti(struct TermEnum *te);
|
180
|
-
|
181
|
-
/* * SegmentTermEnum * */
|
182
|
-
|
183
|
-
typedef struct SegmentTermEnum {
|
184
|
-
FieldInfos *fis;
|
185
|
-
int is_index;
|
186
|
-
InStream *is;
|
187
|
-
int size;
|
188
|
-
int pos;
|
189
|
-
int index_pointer;
|
190
|
-
int index_interval;
|
191
|
-
int skip_interval;
|
192
|
-
int format_m1skip_interval;
|
193
|
-
int format;
|
194
|
-
} SegmentTermEnum;
|
195
|
-
|
196
|
-
|
197
|
-
TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index);
|
198
|
-
TermBuffer *ste_next(struct TermEnum *te);
|
199
|
-
void ste_close(struct TermEnum *te);
|
200
|
-
|
201
|
-
/* * MultiTermEnum * */
|
161
|
+
#define SEGMENT_NAME_MAX_LENGTH 100
|
202
162
|
|
203
|
-
typedef struct
|
204
|
-
|
205
|
-
|
206
|
-
|
163
|
+
typedef struct SegmentInfo
|
164
|
+
{
|
165
|
+
char *name;
|
166
|
+
int doc_cnt;
|
167
|
+
Store *store;
|
168
|
+
} SegmentInfo;
|
207
169
|
|
208
|
-
|
170
|
+
extern SegmentInfo *si_new(char *name, int doc_cnt, Store *store);
|
171
|
+
extern void si_destroy(SegmentInfo *si);
|
172
|
+
extern bool si_has_deletions(SegmentInfo *si);
|
173
|
+
extern bool si_uses_compound_file(SegmentInfo *si);
|
174
|
+
extern bool si_has_separate_norms(SegmentInfo *si);
|
209
175
|
|
210
176
|
/****************************************************************************
|
211
177
|
*
|
212
|
-
*
|
178
|
+
* SegmentInfos
|
213
179
|
*
|
214
180
|
****************************************************************************/
|
215
181
|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
Term *last_term;
|
227
|
-
TermInfo *last_term_info;
|
228
|
-
FieldInfos *fis;
|
229
|
-
char *curr_field;
|
230
|
-
ullong curr_field_num;
|
231
|
-
} TermInfosWriter;
|
182
|
+
typedef struct SegmentInfos
|
183
|
+
{
|
184
|
+
f_u64 counter;
|
185
|
+
f_u64 version;
|
186
|
+
f_u32 format;
|
187
|
+
Store *store;
|
188
|
+
SegmentInfo **segs;
|
189
|
+
int size;
|
190
|
+
int capa;
|
191
|
+
} SegmentInfos;
|
232
192
|
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
void
|
238
|
-
void
|
193
|
+
extern SegmentInfos *sis_new();
|
194
|
+
extern SegmentInfo *sis_new_segment(SegmentInfos *sis, int dcnt, Store *store);
|
195
|
+
extern SegmentInfo *sis_add_si(SegmentInfos *sis, SegmentInfo *si);
|
196
|
+
extern void sis_del_at(SegmentInfos *sis, int at);
|
197
|
+
extern void sis_del_from_to(SegmentInfos *sis, int from, int to);
|
198
|
+
extern void sis_clear(SegmentInfos *sis);
|
199
|
+
extern SegmentInfos *sis_read(Store *store);
|
200
|
+
extern void sis_write(SegmentInfos *sis, Store *store);
|
201
|
+
extern f_u64 sis_read_current_version(Store *store);
|
202
|
+
extern void sis_destroy(SegmentInfos *sis);
|
239
203
|
|
240
204
|
/****************************************************************************
|
241
205
|
*
|
242
|
-
*
|
206
|
+
* TermInfo
|
243
207
|
*
|
244
208
|
****************************************************************************/
|
245
209
|
|
246
|
-
typedef struct
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
int skip_interval;
|
254
|
-
int index_size;
|
255
|
-
Term **index_terms;
|
256
|
-
TermInfo **index_term_infos;
|
257
|
-
int *index_pointers;
|
258
|
-
} TermInfosReader;
|
210
|
+
typedef struct TermInfo
|
211
|
+
{
|
212
|
+
int doc_freq;
|
213
|
+
off_t frq_ptr;
|
214
|
+
off_t prx_ptr;
|
215
|
+
off_t skip_offset;
|
216
|
+
} TermInfo;
|
259
217
|
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
218
|
+
#define ti_set(ti, mdf, mfp, mpp, mso) do {\
|
219
|
+
(ti).doc_freq = mdf;\
|
220
|
+
(ti).frq_ptr = mfp;\
|
221
|
+
(ti).prx_ptr = mpp;\
|
222
|
+
(ti).skip_offset = mso;\
|
223
|
+
} while (0)
|
265
224
|
|
266
225
|
/****************************************************************************
|
267
226
|
*
|
268
|
-
*
|
227
|
+
* TermEnum
|
269
228
|
*
|
270
229
|
****************************************************************************/
|
271
230
|
|
272
|
-
typedef struct
|
273
|
-
int start;
|
274
|
-
int end;
|
275
|
-
} TVOffsetInfo;
|
276
|
-
|
277
|
-
TVOffsetInfo *tvoi_create(int start, int end);
|
278
|
-
void tvoi_destroy(void *p);
|
279
|
-
|
280
|
-
/****************************************************************************
|
281
|
-
*
|
282
|
-
* TVField
|
283
|
-
*
|
284
|
-
****************************************************************************/
|
231
|
+
typedef struct TermEnum TermEnum;
|
285
232
|
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
233
|
+
struct TermEnum
|
234
|
+
{
|
235
|
+
char curr_term[MAX_WORD_SIZE];
|
236
|
+
char prev_term[MAX_WORD_SIZE];
|
237
|
+
TermInfo curr_ti;
|
238
|
+
int curr_term_len;
|
239
|
+
int field_num;
|
240
|
+
TermEnum *(*set_field)(TermEnum *te, int field_num);
|
241
|
+
char *(*next)(TermEnum *te);
|
242
|
+
char *(*skip_to)(TermEnum *te, const char *term);
|
243
|
+
void (*close)(TermEnum *te);
|
244
|
+
TermEnum *(*clone)(TermEnum *te);
|
245
|
+
};
|
292
246
|
|
293
|
-
|
294
|
-
|
247
|
+
char *te_get_term(struct TermEnum *te);
|
248
|
+
TermInfo *te_get_ti(struct TermEnum *te);
|
295
249
|
|
296
250
|
/****************************************************************************
|
297
251
|
*
|
298
|
-
*
|
252
|
+
* SegmentTermEnum
|
299
253
|
*
|
300
254
|
****************************************************************************/
|
301
255
|
|
302
|
-
|
303
|
-
int field_num;
|
304
|
-
char *text;
|
305
|
-
int freq;
|
306
|
-
int *positions;
|
307
|
-
TVOffsetInfo **offsets;
|
308
|
-
} TVTerm;
|
256
|
+
/* * SegmentTermIndex * */
|
309
257
|
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
258
|
+
typedef struct SegmentTermIndex
|
259
|
+
{
|
260
|
+
off_t index_ptr;
|
261
|
+
off_t ptr;
|
262
|
+
int index_size;
|
263
|
+
int size;
|
264
|
+
char **index_terms;
|
265
|
+
int *index_term_lens;
|
266
|
+
TermInfo *index_term_infos;
|
267
|
+
off_t *index_ptrs;
|
268
|
+
} SegmentTermIndex;
|
315
269
|
|
316
|
-
|
317
|
-
*
|
318
|
-
* TermVector
|
319
|
-
*
|
320
|
-
****************************************************************************/
|
270
|
+
/* * SegmentFieldIndex * */
|
321
271
|
|
322
|
-
typedef struct
|
323
|
-
char *field;
|
324
|
-
char **terms;
|
325
|
-
int tcnt;
|
326
|
-
int *freqs;
|
327
|
-
int **positions;
|
328
|
-
TVOffsetInfo ***offsets;
|
329
|
-
} TermVector;
|
272
|
+
typedef struct SegmentTermEnum SegmentTermEnum;
|
330
273
|
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
274
|
+
typedef struct SegmentFieldIndex
|
275
|
+
{
|
276
|
+
mutex_t mutex;
|
277
|
+
int skip_interval;
|
278
|
+
int index_interval;
|
279
|
+
off_t index_ptr;
|
280
|
+
TermEnum *index_te;
|
281
|
+
HashTable *field_dict;
|
282
|
+
} SegmentFieldIndex;
|
338
283
|
|
339
|
-
|
340
|
-
*
|
341
|
-
* TermVectorsWriter
|
342
|
-
*
|
343
|
-
****************************************************************************/
|
284
|
+
extern SegmentFieldIndex *sfi_open(Store *store, const char *segment);
|
285
|
+
extern void sfi_close(SegmentFieldIndex *sfi);
|
344
286
|
|
345
|
-
#define STORE_POSITIONS_WITH_TERMVECTOR 0x1
|
346
|
-
#define STORE_OFFSET_WITH_TERMVECTOR 0x2
|
347
287
|
|
348
|
-
|
349
|
-
|
288
|
+
/* * SegmentTermEnum * */
|
289
|
+
struct SegmentTermEnum
|
290
|
+
{
|
291
|
+
TermEnum te;
|
292
|
+
InStream *is;
|
293
|
+
int size;
|
294
|
+
int pos;
|
295
|
+
int skip_interval;
|
296
|
+
SegmentFieldIndex *sfi;
|
297
|
+
};
|
350
298
|
|
351
|
-
|
352
|
-
|
353
|
-
|
299
|
+
extern void ste_close(TermEnum *te);
|
300
|
+
extern TermEnum *ste_clone(TermEnum *te);
|
301
|
+
extern TermEnum *ste_new(InStream *is, SegmentFieldIndex *sfi);
|
354
302
|
|
355
|
-
|
356
|
-
TVField *curr_field;
|
357
|
-
int curr_doc_pointer;
|
358
|
-
OutStream *tvx;
|
359
|
-
OutStream *tvd;
|
360
|
-
OutStream *tvf;
|
361
|
-
FieldInfos *fis;
|
362
|
-
TVField **fields;
|
363
|
-
int fcnt;
|
364
|
-
int fsize;
|
365
|
-
TVTerm **terms;
|
366
|
-
int tcnt;
|
367
|
-
int tsize;
|
368
|
-
} TermVectorsWriter;
|
369
|
-
|
370
|
-
TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis);
|
371
|
-
void tvw_close(TermVectorsWriter *tvw);
|
372
|
-
void tvw_open_doc(TermVectorsWriter *tvw);
|
373
|
-
void tvw_close_doc(TermVectorsWriter *tvw);
|
374
|
-
void tvw_open_field(TermVectorsWriter *tvw, char *field);
|
375
|
-
void tvw_close_field(TermVectorsWriter *tvw);
|
376
|
-
void tvw_add_term(TermVectorsWriter *tvw, char *text, int freq, int *positions, TVOffsetInfo **offsets);
|
377
|
-
void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors);
|
303
|
+
/* * MultiTermEnum * */
|
378
304
|
|
305
|
+
extern TermEnum *mte_new(MultiReader *mr, int field_num, const char *term);
|
379
306
|
|
380
307
|
/****************************************************************************
|
381
308
|
*
|
382
|
-
*
|
309
|
+
* TermInfosReader
|
383
310
|
*
|
384
311
|
****************************************************************************/
|
385
312
|
|
386
|
-
|
387
|
-
int size;
|
388
|
-
InStream *tvx;
|
389
|
-
InStream *tvd;
|
390
|
-
InStream *tvf;
|
391
|
-
FieldInfos *fis;
|
392
|
-
int tvd_format;
|
393
|
-
int tvf_format;
|
394
|
-
} TermVectorsReader;
|
395
|
-
|
396
|
-
TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis);
|
397
|
-
TermVectorsReader *tvr_clone(TermVectorsReader *orig);
|
398
|
-
void tvr_close(TermVectorsReader *tvr);
|
399
|
-
TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
|
400
|
-
char *field, int tvf_pointer);
|
401
|
-
Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
|
402
|
-
TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
|
313
|
+
#define TE_BUCKET_INIT_CAPA 1
|
403
314
|
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
int len;
|
412
|
-
FieldInfos *fis;
|
413
|
-
InStream *fields_in;
|
414
|
-
InStream *index_in;
|
415
|
-
} FieldsReader;
|
315
|
+
typedef struct TermInfosReader
|
316
|
+
{
|
317
|
+
thread_key_t thread_te;
|
318
|
+
void **te_bucket;
|
319
|
+
TermEnum *orig_te;
|
320
|
+
int field_num;
|
321
|
+
} TermInfosReader;
|
416
322
|
|
417
|
-
|
418
|
-
|
419
|
-
|
323
|
+
extern TermInfosReader *tir_open(Store *store,
|
324
|
+
SegmentFieldIndex *sfi,
|
325
|
+
const char *segment);
|
326
|
+
extern TermInfosReader *tir_set_field(TermInfosReader *tir, int field_num);
|
327
|
+
extern TermInfo *tir_get_ti(TermInfosReader *tir, const char *term);
|
328
|
+
extern char *tir_get_term(TermInfosReader *tir, int pos);
|
329
|
+
extern void tir_close(TermInfosReader *tir);
|
420
330
|
|
421
331
|
/****************************************************************************
|
422
332
|
*
|
423
|
-
*
|
333
|
+
* TermInfosWriter
|
424
334
|
*
|
425
335
|
****************************************************************************/
|
426
336
|
|
427
|
-
#define
|
428
|
-
#define
|
429
|
-
#define FIELD_IS_COMPRESSED 0X4
|
337
|
+
#define INDEX_INTERVAL 128
|
338
|
+
#define SKIP_INTERVAL 16
|
430
339
|
|
431
|
-
typedef struct
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
340
|
+
typedef struct TermWriter
|
341
|
+
{
|
342
|
+
int counter;
|
343
|
+
const char *last_term;
|
344
|
+
TermInfo last_term_info;
|
345
|
+
OutStream *os;
|
346
|
+
} TermWriter;
|
347
|
+
|
348
|
+
typedef struct TermInfosWriter
|
349
|
+
{
|
350
|
+
int field_count;
|
351
|
+
int index_interval;
|
352
|
+
int skip_interval;
|
353
|
+
off_t last_index_ptr;
|
354
|
+
OutStream *tfx_out;
|
355
|
+
TermWriter *tix_writer;
|
356
|
+
TermWriter *tis_writer;
|
357
|
+
} TermInfosWriter;
|
436
358
|
|
437
|
-
|
438
|
-
|
439
|
-
|
359
|
+
extern TermInfosWriter *tiw_open(Store *store,
|
360
|
+
const char *segment,
|
361
|
+
int index_interval,
|
362
|
+
int skip_interval);
|
363
|
+
extern void tiw_start_field(TermInfosWriter *tiw, int field_num);
|
364
|
+
extern void tiw_add(TermInfosWriter *tiw,
|
365
|
+
const char *term,
|
366
|
+
int t_len,
|
367
|
+
TermInfo *ti);
|
368
|
+
extern void tiw_close(TermInfosWriter *tiw);
|
440
369
|
|
441
370
|
/****************************************************************************
|
442
371
|
*
|
@@ -445,9 +374,10 @@ void fw_add_doc(FieldsWriter *fw, Document *doc);
|
|
445
374
|
****************************************************************************/
|
446
375
|
|
447
376
|
typedef struct TermDocEnum TermDocEnum;
|
448
|
-
struct TermDocEnum
|
449
|
-
|
450
|
-
void (*seek)(TermDocEnum *tde,
|
377
|
+
struct TermDocEnum
|
378
|
+
{
|
379
|
+
void (*seek)(TermDocEnum *tde, int field_num, const char *term);
|
380
|
+
void (*seek_te)(TermDocEnum *tde, TermEnum *te);
|
451
381
|
int (*doc_num)(TermDocEnum *tde);
|
452
382
|
int (*freq)(TermDocEnum *tde);
|
453
383
|
bool (*next)(TermDocEnum *tde);
|
@@ -460,71 +390,72 @@ struct TermDocEnum {
|
|
460
390
|
/* * SegmentTermDocEnum * */
|
461
391
|
|
462
392
|
typedef struct SegmentTermDocEnum SegmentTermDocEnum;
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
393
|
+
struct SegmentTermDocEnum
|
394
|
+
{
|
395
|
+
TermDocEnum tde;
|
396
|
+
void (*seek_prox)(SegmentTermDocEnum *stde, int prx_ptr);
|
397
|
+
void (*skip_prox)(SegmentTermDocEnum *stde);
|
398
|
+
TermInfosReader *tir;
|
399
|
+
InStream *frq_in;
|
400
|
+
InStream *prx_in;
|
401
|
+
InStream *skip_in;
|
402
|
+
BitVector *deleted_docs;
|
467
403
|
int count; /* number of docs for this term skipped */
|
468
404
|
int doc_freq; /* number of doc this term appears in */
|
469
|
-
BitVector *deleted_docs;
|
470
405
|
int doc_num;
|
471
406
|
int freq;
|
472
|
-
int skip_interval;
|
473
407
|
int num_skips;
|
408
|
+
int skip_interval;
|
474
409
|
int skip_count;
|
475
|
-
InStream *skip_in;
|
476
410
|
int skip_doc;
|
477
|
-
int
|
478
|
-
int
|
479
|
-
int
|
480
|
-
|
481
|
-
void (*skip_prox)(SegmentTermDocEnum *stde);
|
482
|
-
InStream *prox_in;
|
483
|
-
int prox_cnt;
|
411
|
+
int frq_ptr;
|
412
|
+
int prx_ptr;
|
413
|
+
int skip_ptr;
|
414
|
+
int prx_cnt;
|
484
415
|
int position;
|
485
|
-
|
416
|
+
bool have_skipped : 1;
|
486
417
|
};
|
487
418
|
|
488
|
-
TermDocEnum *
|
489
|
-
|
419
|
+
extern TermDocEnum *stde_new(TermInfosReader *tir, InStream *frq_in,
|
420
|
+
BitVector *deleted_docs, int skip_interval);
|
490
421
|
|
491
|
-
/* *
|
492
|
-
TermDocEnum *
|
493
|
-
|
494
|
-
|
495
|
-
typedef struct MultiTermDocEnum MultiTermDocEnum;
|
496
|
-
struct MultiTermDocEnum {
|
497
|
-
IndexReader **irs;
|
498
|
-
int *starts;
|
499
|
-
int ir_cnt;
|
500
|
-
Term *term;
|
501
|
-
int base;
|
502
|
-
int pointer;
|
503
|
-
TermDocEnum **irs_tde;
|
504
|
-
TermDocEnum *curr_tde;
|
505
|
-
TermDocEnum *(*term_docs_from_reader)(IndexReader *ir);
|
506
|
-
};
|
422
|
+
/* * SegmentTermPosEnum * */
|
423
|
+
extern TermDocEnum *stpe_new(TermInfosReader *tir, InStream *frq_in,
|
424
|
+
InStream *prx_in, BitVector *deleted_docs,
|
425
|
+
int skip_interval);
|
507
426
|
|
508
|
-
|
427
|
+
/****************************************************************************
|
428
|
+
* MultipleTermDocPosEnum
|
429
|
+
****************************************************************************/
|
509
430
|
|
510
|
-
|
511
|
-
|
431
|
+
extern TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms,
|
432
|
+
int t_cnt);
|
512
433
|
|
513
434
|
/****************************************************************************
|
514
|
-
*
|
435
|
+
*
|
436
|
+
* Offset
|
437
|
+
*
|
515
438
|
****************************************************************************/
|
516
439
|
|
517
|
-
|
518
|
-
|
519
|
-
int
|
520
|
-
int
|
521
|
-
|
522
|
-
int *pos_queue;
|
523
|
-
int pos_queue_index;
|
524
|
-
int pos_queue_capa;
|
525
|
-
} MultipleTermDocPosEnum;
|
440
|
+
typedef struct Offset
|
441
|
+
{
|
442
|
+
int start;
|
443
|
+
int end;
|
444
|
+
} Offset;
|
526
445
|
|
527
|
-
|
446
|
+
extern Offset *offset_new(int start, int end);
|
447
|
+
|
448
|
+
/****************************************************************************
|
449
|
+
*
|
450
|
+
* Occurence
|
451
|
+
*
|
452
|
+
****************************************************************************/
|
453
|
+
|
454
|
+
typedef struct Occurence
|
455
|
+
{
|
456
|
+
struct Occurence *next;
|
457
|
+
int pos;
|
458
|
+
} Occurence;
|
528
459
|
|
529
460
|
/****************************************************************************
|
530
461
|
*
|
@@ -532,283 +463,388 @@ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
|
|
532
463
|
*
|
533
464
|
****************************************************************************/
|
534
465
|
|
535
|
-
typedef struct Posting
|
536
|
-
|
466
|
+
typedef struct Posting
|
467
|
+
{
|
537
468
|
int freq;
|
538
|
-
int
|
539
|
-
|
540
|
-
|
469
|
+
int doc_num;
|
470
|
+
Occurence *first_occ;
|
471
|
+
struct Posting *next;
|
541
472
|
} Posting;
|
542
473
|
|
543
|
-
Posting *
|
544
|
-
void p_destroy(Posting *self);
|
545
|
-
void p_add_occurance(Posting *self, int position, TVOffsetInfo *offset);
|
474
|
+
extern inline Posting *p_new(MemoryPool *mp, int doc_num, int pos);
|
546
475
|
|
476
|
+
/****************************************************************************
|
477
|
+
*
|
478
|
+
* PostingList
|
479
|
+
*
|
480
|
+
****************************************************************************/
|
481
|
+
|
482
|
+
typedef struct PostingList
|
483
|
+
{
|
484
|
+
const char *term;
|
485
|
+
int term_len;
|
486
|
+
Posting *first;
|
487
|
+
Posting *last;
|
488
|
+
Occurence *last_occ;
|
489
|
+
} PostingList;
|
490
|
+
|
491
|
+
extern PostingList *pl_new(MemoryPool *mp, const char *term,
|
492
|
+
int term_len, Posting *p);
|
493
|
+
extern void pl_add_occ(MemoryPool *mp, PostingList *pl, int pos);
|
547
494
|
|
548
495
|
/****************************************************************************
|
549
496
|
*
|
550
|
-
*
|
497
|
+
* TVField
|
551
498
|
*
|
552
499
|
****************************************************************************/
|
553
500
|
|
554
|
-
typedef struct
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
int max_field_length;
|
566
|
-
int term_index_interval;
|
567
|
-
} DocumentWriter;
|
501
|
+
typedef struct TVField
|
502
|
+
{
|
503
|
+
int field_num;
|
504
|
+
int size;
|
505
|
+
} TVField;
|
506
|
+
|
507
|
+
/****************************************************************************
|
508
|
+
*
|
509
|
+
* TVTerm
|
510
|
+
*
|
511
|
+
****************************************************************************/
|
568
512
|
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
513
|
+
typedef struct TVTerm
|
514
|
+
{
|
515
|
+
char *text;
|
516
|
+
int freq;
|
517
|
+
int *positions;
|
518
|
+
} TVTerm;
|
573
519
|
|
574
520
|
/****************************************************************************
|
575
521
|
*
|
576
|
-
*
|
522
|
+
* TermVector
|
577
523
|
*
|
578
524
|
****************************************************************************/
|
579
525
|
|
580
|
-
typedef struct
|
581
|
-
|
582
|
-
int
|
583
|
-
|
584
|
-
|
526
|
+
typedef struct TermVector
|
527
|
+
{
|
528
|
+
int field_num;
|
529
|
+
char *field;
|
530
|
+
int term_cnt;
|
531
|
+
TVTerm *terms;
|
532
|
+
int offset_cnt;
|
533
|
+
Offset *offsets;
|
534
|
+
} TermVector;
|
585
535
|
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
bool si_uses_compound_file(SegmentInfo *si);
|
590
|
-
bool si_has_separate_norms(SegmentInfo *si);
|
536
|
+
extern void tv_destroy(TermVector *tv);
|
537
|
+
extern int tv_get_tv_term_index(TermVector *tv, const char *term);
|
538
|
+
extern TVTerm *tv_get_tv_term(TermVector *tv, const char *term);
|
591
539
|
|
592
540
|
/****************************************************************************
|
593
541
|
*
|
594
|
-
*
|
542
|
+
* TermVectorsWriter
|
595
543
|
*
|
596
544
|
****************************************************************************/
|
597
545
|
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
546
|
+
#define TV_FIELD_INIT_CAPA 8
|
547
|
+
|
548
|
+
typedef struct TermVectorsWriter
|
549
|
+
{
|
550
|
+
OutStream *tvx_out;
|
551
|
+
OutStream *tvd_out;
|
552
|
+
FieldInfos *fis;
|
553
|
+
TVField *fields;
|
554
|
+
off_t tvd_ptr;
|
555
|
+
} TermVectorsWriter;
|
607
556
|
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
void
|
612
|
-
void
|
613
|
-
void
|
614
|
-
|
615
|
-
|
616
|
-
int
|
557
|
+
extern TermVectorsWriter *tvw_open(Store *store,
|
558
|
+
const char *segment,
|
559
|
+
FieldInfos *fis);
|
560
|
+
extern void tvw_open_doc(TermVectorsWriter *tvw);
|
561
|
+
extern void tvw_close_doc(TermVectorsWriter *tvw);
|
562
|
+
extern void tvw_add_postings(TermVectorsWriter *tvw,
|
563
|
+
int field_num,
|
564
|
+
PostingList **plists,
|
565
|
+
int posting_count,
|
566
|
+
Offset *offsets,
|
567
|
+
int offset_count);
|
568
|
+
extern void tvw_close(TermVectorsWriter *tvw);
|
617
569
|
|
618
570
|
/****************************************************************************
|
619
571
|
*
|
620
|
-
*
|
572
|
+
* TermVectorsReader
|
621
573
|
*
|
622
574
|
****************************************************************************/
|
623
575
|
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
/* all fields which are indexed with termvectors enabled */
|
632
|
-
IR_INDEXED_WITH_TERM_VECTOR,
|
633
|
-
/* all fields which are indexed but don't have termvectors enabled */
|
634
|
-
IR_INDEXED_NO_TERM_VECTOR,
|
635
|
-
/* all fields where termvectors are enabled. Please note that only standard */
|
636
|
-
/* termvector fields are returned */
|
637
|
-
IR_TERM_VECTOR,
|
638
|
-
/* all fields with termvectors with positions enabled */
|
639
|
-
IR_TERM_VECTOR_WITH_POSITION,
|
640
|
-
/* all fields where termvectors with offset position are set */
|
641
|
-
IR_TERM_VECTOR_WITH_OFFSET,
|
642
|
-
/* all fields where termvectors with offset and position values set */
|
643
|
-
IR_TERM_VECTOR_WITH_POSITION_OFFSET
|
644
|
-
};
|
576
|
+
typedef struct TermVectorsReader
|
577
|
+
{
|
578
|
+
int size;
|
579
|
+
InStream *tvx_in;
|
580
|
+
InStream *tvd_in;
|
581
|
+
FieldInfos *fis;
|
582
|
+
} TermVectorsReader;
|
645
583
|
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
584
|
+
extern TermVectorsReader *tvr_open(Store *store,
|
585
|
+
const char *segment,
|
586
|
+
FieldInfos *fis);
|
587
|
+
extern TermVectorsReader *tvr_clone(TermVectorsReader *orig);
|
588
|
+
extern void tvr_close(TermVectorsReader *tvr);
|
589
|
+
extern HashTable *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
|
590
|
+
extern TermVector *tvr_get_field_tv(TermVectorsReader *tvr,
|
591
|
+
int doc_num,
|
592
|
+
int field_num);
|
593
|
+
|
594
|
+
/****************************************************************************
|
595
|
+
*
|
596
|
+
* LazyDoc
|
597
|
+
*
|
598
|
+
****************************************************************************/
|
599
|
+
|
600
|
+
/* * * LazyDocField * * */
|
601
|
+
typedef struct LazyDocFieldData
|
602
|
+
{
|
603
|
+
int start;
|
604
|
+
int length;
|
605
|
+
char *text;
|
606
|
+
} LazyDocFieldData;
|
607
|
+
|
608
|
+
typedef struct LazyDoc LazyDoc;
|
609
|
+
typedef struct LazyDocField
|
610
|
+
{
|
611
|
+
char *name;
|
612
|
+
int size; /* number of data elements */
|
613
|
+
LazyDocFieldData *data;
|
614
|
+
int len; /* length of data elements concatenated */
|
615
|
+
LazyDoc *doc;
|
616
|
+
} LazyDocField;
|
617
|
+
|
618
|
+
extern char *lazy_df_get_data(LazyDocField *self, int i);
|
619
|
+
extern void lazy_df_get_bytes(LazyDocField *self, char *buf,
|
620
|
+
int start, int len);
|
621
|
+
|
622
|
+
/* * * LazyDoc * * */
|
623
|
+
struct LazyDoc
|
624
|
+
{
|
625
|
+
HashTable *field_dict;
|
626
|
+
int size;
|
627
|
+
LazyDocField **fields;
|
628
|
+
InStream *fields_in;
|
682
629
|
};
|
683
630
|
|
684
|
-
|
685
|
-
IndexReader *ir_open(Store *store);
|
686
|
-
bool ir_index_exists(Store *store);
|
687
|
-
void ir_close(IndexReader *ir);
|
688
|
-
void ir_commit(IndexReader *ir);
|
689
|
-
void ir_delete_doc(IndexReader *ir, int doc_num);
|
690
|
-
void ir_undelete_all(IndexReader *ir);
|
691
|
-
void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val);
|
692
|
-
void ir_destroy(IndexReader *self);
|
693
|
-
Document *ir_get_doc_with_term(IndexReader *ir, Term *term);
|
694
|
-
TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term);
|
695
|
-
TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term);
|
696
|
-
void ir_add_cache(IndexReader *ir);
|
697
|
-
bool ir_is_latest(IndexReader *ir);
|
631
|
+
extern void lazy_doc_close(LazyDoc *self);
|
698
632
|
|
699
633
|
/****************************************************************************
|
700
634
|
*
|
701
|
-
*
|
635
|
+
* FieldsReader
|
702
636
|
*
|
703
637
|
****************************************************************************/
|
704
638
|
|
705
|
-
typedef struct
|
706
|
-
|
707
|
-
int
|
708
|
-
|
709
|
-
|
710
|
-
|
639
|
+
typedef struct FieldsReader
|
640
|
+
{
|
641
|
+
int size;
|
642
|
+
FieldInfos *fis;
|
643
|
+
Store *store;
|
644
|
+
InStream *fdx_in;
|
645
|
+
InStream *fdt_in;
|
646
|
+
} FieldsReader;
|
647
|
+
|
648
|
+
extern FieldsReader *fr_open(Store *store,
|
649
|
+
const char *segment, FieldInfos *fis);
|
650
|
+
extern FieldsReader *fr_clone(FieldsReader *orig);
|
651
|
+
extern void fr_close(FieldsReader *fr);
|
652
|
+
extern Document *fr_get_doc(FieldsReader *fr, int doc_num);
|
653
|
+
extern LazyDoc *fr_get_lazy_doc(FieldsReader *fr, int doc_num);
|
654
|
+
extern HashTable *fr_get_tv(FieldsReader *fr, int doc_num);
|
655
|
+
extern TermVector *fr_get_field_tv(FieldsReader *fr, int doc_num,
|
656
|
+
int field_num);
|
711
657
|
|
712
658
|
/****************************************************************************
|
713
659
|
*
|
714
|
-
*
|
660
|
+
* FieldsWriter
|
715
661
|
*
|
716
662
|
****************************************************************************/
|
717
663
|
|
718
|
-
struct
|
664
|
+
typedef struct FieldsWriter
|
665
|
+
{
|
719
666
|
FieldInfos *fis;
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
bool norms_dirty : 1;
|
726
|
-
InStream *freq_in;
|
727
|
-
InStream *prox_in;
|
728
|
-
TermInfosReader *tir;
|
729
|
-
TermVectorsReader *orig_tvr;
|
730
|
-
thread_key_t thread_tvr;
|
731
|
-
Array *tvr_bucket;
|
732
|
-
HshTable *norms;
|
733
|
-
Store *cfs_store;
|
734
|
-
uchar *fake_norms;
|
735
|
-
};
|
667
|
+
OutStream *fdt_out;
|
668
|
+
OutStream *fdx_out;
|
669
|
+
TVField *tv_fields;
|
670
|
+
off_t start_ptr;
|
671
|
+
} FieldsWriter;
|
736
672
|
|
737
|
-
|
738
|
-
|
673
|
+
extern FieldsWriter *fw_open(Store *store,
|
674
|
+
const char *segment, FieldInfos *fis);
|
675
|
+
extern void fw_close(FieldsWriter *fw);
|
676
|
+
extern void fw_add_doc(FieldsWriter *fw, Document *doc);
|
677
|
+
extern void fw_add_postings(FieldsWriter *fw,
|
678
|
+
int field_num,
|
679
|
+
PostingList **plists,
|
680
|
+
int posting_count,
|
681
|
+
Offset *offsets,
|
682
|
+
int offset_count);
|
683
|
+
extern void fw_write_tv_index(FieldsWriter *fw);
|
739
684
|
|
740
685
|
/****************************************************************************
|
741
686
|
*
|
742
|
-
*
|
687
|
+
* IndexReader
|
743
688
|
*
|
744
689
|
****************************************************************************/
|
745
690
|
|
746
|
-
|
747
|
-
|
691
|
+
#define WRITE_LOCK_NAME "write"
|
692
|
+
#define COMMIT_LOCK_NAME "commit"
|
693
|
+
|
694
|
+
struct IndexReader
|
695
|
+
{
|
696
|
+
int (*num_docs)(IndexReader *ir);
|
697
|
+
int (*max_doc)(IndexReader *ir);
|
698
|
+
Document *(*get_doc)(IndexReader *ir, int doc_num);
|
699
|
+
LazyDoc *(*get_lazy_doc)(IndexReader *ir, int doc_num);
|
700
|
+
uchar *(*get_norms)(IndexReader *ir, int field_num);
|
701
|
+
uchar *(*get_norms_into)(IndexReader *ir, int field_num,
|
702
|
+
uchar *buf);
|
703
|
+
TermEnum *(*terms)(IndexReader *ir, int field_num);
|
704
|
+
TermEnum *(*terms_from)(IndexReader *ir, int field_num,
|
705
|
+
const char *term);
|
706
|
+
int (*doc_freq)(IndexReader *ir, int field_num,
|
707
|
+
const char *term);
|
708
|
+
TermDocEnum *(*term_docs)(IndexReader *ir);
|
709
|
+
TermDocEnum *(*term_positions)(IndexReader *ir);
|
710
|
+
TermVector *(*term_vector)(IndexReader *ir, int doc_num,
|
711
|
+
const char *field);
|
712
|
+
HashTable *(*term_vectors)(IndexReader *ir, int doc_num);
|
713
|
+
bool (*is_deleted)(IndexReader *ir, int doc_num);
|
714
|
+
bool (*has_deletions)(IndexReader *ir);
|
715
|
+
void (*acquire_write_lock)(IndexReader *ir);
|
716
|
+
void (*set_norm_i)(IndexReader *ir, int doc_num, int field_num,
|
717
|
+
uchar val);
|
718
|
+
void (*delete_doc_i)(IndexReader *ir, int doc_num);
|
719
|
+
void (*undelete_all_i)(IndexReader *ir);
|
720
|
+
void (*commit_i)(IndexReader *ir);
|
721
|
+
void (*close_i)(IndexReader *ir);
|
722
|
+
int ref_cnt;
|
723
|
+
Store *store;
|
724
|
+
Lock *write_lock;
|
725
|
+
SegmentInfos *sis;
|
726
|
+
FieldInfos *fis;
|
727
|
+
HashTable *cache;
|
728
|
+
HashTable *sort_cache;
|
729
|
+
uchar *fake_norms;
|
730
|
+
mutex_t mutex;
|
731
|
+
bool has_changes : 1;
|
732
|
+
bool is_stale : 1;
|
733
|
+
bool is_owner : 1;
|
734
|
+
};
|
735
|
+
|
736
|
+
extern IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner);
|
737
|
+
extern IndexReader *ir_open(Store *store);
|
738
|
+
extern int ir_get_field_num(IndexReader *ir, const char *field);
|
739
|
+
extern bool ir_index_exists(Store *store);
|
740
|
+
extern void ir_close(IndexReader *ir);
|
741
|
+
extern void ir_commit(IndexReader *ir);
|
742
|
+
extern void ir_delete_doc(IndexReader *ir, int doc_num);
|
743
|
+
extern void ir_undelete_all(IndexReader *ir);
|
744
|
+
extern int ir_doc_freq(IndexReader *ir, const char *field, const char *term);
|
745
|
+
extern void ir_set_norm(IndexReader *ir, int doc_num, const char *field,
|
746
|
+
uchar val);
|
747
|
+
extern uchar *ir_get_norms(IndexReader *ir, const char *field);
|
748
|
+
extern uchar *ir_get_norms_into(IndexReader *ir, const char *field, uchar *buf);
|
749
|
+
extern void ir_destroy(IndexReader *self);
|
750
|
+
extern Document *ir_get_doc_with_term(IndexReader *ir, const char *field,
|
751
|
+
const char *term);
|
752
|
+
extern TermEnum *ir_terms(IndexReader *ir, const char *field);
|
753
|
+
extern TermEnum *ir_terms_from(IndexReader *ir, const char *field,
|
754
|
+
const char *t);
|
755
|
+
extern TermDocEnum *ir_term_docs_for(IndexReader *ir, const char *field,
|
756
|
+
const char *term);
|
757
|
+
extern TermDocEnum *ir_term_positions_for(IndexReader *ir, const char *fld,
|
758
|
+
const char *t);
|
759
|
+
extern void ir_add_cache(IndexReader *ir);
|
760
|
+
extern bool ir_is_latest(IndexReader *ir);
|
761
|
+
|
762
|
+
/****************************************************************************
|
763
|
+
* MultiReader
|
764
|
+
****************************************************************************/
|
765
|
+
|
766
|
+
struct MultiReader {
|
767
|
+
IndexReader ir;
|
748
768
|
int max_doc;
|
749
769
|
int num_docs_cache;
|
750
|
-
int
|
770
|
+
int r_cnt;
|
751
771
|
int *starts;
|
752
772
|
IndexReader **sub_readers;
|
753
|
-
|
754
|
-
|
773
|
+
HashTable *norms_cache;
|
774
|
+
bool has_deletions : 1;
|
775
|
+
int **field_num_map;
|
776
|
+
};
|
777
|
+
|
778
|
+
extern int mr_get_field_num(MultiReader *mr, int ir_num, int f_num);
|
779
|
+
extern IndexReader *mr_open(IndexReader **sub_readers, const int r_cnt);
|
755
780
|
|
756
|
-
IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
|
757
|
-
int rcnt);
|
758
781
|
|
759
782
|
/****************************************************************************
|
760
783
|
*
|
761
|
-
*
|
784
|
+
* Boost
|
762
785
|
*
|
763
786
|
****************************************************************************/
|
764
787
|
|
765
|
-
typedef struct
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
int *doc_map;
|
772
|
-
} SegmentMergeInfo;
|
788
|
+
typedef struct Boost
|
789
|
+
{
|
790
|
+
float val;
|
791
|
+
int doc_num;
|
792
|
+
struct Boost *next;
|
793
|
+
} Boost;
|
773
794
|
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
795
|
+
/****************************************************************************
|
796
|
+
*
|
797
|
+
* FieldInverter
|
798
|
+
*
|
799
|
+
****************************************************************************/
|
800
|
+
|
801
|
+
typedef struct FieldInverter
|
802
|
+
{
|
803
|
+
HashTable *plists;
|
804
|
+
uchar *norms;
|
805
|
+
FieldInfo *fi;
|
806
|
+
int length;
|
807
|
+
bool is_tokenized : 1;
|
808
|
+
bool store_term_vector : 1;
|
809
|
+
bool store_offsets : 1;
|
810
|
+
bool has_norms : 1;
|
811
|
+
} FieldInverter;
|
778
812
|
|
779
813
|
/****************************************************************************
|
780
814
|
*
|
781
|
-
*
|
815
|
+
* DocWriter
|
782
816
|
*
|
783
817
|
****************************************************************************/
|
784
818
|
|
785
|
-
|
819
|
+
#define DW_OFFSET_INIT_CAPA 512
|
820
|
+
typedef struct IndexWriter IndexWriter;
|
821
|
+
|
822
|
+
typedef struct DocWriter
|
823
|
+
{
|
786
824
|
Store *store;
|
787
|
-
char *
|
788
|
-
Array *readers;
|
825
|
+
const char *segment;
|
789
826
|
FieldInfos *fis;
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
int
|
799
|
-
|
827
|
+
TermVectorsWriter *tvw;
|
828
|
+
FieldsWriter *fw;
|
829
|
+
MemoryPool *mp;
|
830
|
+
Analyzer *analyzer;
|
831
|
+
HashTable *curr_plists;
|
832
|
+
HashTable *fields;
|
833
|
+
Similarity *similarity;
|
834
|
+
Offset *offsets;
|
835
|
+
int offsets_size;
|
836
|
+
int offsets_capa;
|
837
|
+
int doc_num;
|
838
|
+
int index_interval;
|
800
839
|
int skip_interval;
|
801
|
-
int
|
802
|
-
int
|
803
|
-
|
804
|
-
} SegmentMerger;
|
805
|
-
|
806
|
-
SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
|
807
|
-
void sm_destroy(SegmentMerger *sm);
|
808
|
-
void sm_add(SegmentMerger *sm, IndexReader *ir);
|
809
|
-
int sm_merge(SegmentMerger *sm);
|
810
|
-
Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
|
840
|
+
int max_field_length;
|
841
|
+
int max_buffered_docs;
|
842
|
+
} DocWriter;
|
811
843
|
|
844
|
+
extern DocWriter *dw_open(IndexWriter *is, const char *segment);
|
845
|
+
extern void dw_close(DocWriter *dw);
|
846
|
+
extern void dw_add_doc(DocWriter *dw, Document *doc);
|
847
|
+
extern void dw_new_segment(DocWriter *dw, char *segment);
|
812
848
|
|
813
849
|
/****************************************************************************
|
814
850
|
*
|
@@ -816,35 +852,38 @@ Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
|
|
816
852
|
*
|
817
853
|
****************************************************************************/
|
818
854
|
|
819
|
-
|
820
|
-
|
821
|
-
|
855
|
+
typedef struct DelTerm
|
856
|
+
{
|
857
|
+
int field_num;
|
858
|
+
char *term;
|
859
|
+
} DelTerm;
|
860
|
+
|
861
|
+
struct IndexWriter
|
862
|
+
{
|
863
|
+
Config config;
|
822
864
|
mutex_t mutex;
|
823
|
-
HshTable *postings;
|
824
|
-
FieldInfos *fis;
|
825
|
-
int merge_factor;
|
826
|
-
int min_merge_docs;
|
827
|
-
int max_merge_docs;
|
828
|
-
int max_field_length;
|
829
|
-
int term_index_interval;
|
830
865
|
Store *store;
|
831
866
|
Analyzer *analyzer;
|
832
|
-
Similarity *similarity;
|
833
867
|
SegmentInfos *sis;
|
834
|
-
|
868
|
+
FieldInfos *fis;
|
869
|
+
DocWriter *dw;
|
870
|
+
Similarity *similarity;
|
871
|
+
DelTerm **del_terms;
|
835
872
|
Lock *write_lock;
|
836
|
-
bool use_compound_file : 1;
|
837
873
|
};
|
838
874
|
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
void
|
843
|
-
|
844
|
-
void
|
845
|
-
void
|
846
|
-
|
847
|
-
void
|
875
|
+
extern void index_create(Store *store, FieldInfos *fis);
|
876
|
+
extern IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
877
|
+
const Config *config);
|
878
|
+
extern void iw_delete_term(IndexWriter *iw, const char *field,
|
879
|
+
const char *term);
|
880
|
+
extern void iw_close(IndexWriter *iw);
|
881
|
+
extern void iw_add_doc(IndexWriter *iw, Document *doc);
|
882
|
+
extern int iw_doc_count(IndexWriter *iw);
|
883
|
+
extern void iw_commit(IndexWriter *iw);
|
884
|
+
extern void iw_optimize(IndexWriter *iw);
|
885
|
+
extern void iw_add_readers(IndexWriter *iw, IndexReader **readers,
|
886
|
+
const int r_cnt);
|
848
887
|
|
849
888
|
/****************************************************************************
|
850
889
|
*
|
@@ -852,16 +891,24 @@ void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt);
|
|
852
891
|
*
|
853
892
|
****************************************************************************/
|
854
893
|
|
894
|
+
#define CW_INIT_CAPA 16
|
895
|
+
typedef struct CWFileEntry
|
896
|
+
{
|
897
|
+
char *name;
|
898
|
+
off_t dir_offset;
|
899
|
+
off_t data_offset;
|
900
|
+
} CWFileEntry;
|
901
|
+
|
855
902
|
typedef struct CompoundWriter {
|
856
903
|
Store *store;
|
857
904
|
const char *name;
|
858
905
|
HashSet *ids;
|
859
|
-
|
860
|
-
bool merged;
|
906
|
+
CWFileEntry *file_entries;
|
861
907
|
} CompoundWriter;
|
862
908
|
|
863
|
-
CompoundWriter *open_cw(Store *store, char *name);
|
864
|
-
void cw_add_file(CompoundWriter *cw, char *id);
|
865
|
-
void cw_close(CompoundWriter *cw);
|
909
|
+
extern CompoundWriter *open_cw(Store *store, char *name);
|
910
|
+
extern void cw_add_file(CompoundWriter *cw, char *id);
|
911
|
+
extern void cw_close(CompoundWriter *cw);
|
912
|
+
|
866
913
|
|
867
914
|
#endif
|