ferret 0.11.6 → 0.11.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +10 -22
- data/RELEASE_CHANGES +137 -0
- data/RELEASE_NOTES +60 -0
- data/Rakefile +379 -274
- data/TODO +100 -8
- data/bin/ferret-browser +0 -0
- data/ext/BZLIB_blocksort.c +1094 -0
- data/ext/BZLIB_bzlib.c +1578 -0
- data/ext/BZLIB_compress.c +672 -0
- data/ext/BZLIB_crctable.c +104 -0
- data/ext/BZLIB_decompress.c +626 -0
- data/ext/BZLIB_huffman.c +205 -0
- data/ext/BZLIB_randtable.c +84 -0
- data/ext/{api.c → STEMMER_api.c} +7 -10
- data/ext/{libstemmer.c → STEMMER_libstemmer.c} +3 -2
- data/ext/{stem_ISO_8859_1_danish.c → STEMMER_stem_ISO_8859_1_danish.c} +123 -124
- data/ext/{stem_ISO_8859_1_dutch.c → STEMMER_stem_ISO_8859_1_dutch.c} +177 -188
- data/ext/STEMMER_stem_ISO_8859_1_english.c +1117 -0
- data/ext/{stem_ISO_8859_1_finnish.c → STEMMER_stem_ISO_8859_1_finnish.c} +276 -306
- data/ext/STEMMER_stem_ISO_8859_1_french.c +1246 -0
- data/ext/{stem_ISO_8859_1_german.c → STEMMER_stem_ISO_8859_1_german.c} +161 -170
- data/ext/STEMMER_stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/STEMMER_stem_ISO_8859_1_italian.c +1065 -0
- data/ext/STEMMER_stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/{stem_ISO_8859_1_porter.c → STEMMER_stem_ISO_8859_1_porter.c} +263 -290
- data/ext/{stem_ISO_8859_1_portuguese.c → STEMMER_stem_ISO_8859_1_portuguese.c} +362 -380
- data/ext/STEMMER_stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/STEMMER_stem_ISO_8859_1_swedish.c +307 -0
- data/ext/STEMMER_stem_ISO_8859_2_romanian.c +998 -0
- data/ext/{stem_KOI8_R_russian.c → STEMMER_stem_KOI8_R_russian.c} +244 -245
- data/ext/STEMMER_stem_UTF_8_danish.c +339 -0
- data/ext/{stem_UTF_8_dutch.c → STEMMER_stem_UTF_8_dutch.c} +192 -211
- data/ext/STEMMER_stem_UTF_8_english.c +1125 -0
- data/ext/{stem_UTF_8_finnish.c → STEMMER_stem_UTF_8_finnish.c} +284 -324
- data/ext/STEMMER_stem_UTF_8_french.c +1256 -0
- data/ext/{stem_UTF_8_german.c → STEMMER_stem_UTF_8_german.c} +170 -187
- data/ext/STEMMER_stem_UTF_8_hungarian.c +1234 -0
- data/ext/STEMMER_stem_UTF_8_italian.c +1073 -0
- data/ext/STEMMER_stem_UTF_8_norwegian.c +299 -0
- data/ext/{stem_UTF_8_porter.c → STEMMER_stem_UTF_8_porter.c} +271 -310
- data/ext/STEMMER_stem_UTF_8_portuguese.c +1023 -0
- data/ext/STEMMER_stem_UTF_8_romanian.c +1004 -0
- data/ext/STEMMER_stem_UTF_8_russian.c +694 -0
- data/ext/STEMMER_stem_UTF_8_spanish.c +1097 -0
- data/ext/STEMMER_stem_UTF_8_swedish.c +309 -0
- data/ext/STEMMER_stem_UTF_8_turkish.c +2205 -0
- data/ext/{utilities.c → STEMMER_utilities.c} +100 -68
- data/ext/analysis.c +276 -121
- data/ext/analysis.h +190 -143
- data/ext/api.h +3 -4
- data/ext/array.c +5 -3
- data/ext/array.h +52 -43
- data/ext/bitvector.c +38 -482
- data/ext/bitvector.h +446 -124
- data/ext/bzlib.h +282 -0
- data/ext/bzlib_private.h +503 -0
- data/ext/compound_io.c +23 -22
- data/ext/config.h +21 -11
- data/ext/document.c +43 -40
- data/ext/document.h +31 -21
- data/ext/except.c +20 -38
- data/ext/except.h +89 -76
- data/ext/extconf.rb +3 -2
- data/ext/ferret.c +49 -35
- data/ext/ferret.h +14 -11
- data/ext/field_index.c +262 -0
- data/ext/field_index.h +52 -0
- data/ext/filter.c +11 -10
- data/ext/fs_store.c +65 -47
- data/ext/global.c +245 -165
- data/ext/global.h +252 -54
- data/ext/hash.c +200 -243
- data/ext/hash.h +205 -163
- data/ext/hashset.c +118 -96
- data/ext/hashset.h +110 -82
- data/ext/header.h +19 -19
- data/ext/helper.c +11 -10
- data/ext/helper.h +14 -6
- data/ext/index.c +745 -366
- data/ext/index.h +503 -529
- data/ext/internal.h +1020 -0
- data/ext/lang.c +10 -0
- data/ext/lang.h +35 -15
- data/ext/mempool.c +5 -4
- data/ext/mempool.h +30 -22
- data/ext/modules.h +35 -7
- data/ext/multimapper.c +43 -2
- data/ext/multimapper.h +32 -23
- data/ext/posh.c +0 -0
- data/ext/posh.h +4 -38
- data/ext/priorityqueue.c +10 -12
- data/ext/priorityqueue.h +33 -21
- data/ext/q_boolean.c +22 -9
- data/ext/q_const_score.c +3 -2
- data/ext/q_filtered_query.c +15 -12
- data/ext/q_fuzzy.c +147 -135
- data/ext/q_match_all.c +3 -2
- data/ext/q_multi_term.c +28 -32
- data/ext/q_parser.c +451 -173
- data/ext/q_phrase.c +158 -79
- data/ext/q_prefix.c +16 -18
- data/ext/q_range.c +363 -31
- data/ext/q_span.c +130 -141
- data/ext/q_term.c +21 -21
- data/ext/q_wildcard.c +19 -23
- data/ext/r_analysis.c +369 -242
- data/ext/r_index.c +421 -434
- data/ext/r_qparser.c +142 -92
- data/ext/r_search.c +790 -407
- data/ext/r_store.c +44 -44
- data/ext/r_utils.c +264 -96
- data/ext/ram_store.c +29 -23
- data/ext/scanner.c +895 -0
- data/ext/scanner.h +36 -0
- data/ext/scanner_mb.c +6701 -0
- data/ext/scanner_utf8.c +4415 -0
- data/ext/search.c +210 -87
- data/ext/search.h +556 -488
- data/ext/similarity.c +17 -16
- data/ext/similarity.h +51 -44
- data/ext/sort.c +157 -354
- data/ext/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/stem_UTF_8_hungarian.h +16 -0
- data/ext/stem_UTF_8_romanian.h +16 -0
- data/ext/stem_UTF_8_turkish.h +16 -0
- data/ext/stopwords.c +287 -278
- data/ext/store.c +57 -51
- data/ext/store.h +308 -286
- data/ext/symbol.c +10 -0
- data/ext/symbol.h +23 -0
- data/ext/term_vectors.c +14 -293
- data/ext/threading.h +22 -22
- data/ext/win32.h +12 -4
- data/lib/ferret.rb +2 -1
- data/lib/ferret/browser.rb +1 -1
- data/lib/ferret/field_symbol.rb +94 -0
- data/lib/ferret/index.rb +221 -34
- data/lib/ferret/number_tools.rb +6 -6
- data/lib/ferret/version.rb +3 -0
- data/test/{unit → long_running}/largefile/tc_largefile.rb +1 -1
- data/test/test_helper.rb +7 -2
- data/test/test_installed.rb +1 -0
- data/test/threading/thread_safety_index_test.rb +10 -1
- data/test/threading/thread_safety_read_write_test.rb +4 -7
- data/test/threading/thread_safety_test.rb +0 -0
- data/test/unit/analysis/tc_analyzer.rb +29 -27
- data/test/unit/analysis/tc_token_stream.rb +23 -16
- data/test/unit/index/tc_index.rb +116 -11
- data/test/unit/index/tc_index_reader.rb +27 -27
- data/test/unit/index/tc_index_writer.rb +10 -0
- data/test/unit/index/th_doc.rb +38 -21
- data/test/unit/search/tc_filter.rb +31 -10
- data/test/unit/search/tc_index_searcher.rb +6 -0
- data/test/unit/search/tm_searcher.rb +53 -1
- data/test/unit/store/tc_fs_store.rb +40 -2
- data/test/unit/store/tc_ram_store.rb +0 -0
- data/test/unit/store/tm_store.rb +0 -0
- data/test/unit/store/tm_store_lock.rb +7 -6
- data/test/unit/tc_field_symbol.rb +26 -0
- data/test/unit/ts_analysis.rb +0 -0
- data/test/unit/ts_index.rb +0 -0
- data/test/unit/ts_store.rb +0 -0
- data/test/unit/ts_utils.rb +0 -0
- data/test/unit/utils/tc_number_tools.rb +0 -0
- data/test/utils/content_generator.rb +226 -0
- metadata +262 -221
- data/ext/inc/lang.h +0 -48
- data/ext/inc/threading.h +0 -31
- data/ext/stem_ISO_8859_1_english.c +0 -1156
- data/ext/stem_ISO_8859_1_french.c +0 -1276
- data/ext/stem_ISO_8859_1_italian.c +0 -1091
- data/ext/stem_ISO_8859_1_norwegian.c +0 -296
- data/ext/stem_ISO_8859_1_spanish.c +0 -1119
- data/ext/stem_ISO_8859_1_swedish.c +0 -307
- data/ext/stem_UTF_8_danish.c +0 -344
- data/ext/stem_UTF_8_english.c +0 -1176
- data/ext/stem_UTF_8_french.c +0 -1296
- data/ext/stem_UTF_8_italian.c +0 -1113
- data/ext/stem_UTF_8_norwegian.c +0 -302
- data/ext/stem_UTF_8_portuguese.c +0 -1055
- data/ext/stem_UTF_8_russian.c +0 -709
- data/ext/stem_UTF_8_spanish.c +0 -1137
- data/ext/stem_UTF_8_swedish.c +0 -313
- data/lib/ferret_version.rb +0 -3
data/ext/header.h
CHANGED
@@ -14,7 +14,7 @@
|
|
14
14
|
|
15
15
|
struct among
|
16
16
|
{ int s_size; /* number of chars in string */
|
17
|
-
symbol * s; /* search string */
|
17
|
+
const symbol * s; /* search string */
|
18
18
|
int substring_i;/* index to longest matching substring */
|
19
19
|
int result; /* result of the lookup */
|
20
20
|
int (* function)(struct SN_env *);
|
@@ -25,31 +25,31 @@ extern void lose_s(symbol * p);
|
|
25
25
|
|
26
26
|
extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);
|
27
27
|
|
28
|
-
extern int in_grouping_U(struct SN_env * z, unsigned char * s, int min, int max);
|
29
|
-
extern int in_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max);
|
30
|
-
extern int out_grouping_U(struct SN_env * z, unsigned char * s, int min, int max);
|
31
|
-
extern int out_grouping_b_U(struct SN_env * z, unsigned char * s, int min, int max);
|
28
|
+
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
29
|
+
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
30
|
+
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
31
|
+
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
32
32
|
|
33
|
-
extern int in_grouping(struct SN_env * z, unsigned char * s, int min, int max);
|
34
|
-
extern int in_grouping_b(struct SN_env * z, unsigned char * s, int min, int max);
|
35
|
-
extern int out_grouping(struct SN_env * z, unsigned char * s, int min, int max);
|
36
|
-
extern int out_grouping_b(struct SN_env * z, unsigned char * s, int min, int max);
|
33
|
+
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
34
|
+
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
35
|
+
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
36
|
+
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
|
37
37
|
|
38
|
-
extern int eq_s(struct SN_env * z, int s_size, symbol * s);
|
39
|
-
extern int eq_s_b(struct SN_env * z, int s_size, symbol * s);
|
40
|
-
extern int eq_v(struct SN_env * z, symbol * p);
|
41
|
-
extern int eq_v_b(struct SN_env * z, symbol * p);
|
38
|
+
extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
|
39
|
+
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
|
40
|
+
extern int eq_v(struct SN_env * z, const symbol * p);
|
41
|
+
extern int eq_v_b(struct SN_env * z, const symbol * p);
|
42
42
|
|
43
|
-
extern int find_among(struct SN_env * z, struct among * v, int v_size);
|
44
|
-
extern int find_among_b(struct SN_env * z, struct among * v, int v_size);
|
43
|
+
extern int find_among(struct SN_env * z, const struct among * v, int v_size);
|
44
|
+
extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
|
45
45
|
|
46
46
|
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
|
47
|
-
extern int slice_from_s(struct SN_env * z, int s_size, symbol * s);
|
48
|
-
extern int slice_from_v(struct SN_env * z, symbol * p);
|
47
|
+
extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
|
48
|
+
extern int slice_from_v(struct SN_env * z, const symbol * p);
|
49
49
|
extern int slice_del(struct SN_env * z);
|
50
50
|
|
51
|
-
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, symbol * s);
|
52
|
-
extern int insert_v(struct SN_env * z, int bra, int ket, symbol * p);
|
51
|
+
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
|
52
|
+
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
|
53
53
|
|
54
54
|
extern symbol * slice_to(struct SN_env * z, symbol * p);
|
55
55
|
extern symbol * assign_to(struct SN_env * z, symbol * p);
|
data/ext/helper.c
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#include "helper.h"
|
2
|
+
#include "internal.h"
|
2
3
|
|
3
4
|
int hlp_string_diff(register const char *const s1,
|
4
5
|
register const char *const s2)
|
@@ -10,17 +11,17 @@ int hlp_string_diff(register const char *const s1,
|
|
10
11
|
return i;
|
11
12
|
}
|
12
13
|
|
13
|
-
|
14
|
+
i32 float2int(float f)
|
14
15
|
{
|
15
|
-
union {
|
16
|
+
union { i32 i; float f; } tmp;
|
16
17
|
tmp.f = f;
|
17
18
|
return tmp.i;
|
18
19
|
}
|
19
20
|
|
20
|
-
float int2float(
|
21
|
+
float int2float(i32 v)
|
21
22
|
{
|
22
|
-
union {
|
23
|
-
tmp.i =
|
23
|
+
union { i32 i; float f; } tmp;
|
24
|
+
tmp.i = v;
|
24
25
|
return tmp.f;
|
25
26
|
}
|
26
27
|
|
@@ -30,8 +31,8 @@ float byte2float(unsigned char b)
|
|
30
31
|
return 0.0;
|
31
32
|
}
|
32
33
|
else {
|
33
|
-
|
34
|
-
|
34
|
+
u32 mantissa = b & 0x07;
|
35
|
+
u32 exponent = (b >> 3) & 0x1f;
|
35
36
|
|
36
37
|
return int2float((mantissa << 21) | ((exponent + 48) << 24));
|
37
38
|
}
|
@@ -44,9 +45,9 @@ unsigned char float2byte(float f)
|
|
44
45
|
}
|
45
46
|
else {
|
46
47
|
/* correctly order the bytes for encoding */
|
47
|
-
|
48
|
-
int mantissa = (
|
49
|
-
int exponent = ((
|
48
|
+
u32 i = float2int(f);
|
49
|
+
int mantissa = (i & 0xEf0000) >> 21;
|
50
|
+
int exponent = ((i >> 24) - 48);
|
50
51
|
|
51
52
|
if (exponent > 0x1f) {
|
52
53
|
exponent = 0x1f; /* 0x1f = 31 = 0b00011111 */
|
data/ext/helper.h
CHANGED
@@ -1,13 +1,21 @@
|
|
1
1
|
#ifndef FRT_HELPER_H
|
2
2
|
#define FRT_HELPER_H
|
3
3
|
|
4
|
+
#ifdef __cplusplus
|
5
|
+
extern "C" {
|
6
|
+
#endif
|
7
|
+
|
4
8
|
#include "config.h"
|
5
9
|
|
6
|
-
extern int
|
7
|
-
|
8
|
-
extern
|
9
|
-
extern float
|
10
|
-
extern float
|
11
|
-
extern unsigned char
|
10
|
+
extern int frt_hlp_string_diff(register const char *const s1,
|
11
|
+
register const char *const s2);
|
12
|
+
extern frt_i32 frt_float2int(float f);
|
13
|
+
extern float frt_int2float(frt_i32 i32);
|
14
|
+
extern float frt_byte2float(unsigned char b);
|
15
|
+
extern unsigned char frt_float2byte(float f);
|
16
|
+
|
17
|
+
#ifdef __cplusplus
|
18
|
+
} // extern "C"
|
19
|
+
#endif
|
12
20
|
|
13
21
|
#endif
|
data/ext/index.c
CHANGED
@@ -1,11 +1,17 @@
|
|
1
1
|
#include "index.h"
|
2
|
+
#include "symbol.h"
|
2
3
|
#include "similarity.h"
|
3
4
|
#include "helper.h"
|
4
5
|
#include "array.h"
|
5
|
-
#include "priorityqueue.h"
|
6
6
|
#include <string.h>
|
7
7
|
#include <limits.h>
|
8
8
|
#include <ctype.h>
|
9
|
+
#ifdef USE_ZLIB
|
10
|
+
# include <zlib.h>
|
11
|
+
#else
|
12
|
+
# include "bzlib.h"
|
13
|
+
#endif
|
14
|
+
#include "internal.h"
|
9
15
|
|
10
16
|
#define GET_LOCK(lock, name, store, err_msg) do {\
|
11
17
|
lock = store->open_lock(store, name);\
|
@@ -26,7 +32,7 @@ const Config default_config = {
|
|
26
32
|
SKIP_INTERVAL, /* skip interval */
|
27
33
|
10, /* default merge factor */
|
28
34
|
10000, /* max_buffered_docs */
|
29
|
-
INT_MAX, /*
|
35
|
+
INT_MAX, /* max_merge_docs */
|
30
36
|
10000, /* maximum field length (number of terms) */
|
31
37
|
true /* use compound file by default */
|
32
38
|
};
|
@@ -37,21 +43,22 @@ static char *ste_next(TermEnum *te);
|
|
37
43
|
#define FORMAT 0
|
38
44
|
#define SEGMENTS_GEN_FILE_NAME "segments"
|
39
45
|
#define MAX_EXT_LEN 10
|
46
|
+
#define ZIP_BUFFER_SIZE 16348
|
47
|
+
#define ZIP_LEVEL 9
|
40
48
|
|
41
49
|
/* *** Must be three characters *** */
|
42
|
-
const char *INDEX_EXTENSIONS[] = {
|
50
|
+
static const char *INDEX_EXTENSIONS[] = {
|
43
51
|
"frq", "prx", "fdx", "fdt", "tfx", "tix", "tis", "del", "gen", "cfs"
|
44
52
|
};
|
45
53
|
|
46
54
|
/* *** Must be three characters *** */
|
47
|
-
const char *COMPOUND_EXTENSIONS[] = {
|
55
|
+
static const char *COMPOUND_EXTENSIONS[] = {
|
48
56
|
"frq", "prx", "fdx", "fdt", "tfx", "tix", "tis"
|
49
57
|
};
|
50
58
|
|
51
|
-
|
52
59
|
static const char BASE36_DIGITMAP[] = "0123456789abcdefghijklmnopqrstuvwxyz";
|
53
60
|
|
54
|
-
static char *u64_to_str36(char *buf, int buf_size,
|
61
|
+
static char *u64_to_str36(char *buf, int buf_size, u64 u)
|
55
62
|
{
|
56
63
|
int i = buf_size - 1;
|
57
64
|
buf[i] = '\0';
|
@@ -69,9 +76,9 @@ static char *u64_to_str36(char *buf, int buf_size, f_u64 u)
|
|
69
76
|
return buf + i;
|
70
77
|
}
|
71
78
|
|
72
|
-
static
|
79
|
+
static u64 str36_to_u64(char *p)
|
73
80
|
{
|
74
|
-
|
81
|
+
u64 u = 0;
|
75
82
|
while (true) {
|
76
83
|
if ('0' <= *p && '9' >= *p) {
|
77
84
|
u = u * 36 + *p - '0';
|
@@ -98,14 +105,14 @@ static f_u64 str36_to_u64(char *p)
|
|
98
105
|
* @param ext extension of the filename (including .)
|
99
106
|
* @param gen generation
|
100
107
|
*/
|
101
|
-
char *fn_for_generation(char *buf, char *base, char *ext,
|
108
|
+
char *fn_for_generation(char *buf, char *base, char *ext, i64 gen)
|
102
109
|
{
|
103
110
|
if (-1 == gen) {
|
104
111
|
return NULL;
|
105
112
|
}
|
106
113
|
else {
|
107
114
|
char b[SEGMENT_NAME_MAX_LENGTH];
|
108
|
-
char *u = u64_to_str36(b, SEGMENT_NAME_MAX_LENGTH, (
|
115
|
+
char *u = u64_to_str36(b, SEGMENT_NAME_MAX_LENGTH, (u64)gen);
|
109
116
|
if (ext == NULL) {
|
110
117
|
sprintf(buf, "%s_%s", base, u);
|
111
118
|
}
|
@@ -116,7 +123,7 @@ char *fn_for_generation(char *buf, char *base, char *ext, f_i64 gen)
|
|
116
123
|
}
|
117
124
|
}
|
118
125
|
|
119
|
-
char *segfn_for_generation(char *buf,
|
126
|
+
static char *segfn_for_generation(char *buf, u64 generation)
|
120
127
|
{
|
121
128
|
char b[SEGMENT_NAME_MAX_LENGTH];
|
122
129
|
char *u = u64_to_str36(b, SEGMENT_NAME_MAX_LENGTH, generation);
|
@@ -137,9 +144,9 @@ char *segfn_for_generation(char *buf, f_u64 generation)
|
|
137
144
|
* @param field_num field number
|
138
145
|
*/
|
139
146
|
static char *fn_for_gen_field(char *buf,
|
140
|
-
char *base,
|
141
|
-
char *ext,
|
142
|
-
|
147
|
+
const char *base,
|
148
|
+
const char *ext,
|
149
|
+
i64 gen,
|
143
150
|
int field_num)
|
144
151
|
{
|
145
152
|
if (-1 == gen) {
|
@@ -149,7 +156,7 @@ static char *fn_for_gen_field(char *buf,
|
|
149
156
|
char b[SEGMENT_NAME_MAX_LENGTH];
|
150
157
|
sprintf(buf, "%s_%s.%s%d",
|
151
158
|
base,
|
152
|
-
u64_to_str36(b, SEGMENT_NAME_MAX_LENGTH, (
|
159
|
+
u64_to_str36(b, SEGMENT_NAME_MAX_LENGTH, (u64)gen),
|
153
160
|
ext,
|
154
161
|
field_num);
|
155
162
|
return buf;
|
@@ -172,7 +179,7 @@ static int co_eq(const void *key1, const void *key2)
|
|
172
179
|
return (key1 == key2);
|
173
180
|
}
|
174
181
|
|
175
|
-
void co_destroy(CacheObject *self)
|
182
|
+
static void co_destroy(CacheObject *self)
|
176
183
|
{
|
177
184
|
h_rem(self->ref_tab1, self->ref2, false);
|
178
185
|
h_rem(self->ref_tab2, self->ref1, false);
|
@@ -180,7 +187,7 @@ void co_destroy(CacheObject *self)
|
|
180
187
|
free(self);
|
181
188
|
}
|
182
189
|
|
183
|
-
CacheObject *co_create(
|
190
|
+
CacheObject *co_create(Hash *ref_tab1, Hash *ref_tab2,
|
184
191
|
void *ref1, void *ref2, free_ft destroy, void *obj)
|
185
192
|
{
|
186
193
|
CacheObject *self = ALLOC(CacheObject);
|
@@ -195,7 +202,7 @@ CacheObject *co_create(HashTable *ref_tab1, HashTable *ref_tab2,
|
|
195
202
|
return self;
|
196
203
|
}
|
197
204
|
|
198
|
-
|
205
|
+
Hash *co_hash_create()
|
199
206
|
{
|
200
207
|
return h_new(&co_hash, &co_eq, (free_ft)NULL, (free_ft)&co_destroy);
|
201
208
|
}
|
@@ -206,7 +213,7 @@ HashTable *co_hash_create()
|
|
206
213
|
*
|
207
214
|
****************************************************************************/
|
208
215
|
|
209
|
-
INLINE void fi_set_store(FieldInfo *fi, int store)
|
216
|
+
static INLINE void fi_set_store(FieldInfo *fi, int store)
|
210
217
|
{
|
211
218
|
switch (store) {
|
212
219
|
case STORE_NO:
|
@@ -220,7 +227,7 @@ INLINE void fi_set_store(FieldInfo *fi, int store)
|
|
220
227
|
}
|
221
228
|
}
|
222
229
|
|
223
|
-
INLINE void fi_set_index(FieldInfo *fi, int index)
|
230
|
+
static INLINE void fi_set_index(FieldInfo *fi, int index)
|
224
231
|
{
|
225
232
|
switch (index) {
|
226
233
|
case INDEX_NO:
|
@@ -241,7 +248,7 @@ INLINE void fi_set_index(FieldInfo *fi, int index)
|
|
241
248
|
}
|
242
249
|
}
|
243
250
|
|
244
|
-
INLINE void fi_set_term_vector(FieldInfo *fi, int term_vector)
|
251
|
+
static INLINE void fi_set_term_vector(FieldInfo *fi, int term_vector)
|
245
252
|
{
|
246
253
|
switch (term_vector) {
|
247
254
|
case TERM_VECTOR_NO:
|
@@ -271,14 +278,15 @@ static void fi_check_params(int store, int index, int term_vector)
|
|
271
278
|
}
|
272
279
|
}
|
273
280
|
|
274
|
-
FieldInfo *fi_new(
|
275
|
-
|
276
|
-
|
277
|
-
|
281
|
+
FieldInfo *fi_new(Symbol name,
|
282
|
+
StoreValue store,
|
283
|
+
IndexValue index,
|
284
|
+
TermVectorValue term_vector)
|
278
285
|
{
|
279
286
|
FieldInfo *fi = ALLOC(FieldInfo);
|
287
|
+
assert(NULL != name);
|
280
288
|
fi_check_params(store, index, term_vector);
|
281
|
-
fi->name =
|
289
|
+
fi->name = name;
|
282
290
|
fi->boost = 1.0;
|
283
291
|
fi->bits = 0;
|
284
292
|
fi_set_store(fi, store);
|
@@ -291,28 +299,28 @@ FieldInfo *fi_new(const char *name,
|
|
291
299
|
void fi_deref(FieldInfo *fi)
|
292
300
|
{
|
293
301
|
if (0 == --(fi->ref_cnt)) {
|
294
|
-
free(fi->name);
|
295
302
|
free(fi);
|
296
303
|
}
|
297
304
|
}
|
298
305
|
|
299
306
|
char *fi_to_s(FieldInfo *fi)
|
300
307
|
{
|
301
|
-
char *str = ALLOC_N(char, strlen(fi->name) + 200);
|
308
|
+
char *str = ALLOC_N(char, strlen((char *)fi->name) + 200);
|
302
309
|
char *s = str;
|
303
|
-
sprintf(str, "[\"%s\":(%s%s%s%s%s%s%s%s", fi->name,
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
s
|
310
|
+
s += sprintf(str, "[\"%s\":(%s%s%s%s%s%s%s%s", (char *)fi->name,
|
311
|
+
fi_is_stored(fi) ? "is_stored, " : "",
|
312
|
+
fi_is_compressed(fi) ? "is_compressed, " : "",
|
313
|
+
fi_is_indexed(fi) ? "is_indexed, " : "",
|
314
|
+
fi_is_tokenized(fi) ? "is_tokenized, " : "",
|
315
|
+
fi_omit_norms(fi) ? "omit_norms, " : "",
|
316
|
+
fi_store_term_vector(fi) ? "store_term_vector, " : "",
|
317
|
+
fi_store_positions(fi) ? "store_positions, " : "",
|
318
|
+
fi_store_offsets(fi) ? "store_offsets, " : "");
|
319
|
+
s -= 2;
|
313
320
|
if (*s != ',') {
|
314
321
|
s += 2;
|
315
322
|
}
|
323
|
+
|
316
324
|
sprintf(s, ")]");
|
317
325
|
return str;
|
318
326
|
}
|
@@ -323,11 +331,12 @@ char *fi_to_s(FieldInfo *fi)
|
|
323
331
|
*
|
324
332
|
****************************************************************************/
|
325
333
|
|
326
|
-
FieldInfos *fis_new(
|
334
|
+
FieldInfos *fis_new(StoreValue store, IndexValue index,
|
335
|
+
TermVectorValue term_vector)
|
327
336
|
{
|
328
337
|
FieldInfos *fis = ALLOC(FieldInfos);
|
329
338
|
fi_check_params(store, index, term_vector);
|
330
|
-
fis->field_dict =
|
339
|
+
fis->field_dict = h_new_ptr((free_ft)&fi_deref);
|
331
340
|
fis->size = 0;
|
332
341
|
fis->capa = FIELD_INFOS_INIT_CAPA;
|
333
342
|
fis->fields = ALLOC_N(FieldInfo *, fis->capa);
|
@@ -346,7 +355,7 @@ FieldInfo *fis_add_field(FieldInfos *fis, FieldInfo *fi)
|
|
346
355
|
}
|
347
356
|
if (!h_set_safe(fis->field_dict, fi->name, fi)) {
|
348
357
|
RAISE(ARG_ERROR,
|
349
|
-
"Field :%s already exists", fi->name);
|
358
|
+
"Field :%s already exists", (char *)fi->name);
|
350
359
|
}
|
351
360
|
fi->number = fis->size;
|
352
361
|
fis->fields[fis->size] = fi;
|
@@ -354,14 +363,14 @@ FieldInfo *fis_add_field(FieldInfos *fis, FieldInfo *fi)
|
|
354
363
|
return fi;
|
355
364
|
}
|
356
365
|
|
357
|
-
FieldInfo *fis_get_field(FieldInfos *fis,
|
366
|
+
FieldInfo *fis_get_field(FieldInfos *fis, Symbol name)
|
358
367
|
{
|
359
|
-
return h_get(fis->field_dict, name);
|
368
|
+
return (FieldInfo *)h_get(fis->field_dict, name);
|
360
369
|
}
|
361
370
|
|
362
|
-
int fis_get_field_num(FieldInfos *fis,
|
371
|
+
int fis_get_field_num(FieldInfos *fis, Symbol name)
|
363
372
|
{
|
364
|
-
FieldInfo *fi = h_get(fis->field_dict, name);
|
373
|
+
FieldInfo *fi = (FieldInfo *)h_get(fis->field_dict, name);
|
365
374
|
if (fi) {
|
366
375
|
return fi->number;
|
367
376
|
}
|
@@ -370,11 +379,11 @@ int fis_get_field_num(FieldInfos *fis, const char *name)
|
|
370
379
|
}
|
371
380
|
}
|
372
381
|
|
373
|
-
FieldInfo *fis_get_or_add_field(FieldInfos *fis,
|
382
|
+
FieldInfo *fis_get_or_add_field(FieldInfos *fis, Symbol name)
|
374
383
|
{
|
375
|
-
FieldInfo *fi = h_get(fis->field_dict, name);
|
384
|
+
FieldInfo *fi = (FieldInfo *)h_get(fis->field_dict, name);
|
376
385
|
if (!fi) {
|
377
|
-
fi = fi_new(name, fis->store, fis->index, fis->term_vector);
|
386
|
+
fi = (FieldInfo*)fi_new(name, fis->store, fis->index, fis->term_vector);
|
378
387
|
fis_add_field(fis, fi);
|
379
388
|
}
|
380
389
|
return fi;
|
@@ -392,27 +401,28 @@ FieldInfo *fis_by_number(FieldInfos *fis, int num)
|
|
392
401
|
|
393
402
|
FieldInfos *fis_read(InStream *is)
|
394
403
|
{
|
395
|
-
FieldInfos *volatile fis;
|
404
|
+
FieldInfos *volatile fis = NULL;
|
396
405
|
TRY
|
397
406
|
do {
|
398
|
-
|
399
|
-
|
400
|
-
|
407
|
+
StoreValue store_val;
|
408
|
+
IndexValue index_val;
|
409
|
+
TermVectorValue term_vector_val;
|
410
|
+
volatile int i;
|
411
|
+
union { u32 i; float f; } tmp;
|
401
412
|
FieldInfo *volatile fi;
|
402
413
|
|
403
|
-
store_val = is_read_vint(is);
|
404
|
-
index_val = is_read_vint(is);
|
405
|
-
term_vector_val = is_read_vint(is);
|
414
|
+
store_val = (StoreValue)is_read_vint(is);
|
415
|
+
index_val = (IndexValue)is_read_vint(is);
|
416
|
+
term_vector_val = (TermVectorValue)is_read_vint(is);
|
406
417
|
fis = fis_new(store_val, index_val, term_vector_val);
|
407
418
|
for (i = is_read_vint(is); i > 0; i--) {
|
408
419
|
fi = ALLOC_AND_ZERO(FieldInfo);
|
409
420
|
TRY
|
410
|
-
fi->name = is_read_string_safe(is);
|
421
|
+
fi->name = intern_and_free(is_read_string_safe(is));
|
411
422
|
tmp.i = is_read_u32(is);
|
412
423
|
fi->boost = tmp.f;
|
413
424
|
fi->bits = is_read_vint(is);
|
414
425
|
XCATCHALL
|
415
|
-
free(fi->name);
|
416
426
|
free(fi);
|
417
427
|
XENDTRY
|
418
428
|
fis_add_field(fis, fi);
|
@@ -423,13 +433,13 @@ FieldInfos *fis_read(InStream *is)
|
|
423
433
|
fis_deref(fis);
|
424
434
|
XENDTRY
|
425
435
|
|
426
|
-
return fis;
|
436
|
+
return fis;
|
427
437
|
}
|
428
438
|
|
429
439
|
void fis_write(FieldInfos *fis, OutStream *os)
|
430
440
|
{
|
431
441
|
int i;
|
432
|
-
union {
|
442
|
+
union { u32 i; float f; } tmp;
|
433
443
|
FieldInfo *fi;
|
434
444
|
const int fis_size = fis->size;
|
435
445
|
|
@@ -439,7 +449,7 @@ void fis_write(FieldInfos *fis, OutStream *os)
|
|
439
449
|
os_write_vint(os, fis->size);
|
440
450
|
for (i = 0; i < fis_size; i++) {
|
441
451
|
fi = fis->fields[i];
|
442
|
-
os_write_string(os, fi->name);
|
452
|
+
os_write_string(os, S(fi->name));
|
443
453
|
tmp.f = fi->boost;
|
444
454
|
os_write_u32(os, tmp.i);
|
445
455
|
os_write_vint(os, fi->bits);
|
@@ -497,27 +507,25 @@ char *fis_to_s(FieldInfos *fis)
|
|
497
507
|
FieldInfo *fi;
|
498
508
|
const int fis_size = fis->size;
|
499
509
|
|
500
|
-
sprintf(buf,
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
510
|
+
pos = sprintf(buf,
|
511
|
+
"default:\n"
|
512
|
+
" store: %s\n"
|
513
|
+
" index: %s\n"
|
514
|
+
" term_vector: %s\n"
|
515
|
+
"fields:\n",
|
516
|
+
store_str[fis->store],
|
517
|
+
index_str[fis->index],
|
518
|
+
term_vector_str[fis->term_vector]);
|
509
519
|
for (i = 0; i < fis_size; i++) {
|
510
520
|
fi = fis->fields[i];
|
511
|
-
sprintf(buf + pos,
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
pos += strlen(buf + pos);
|
521
|
+
pos += sprintf(buf + pos,
|
522
|
+
" %s:\n"
|
523
|
+
" boost: %f\n"
|
524
|
+
" store: %s\n"
|
525
|
+
" index: %s\n"
|
526
|
+
" term_vector: %s\n",
|
527
|
+
(char *)fi->name, fi->boost, fi_store_str(fi),
|
528
|
+
fi_index_str(fi), fi_term_vector_str(fi));
|
521
529
|
}
|
522
530
|
|
523
531
|
return buf;
|
@@ -565,7 +573,7 @@ SegmentInfo *si_new(char *name, int doc_cnt, Store *store)
|
|
565
573
|
return si;
|
566
574
|
}
|
567
575
|
|
568
|
-
SegmentInfo *si_read(Store *store, InStream *is)
|
576
|
+
static SegmentInfo *si_read(Store *store, InStream *is)
|
569
577
|
{
|
570
578
|
SegmentInfo *volatile si = ALLOC_AND_ZERO(SegmentInfo);
|
571
579
|
TRY
|
@@ -590,7 +598,7 @@ SegmentInfo *si_read(Store *store, InStream *is)
|
|
590
598
|
return si;
|
591
599
|
}
|
592
600
|
|
593
|
-
void si_write(SegmentInfo *si, OutStream *os)
|
601
|
+
static void si_write(SegmentInfo *si, OutStream *os)
|
594
602
|
{
|
595
603
|
os_write_string(os, si->name);
|
596
604
|
os_write_vint(os, si->doc_cnt);
|
@@ -619,7 +627,9 @@ bool si_has_deletions(SegmentInfo *si)
|
|
619
627
|
return si->del_gen >= 0;
|
620
628
|
}
|
621
629
|
|
622
|
-
|
630
|
+
/*
|
631
|
+
FIXME: not used
|
632
|
+
static char *si_del_file_name(SegmentInfo *si, char *buf)
|
623
633
|
{
|
624
634
|
if (si->del_gen < 0) {
|
625
635
|
return NULL;
|
@@ -628,6 +638,7 @@ char *si_del_file_name(SegmentInfo *si, char *buf)
|
|
628
638
|
return fn_for_generation(buf, si->name, ".del", si->del_gen);
|
629
639
|
}
|
630
640
|
}
|
641
|
+
*/
|
631
642
|
|
632
643
|
bool si_has_separate_norms(SegmentInfo *si)
|
633
644
|
{
|
@@ -653,7 +664,7 @@ void si_advance_norm_gen(SegmentInfo *si, int field_num)
|
|
653
664
|
si->norm_gens[field_num]++;
|
654
665
|
}
|
655
666
|
|
656
|
-
char *si_norm_file_name(SegmentInfo *si, char *buf, int field_num)
|
667
|
+
static char *si_norm_file_name(SegmentInfo *si, char *buf, int field_num)
|
657
668
|
{
|
658
669
|
int norm_gen;
|
659
670
|
if (field_num >= si->norm_gens_size
|
@@ -661,12 +672,12 @@ char *si_norm_file_name(SegmentInfo *si, char *buf, int field_num)
|
|
661
672
|
return NULL;
|
662
673
|
}
|
663
674
|
else {
|
664
|
-
char *ext = (si->use_compound_file && norm_gen > 0) ? "s" : "f";
|
675
|
+
const char *ext = (si->use_compound_file && norm_gen > 0) ? "s" : "f";
|
665
676
|
return fn_for_gen_field(buf, si->name, ext, norm_gen, field_num);
|
666
677
|
}
|
667
678
|
}
|
668
679
|
|
669
|
-
void deleter_queue_file(Deleter *dlr, char *file_name);
|
680
|
+
static void deleter_queue_file(Deleter *dlr, const char *file_name);
|
670
681
|
#define DEL(file_name) deleter_queue_file(dlr, file_name)
|
671
682
|
|
672
683
|
static void si_delete_files(SegmentInfo *si, FieldInfos *fis, Deleter *dlr)
|
@@ -708,11 +719,11 @@ static void si_delete_files(SegmentInfo *si, FieldInfos *fis, Deleter *dlr)
|
|
708
719
|
****************************************************************************/
|
709
720
|
|
710
721
|
#include <time.h>
|
711
|
-
static char *new_segment(
|
722
|
+
static char *new_segment(i64 generation)
|
712
723
|
{
|
713
724
|
char buf[SEGMENT_NAME_MAX_LENGTH];
|
714
725
|
char *fn_p = u64_to_str36(buf, SEGMENT_NAME_MAX_LENGTH - 1,
|
715
|
-
(
|
726
|
+
(u64)generation);
|
716
727
|
*(--fn_p) = '_';
|
717
728
|
return estrdup(fn_p);
|
718
729
|
}
|
@@ -722,18 +733,21 @@ static char *new_segment(f_i64 generation)
|
|
722
733
|
****************************************************************************/
|
723
734
|
|
724
735
|
typedef struct FindSegmentsFile {
|
725
|
-
|
726
|
-
|
727
|
-
|
736
|
+
i64 generation;
|
737
|
+
union {
|
738
|
+
SegmentInfos *sis;
|
739
|
+
IndexReader *ir;
|
740
|
+
u64 uint64;
|
741
|
+
} ret;
|
728
742
|
} FindSegmentsFile;
|
729
743
|
|
730
|
-
static void which_gen_i(char *file_name, void *arg)
|
744
|
+
static void which_gen_i(const char *file_name, void *arg)
|
731
745
|
{
|
732
|
-
|
746
|
+
i64 *max_generation = (i64 *)arg;
|
733
747
|
if (0 == strncmp(SEGMENTS_FILE_NAME"_", file_name,
|
734
748
|
sizeof(SEGMENTS_FILE_NAME))) {
|
735
749
|
char *p = strrchr(file_name, '_') + 1;
|
736
|
-
|
750
|
+
i64 generation = (i64)str36_to_u64(p);
|
737
751
|
if (generation > *max_generation) *max_generation = generation;
|
738
752
|
}
|
739
753
|
}
|
@@ -776,9 +790,9 @@ void sis_put(SegmentInfos *sis, FILE *stream)
|
|
776
790
|
*
|
777
791
|
* @param store - the Store to look in
|
778
792
|
*/
|
779
|
-
|
793
|
+
i64 sis_current_segment_generation(Store *store)
|
780
794
|
{
|
781
|
-
|
795
|
+
i64 current_generation = -1;
|
782
796
|
store->each(store, &which_gen_i, ¤t_generation);
|
783
797
|
return current_generation;
|
784
798
|
}
|
@@ -802,22 +816,25 @@ char *sis_curr_seg_file_name(char *buf, Store *store)
|
|
802
816
|
* @param store - the Store to look in
|
803
817
|
* @return segments_N where N is the +next+ generation
|
804
818
|
*/
|
805
|
-
|
819
|
+
/*
|
820
|
+
FIXME: not used
|
821
|
+
static char *sis_next_seg_file_name(char *buf, Store *store)
|
806
822
|
{
|
807
823
|
return segfn_for_generation(buf, sis_current_segment_generation(store) + 1);
|
808
824
|
}
|
825
|
+
*/
|
809
826
|
|
810
827
|
#define GEN_FILE_RETRY_COUNT 10
|
811
828
|
#define GEN_LOOK_AHEAD_COUNT 10
|
812
|
-
void sis_find_segments_file(Store *store, FindSegmentsFile *fsf,
|
829
|
+
static void sis_find_segments_file(Store *store, FindSegmentsFile *fsf,
|
813
830
|
void (*run)(Store *store, FindSegmentsFile *fsf))
|
814
831
|
{
|
815
|
-
int i;
|
816
|
-
int gen_look_ahead_count = 0;
|
817
|
-
bool retry = false;
|
818
|
-
int method = 0;
|
819
|
-
|
820
|
-
|
832
|
+
volatile int i;
|
833
|
+
volatile int gen_look_ahead_count = 0;
|
834
|
+
volatile bool retry = false;
|
835
|
+
volatile int method = 0;
|
836
|
+
volatile i64 last_gen = -1;
|
837
|
+
volatile i64 gen = 0;
|
821
838
|
|
822
839
|
/* Loop until we succeed in calling doBody() without hitting an
|
823
840
|
* IOException. An IOException most likely means a commit was in process
|
@@ -855,7 +872,7 @@ void sis_find_segments_file(Store *store, FindSegmentsFile *fsf,
|
|
855
872
|
XENDTRY
|
856
873
|
|
857
874
|
if (NULL != gen_is) {
|
858
|
-
|
875
|
+
i64 gen0 = -1, gen1 = -1;
|
859
876
|
|
860
877
|
TRY
|
861
878
|
gen0 = is_read_u64(gen_is);
|
@@ -897,8 +914,14 @@ void sis_find_segments_file(Store *store, FindSegmentsFile *fsf,
|
|
897
914
|
/* OK, we've tried the same segments_N file twice in a row, so
|
898
915
|
* this must be a real error. We throw the original exception
|
899
916
|
* we got. */
|
917
|
+
char *listing, listing_buffer[1024];
|
918
|
+
listing = store_to_s(store);
|
919
|
+
strncpy(listing_buffer, listing, 1023);
|
920
|
+
listing_buffer[1023] = '\0';
|
921
|
+
free(listing);
|
900
922
|
RAISE(IO_ERROR,
|
901
|
-
"Error reading the segment infos. Store
|
923
|
+
"Error reading the segment infos. Store:\n %s\n",
|
924
|
+
listing_buffer);
|
902
925
|
}
|
903
926
|
else {
|
904
927
|
micro_sleep(50000);
|
@@ -985,7 +1008,7 @@ SegmentInfos *sis_new(FieldInfos *fis)
|
|
985
1008
|
REF(fis);
|
986
1009
|
sis->fis = fis;
|
987
1010
|
sis->format = FORMAT;
|
988
|
-
sis->version = (
|
1011
|
+
sis->version = (u64)time(NULL);
|
989
1012
|
sis->size = 0;
|
990
1013
|
sis->counter = 0;
|
991
1014
|
sis->generation = -1;
|
@@ -1053,7 +1076,7 @@ void sis_clear(SegmentInfos *sis)
|
|
1053
1076
|
sis->size = 0;
|
1054
1077
|
}
|
1055
1078
|
|
1056
|
-
void sis_read_i(Store *store, FindSegmentsFile *fsf)
|
1079
|
+
static void sis_read_i(Store *store, FindSegmentsFile *fsf)
|
1057
1080
|
{
|
1058
1081
|
int seg_cnt;
|
1059
1082
|
int i;
|
@@ -1062,7 +1085,7 @@ void sis_read_i(Store *store, FindSegmentsFile *fsf)
|
|
1062
1085
|
InStream *volatile is = NULL;
|
1063
1086
|
SegmentInfos *volatile sis = ALLOC_AND_ZERO(SegmentInfos);
|
1064
1087
|
segfn_for_generation(seg_file_name, fsf->generation);
|
1065
|
-
fsf->
|
1088
|
+
fsf->ret.sis = NULL;
|
1066
1089
|
TRY
|
1067
1090
|
is = store->open_input(store, seg_file_name);
|
1068
1091
|
sis->store = store;
|
@@ -1090,20 +1113,20 @@ void sis_read_i(Store *store, FindSegmentsFile *fsf)
|
|
1090
1113
|
sis_destroy(sis);
|
1091
1114
|
}
|
1092
1115
|
XENDTRY
|
1093
|
-
fsf->
|
1116
|
+
fsf->ret.sis = sis;
|
1094
1117
|
}
|
1095
1118
|
|
1096
1119
|
SegmentInfos *sis_read(Store *store)
|
1097
1120
|
{
|
1098
1121
|
FindSegmentsFile fsf;
|
1099
1122
|
sis_find_segments_file(store, &fsf, &sis_read_i);
|
1100
|
-
return fsf.
|
1123
|
+
return fsf.ret.sis;
|
1101
1124
|
}
|
1102
1125
|
|
1103
1126
|
void sis_write(SegmentInfos *sis, Store *store, Deleter *deleter)
|
1104
1127
|
{
|
1105
1128
|
int i;
|
1106
|
-
OutStream *os = NULL;
|
1129
|
+
OutStream *volatile os = NULL;
|
1107
1130
|
const int sis_size = sis->size;
|
1108
1131
|
char buf[SEGMENT_NAME_MAX_LENGTH];
|
1109
1132
|
sis->generation++;
|
@@ -1114,7 +1137,7 @@ void sis_write(SegmentInfos *sis, Store *store, Deleter *deleter)
|
|
1114
1137
|
os_write_u32(os, FORMAT);
|
1115
1138
|
os_write_u64(os, ++(sis->version)); /* every write changes the index */
|
1116
1139
|
os_write_u64(os, sis->counter);
|
1117
|
-
os_write_vint(os, sis->size);
|
1140
|
+
os_write_vint(os, sis->size);
|
1118
1141
|
for (i = 0; i < sis_size; i++) {
|
1119
1142
|
si_write(sis->segs[i], os);
|
1120
1143
|
}
|
@@ -1140,14 +1163,14 @@ void sis_write(SegmentInfos *sis, Store *store, Deleter *deleter)
|
|
1140
1163
|
}
|
1141
1164
|
}
|
1142
1165
|
|
1143
|
-
void sis_read_ver_i(Store *store, FindSegmentsFile *fsf)
|
1166
|
+
static void sis_read_ver_i(Store *store, FindSegmentsFile *fsf)
|
1144
1167
|
{
|
1145
1168
|
InStream *is;
|
1146
|
-
|
1147
|
-
|
1169
|
+
u32 format = 0;
|
1170
|
+
u64 version = 0;
|
1148
1171
|
char seg_file_name[SEGMENT_NAME_MAX_LENGTH];
|
1149
1172
|
|
1150
|
-
segfn_for_generation(seg_file_name, (
|
1173
|
+
segfn_for_generation(seg_file_name, (u64)fsf->generation);
|
1151
1174
|
is = store->open_input(store, seg_file_name);
|
1152
1175
|
|
1153
1176
|
TRY
|
@@ -1157,14 +1180,14 @@ void sis_read_ver_i(Store *store, FindSegmentsFile *fsf)
|
|
1157
1180
|
is_close(is);
|
1158
1181
|
XENDTRY
|
1159
1182
|
|
1160
|
-
fsf->
|
1183
|
+
fsf->ret.uint64 = version;
|
1161
1184
|
}
|
1162
1185
|
|
1163
|
-
|
1186
|
+
u64 sis_read_current_version(Store *store)
|
1164
1187
|
{
|
1165
1188
|
FindSegmentsFile fsf;
|
1166
1189
|
sis_find_segments_file(store, &fsf, &sis_read_ver_i);
|
1167
|
-
return fsf.
|
1190
|
+
return fsf.ret.uint64;
|
1168
1191
|
}
|
1169
1192
|
|
1170
1193
|
/****************************************************************************
|
@@ -1173,12 +1196,14 @@ f_u64 sis_read_current_version(Store *store)
|
|
1173
1196
|
*
|
1174
1197
|
****************************************************************************/
|
1175
1198
|
|
1176
|
-
static LazyDocField *lazy_df_new(
|
1199
|
+
static LazyDocField *lazy_df_new(Symbol name, const int size,
|
1200
|
+
bool is_compressed)
|
1177
1201
|
{
|
1178
1202
|
LazyDocField *self = ALLOC(LazyDocField);
|
1179
|
-
self->name =
|
1203
|
+
self->name = name;
|
1180
1204
|
self->size = size;
|
1181
1205
|
self->data = ALLOC_AND_ZERO_N(LazyDocFieldData, size);
|
1206
|
+
self->is_compressed = is_compressed;
|
1182
1207
|
return self;
|
1183
1208
|
}
|
1184
1209
|
|
@@ -1190,11 +1215,171 @@ static void lazy_df_destroy(LazyDocField *self)
|
|
1190
1215
|
free(self->data[i].text);
|
1191
1216
|
}
|
1192
1217
|
}
|
1193
|
-
free(self->name);
|
1194
1218
|
free(self->data);
|
1195
1219
|
free(self);
|
1196
1220
|
}
|
1197
1221
|
|
1222
|
+
#ifdef USE_ZLIB
|
1223
|
+
/* good zlib example at http://www.zlib.net/zlib_how.html */
|
1224
|
+
|
1225
|
+
/* report a zlib or i/o error */
|
1226
|
+
static void zraise(int ret)
|
1227
|
+
{
|
1228
|
+
switch (ret) {
|
1229
|
+
case Z_ERRNO:
|
1230
|
+
if (ferror(stdin))
|
1231
|
+
RAISE(IO_ERROR, "zlib: error reading stdin");
|
1232
|
+
if (ferror(stdout))
|
1233
|
+
RAISE(IO_ERROR, "zlib: error writing stdout");
|
1234
|
+
break;
|
1235
|
+
case Z_STREAM_ERROR:
|
1236
|
+
RAISE(IO_ERROR, "zlib: invalid compression level");
|
1237
|
+
break;
|
1238
|
+
case Z_DATA_ERROR:
|
1239
|
+
RAISE(IO_ERROR, "zlib: invalid or incomplete deflate data");
|
1240
|
+
break;
|
1241
|
+
case Z_MEM_ERROR:
|
1242
|
+
RAISE(IO_ERROR, "zlib: out of memory");
|
1243
|
+
break;
|
1244
|
+
case Z_VERSION_ERROR:
|
1245
|
+
RAISE(IO_ERROR, "zlib: version mismatch!");
|
1246
|
+
break;
|
1247
|
+
default:
|
1248
|
+
RAISE(EXCEPTION, "zlib: unknown error");
|
1249
|
+
}
|
1250
|
+
}
|
1251
|
+
|
1252
|
+
static char *is_read_zipped_bytes(InStream *is, int zip_len, int *len)
|
1253
|
+
{
|
1254
|
+
int buf_out_idx = 0, ret, read_len;
|
1255
|
+
uchar *buf_out = NULL;
|
1256
|
+
uchar buf_in[ZIP_BUFFER_SIZE];
|
1257
|
+
z_stream zstrm;
|
1258
|
+
zstrm.zalloc = Z_NULL;
|
1259
|
+
zstrm.zfree = Z_NULL;
|
1260
|
+
zstrm.opaque = Z_NULL;
|
1261
|
+
zstrm.next_in = Z_NULL;
|
1262
|
+
zstrm.avail_in = 0;
|
1263
|
+
if ((ret = inflateInit(&zstrm)) != Z_OK) zraise(ret);
|
1264
|
+
|
1265
|
+
do {
|
1266
|
+
read_len = zip_len > ZIP_BUFFER_SIZE ? ZIP_BUFFER_SIZE : zip_len;
|
1267
|
+
is_read_bytes(is, buf_in, zip_len);
|
1268
|
+
zip_len -= read_len;
|
1269
|
+
zstrm.avail_in = read_len;
|
1270
|
+
zstrm.next_in = buf_in;
|
1271
|
+
zstrm.avail_out = ZIP_BUFFER_SIZE;
|
1272
|
+
|
1273
|
+
do {
|
1274
|
+
REALLOC_N(buf_out, uchar, buf_out_idx + ZIP_BUFFER_SIZE);
|
1275
|
+
zstrm.next_out = buf_out + buf_out_idx;
|
1276
|
+
ret = inflate(&zstrm, Z_NO_FLUSH);
|
1277
|
+
assert(ret != Z_STREAM_ERROR); /* state not clobbered */
|
1278
|
+
switch(ret) {
|
1279
|
+
case Z_NEED_DICT:
|
1280
|
+
ret = Z_DATA_ERROR; /* and fall through */
|
1281
|
+
case Z_DATA_ERROR:
|
1282
|
+
case Z_MEM_ERROR:
|
1283
|
+
(void)inflateEnd(&zstrm);
|
1284
|
+
zraise(ret);
|
1285
|
+
}
|
1286
|
+
buf_out_idx += ZIP_BUFFER_SIZE - zstrm.avail_out;
|
1287
|
+
} while (zstrm.avail_out == 0);
|
1288
|
+
} while (ret != Z_STREAM_END && zip_len != 0);
|
1289
|
+
|
1290
|
+
/* clean up */
|
1291
|
+
(void)inflateEnd(&zstrm);
|
1292
|
+
|
1293
|
+
buf_out[buf_out_idx] = '\0';
|
1294
|
+
REALLOC_N(buf_out, uchar, buf_out_idx + 1);
|
1295
|
+
*len = buf_out_idx;
|
1296
|
+
return (char *)buf_out;
|
1297
|
+
}
|
1298
|
+
#else /* use bzlib */
|
1299
|
+
static void zraise(int ret)
|
1300
|
+
{
|
1301
|
+
switch (ret) {
|
1302
|
+
case BZ_IO_ERROR:
|
1303
|
+
if (ferror(stdin))
|
1304
|
+
RAISE(IO_ERROR, "bzlib: error reading stdin");
|
1305
|
+
if (ferror(stdout))
|
1306
|
+
RAISE(IO_ERROR, "bzlib: error writing stdout");
|
1307
|
+
break;
|
1308
|
+
case BZ_CONFIG_ERROR:
|
1309
|
+
RAISE(IO_ERROR, "bzlib: system configuration error");
|
1310
|
+
break;
|
1311
|
+
case BZ_SEQUENCE_ERROR: /* shouldn't occur if code is correct */
|
1312
|
+
RAISE(IO_ERROR, "bzlib: !!BUG!! sequence error");
|
1313
|
+
break;
|
1314
|
+
case BZ_PARAM_ERROR: /* shouldn't occur if code is correct */
|
1315
|
+
RAISE(IO_ERROR, "bzlib: !!BUG!! parameter error");
|
1316
|
+
break;
|
1317
|
+
case BZ_MEM_ERROR:
|
1318
|
+
RAISE(IO_ERROR, "bzlib: memory error");
|
1319
|
+
break;
|
1320
|
+
case BZ_DATA_ERROR:
|
1321
|
+
RAISE(IO_ERROR, "bzlib: data integrity check error");
|
1322
|
+
break;
|
1323
|
+
case BZ_DATA_ERROR_MAGIC:
|
1324
|
+
RAISE(IO_ERROR, "bzlib: data integrity check - non-matching magic");
|
1325
|
+
break;
|
1326
|
+
case BZ_UNEXPECTED_EOF:
|
1327
|
+
RAISE(IO_ERROR, "bzlib: unexpected end-of-file");
|
1328
|
+
break;
|
1329
|
+
case BZ_OUTBUFF_FULL:
|
1330
|
+
RAISE(IO_ERROR, "bzlib: output buffer full");
|
1331
|
+
break;
|
1332
|
+
default:
|
1333
|
+
RAISE(EXCEPTION, "bzlib: unknown error");
|
1334
|
+
}
|
1335
|
+
}
|
1336
|
+
|
1337
|
+
static char *is_read_zipped_bytes(InStream *is, int zip_len, int *len)
|
1338
|
+
{
|
1339
|
+
int buf_out_idx = 0, ret, read_len;
|
1340
|
+
char *buf_out = NULL;
|
1341
|
+
char buf_in[ZIP_BUFFER_SIZE];
|
1342
|
+
bz_stream zstrm;
|
1343
|
+
zstrm.bzalloc = NULL;
|
1344
|
+
zstrm.bzfree = NULL;
|
1345
|
+
zstrm.opaque = NULL;
|
1346
|
+
zstrm.next_in = NULL;
|
1347
|
+
zstrm.avail_in = 0;
|
1348
|
+
if ((ret = BZ2_bzDecompressInit(&zstrm, 0, 0)) != BZ_OK) zraise(ret);
|
1349
|
+
|
1350
|
+
do {
|
1351
|
+
read_len = zip_len > ZIP_BUFFER_SIZE ? ZIP_BUFFER_SIZE : zip_len;
|
1352
|
+
is_read_bytes(is, (uchar *)buf_in, zip_len);
|
1353
|
+
zip_len -= read_len;
|
1354
|
+
zstrm.avail_in = read_len;
|
1355
|
+
zstrm.next_in = buf_in;
|
1356
|
+
zstrm.avail_out = ZIP_BUFFER_SIZE;
|
1357
|
+
|
1358
|
+
do {
|
1359
|
+
REALLOC_N(buf_out, char, buf_out_idx + ZIP_BUFFER_SIZE);
|
1360
|
+
zstrm.next_out = buf_out + buf_out_idx;
|
1361
|
+
ret = BZ2_bzDecompress(&zstrm);
|
1362
|
+
assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
|
1363
|
+
if (ret != BZ_OK && ret != BZ_STREAM_END) {
|
1364
|
+
(void)BZ2_bzDecompressEnd(&zstrm);
|
1365
|
+
zraise(ret);
|
1366
|
+
}
|
1367
|
+
buf_out_idx += ZIP_BUFFER_SIZE - zstrm.avail_out;
|
1368
|
+
} while (zstrm.avail_out == 0);
|
1369
|
+
} while (ret != BZ_STREAM_END && zip_len != 0);
|
1370
|
+
|
1371
|
+
/* clean up */
|
1372
|
+
(void)BZ2_bzDecompressEnd(&zstrm);
|
1373
|
+
|
1374
|
+
buf_out[buf_out_idx] = '\0';
|
1375
|
+
REALLOC_N(buf_out, char, buf_out_idx + 1);
|
1376
|
+
*len = buf_out_idx;
|
1377
|
+
return (char *)buf_out;
|
1378
|
+
}
|
1379
|
+
|
1380
|
+
#endif
|
1381
|
+
|
1382
|
+
|
1198
1383
|
char *lazy_df_get_data(LazyDocField *self, int i)
|
1199
1384
|
{
|
1200
1385
|
char *text = NULL;
|
@@ -1202,10 +1387,17 @@ char *lazy_df_get_data(LazyDocField *self, int i)
|
|
1202
1387
|
text = self->data[i].text;
|
1203
1388
|
if (NULL == text) {
|
1204
1389
|
const int read_len = self->data[i].length + 1;
|
1205
|
-
self->data[i].text = text = ALLOC_N(char, read_len);
|
1206
1390
|
is_seek(self->doc->fields_in, self->data[i].start);
|
1207
|
-
|
1208
|
-
|
1391
|
+
if (self->is_compressed) {
|
1392
|
+
text = self->data[i].text =
|
1393
|
+
is_read_zipped_bytes(self->doc->fields_in, read_len,
|
1394
|
+
&(self->data[i].length));
|
1395
|
+
}
|
1396
|
+
else {
|
1397
|
+
self->data[i].text = text = ALLOC_N(char, read_len);
|
1398
|
+
is_read_bytes(self->doc->fields_in, (uchar *)text, read_len);
|
1399
|
+
text[read_len - 1] = '\0';
|
1400
|
+
}
|
1209
1401
|
}
|
1210
1402
|
}
|
1211
1403
|
|
@@ -1214,6 +1406,16 @@ char *lazy_df_get_data(LazyDocField *self, int i)
|
|
1214
1406
|
|
1215
1407
|
void lazy_df_get_bytes(LazyDocField *self, char *buf, int start, int len)
|
1216
1408
|
{
|
1409
|
+
if (self->is_compressed == 1) {
|
1410
|
+
int i;
|
1411
|
+
self->len = 0;
|
1412
|
+
for (i = self->size-1; i >= 0; i--) {
|
1413
|
+
(void)lazy_df_get_data(self, i);
|
1414
|
+
self->len += self->data[i].length + 1;
|
1415
|
+
}
|
1416
|
+
self->len--; /* each field separated by ' ' but no need to add to end */
|
1417
|
+
self->is_compressed = 2;
|
1418
|
+
}
|
1217
1419
|
if (start < 0 || start >= self->len) {
|
1218
1420
|
RAISE(IO_ERROR, "start out of range in LazyDocField#get_bytes. %d "
|
1219
1421
|
"is not between 0 and %d", start, self->len);
|
@@ -1225,8 +1427,37 @@ void lazy_df_get_bytes(LazyDocField *self, char *buf, int start, int len)
|
|
1225
1427
|
RAISE(IO_ERROR, "Tried to read past end of field. Field is only %d "
|
1226
1428
|
"bytes long but tried to read to %d", self->len, start + len);
|
1227
1429
|
}
|
1228
|
-
|
1229
|
-
|
1430
|
+
if (self->is_compressed) {
|
1431
|
+
int cur_start = 0, buf_start = 0, cur_end, i, copy_start, copy_len;
|
1432
|
+
for (i = 0; i < self->size; i++) {
|
1433
|
+
cur_end = cur_start + self->data[i].length;
|
1434
|
+
if (start < cur_end) {
|
1435
|
+
copy_start = start > cur_start ? start - cur_start : 0;
|
1436
|
+
copy_len = cur_end - cur_start - copy_start;
|
1437
|
+
if (copy_len >= len) {
|
1438
|
+
copy_len = len;
|
1439
|
+
len = 0;
|
1440
|
+
}
|
1441
|
+
else {
|
1442
|
+
len -= copy_len;
|
1443
|
+
}
|
1444
|
+
memcpy(buf + buf_start,
|
1445
|
+
self->data[i].text + copy_start,
|
1446
|
+
copy_len);
|
1447
|
+
buf_start += copy_len;
|
1448
|
+
if (len > 0) {
|
1449
|
+
buf[buf_start++] = ' ';
|
1450
|
+
len--;
|
1451
|
+
}
|
1452
|
+
if (len == 0) break;
|
1453
|
+
}
|
1454
|
+
cur_start = cur_end + 1;
|
1455
|
+
}
|
1456
|
+
}
|
1457
|
+
else {
|
1458
|
+
is_seek(self->doc->fields_in, self->data[0].start + start);
|
1459
|
+
is_read_bytes(self->doc->fields_in, (uchar *)buf, len);
|
1460
|
+
}
|
1230
1461
|
}
|
1231
1462
|
|
1232
1463
|
/****************************************************************************
|
@@ -1238,7 +1469,7 @@ void lazy_df_get_bytes(LazyDocField *self, char *buf, int start, int len)
|
|
1238
1469
|
static LazyDoc *lazy_doc_new(int size, InStream *fdt_in)
|
1239
1470
|
{
|
1240
1471
|
LazyDoc *self = ALLOC(LazyDoc);
|
1241
|
-
self->
|
1472
|
+
self->field_dictionary = h_new_ptr((free_ft)&lazy_df_destroy);
|
1242
1473
|
self->size = size;
|
1243
1474
|
self->fields = ALLOC_AND_ZERO_N(LazyDocField *, size);
|
1244
1475
|
self->fields_in = is_clone(fdt_in);
|
@@ -1247,7 +1478,7 @@ static LazyDoc *lazy_doc_new(int size, InStream *fdt_in)
|
|
1247
1478
|
|
1248
1479
|
void lazy_doc_close(LazyDoc *self)
|
1249
1480
|
{
|
1250
|
-
h_destroy(self->
|
1481
|
+
h_destroy(self->field_dictionary);
|
1251
1482
|
is_close(self->fields_in);
|
1252
1483
|
free(self->fields);
|
1253
1484
|
free(self);
|
@@ -1256,10 +1487,15 @@ void lazy_doc_close(LazyDoc *self)
|
|
1256
1487
|
static void lazy_doc_add_field(LazyDoc *self, LazyDocField *lazy_df, int i)
|
1257
1488
|
{
|
1258
1489
|
self->fields[i] = lazy_df;
|
1259
|
-
h_set(self->
|
1490
|
+
h_set(self->field_dictionary, lazy_df->name, lazy_df);
|
1260
1491
|
lazy_df->doc = self;
|
1261
1492
|
}
|
1262
1493
|
|
1494
|
+
LazyDocField *frt_lazy_doc_get(LazyDoc *self, Symbol field)
|
1495
|
+
{
|
1496
|
+
return (LazyDocField *)h_get(self->field_dictionary, field);
|
1497
|
+
}
|
1498
|
+
|
1263
1499
|
/****************************************************************************
|
1264
1500
|
*
|
1265
1501
|
* FieldsReader
|
@@ -1296,7 +1532,7 @@ FieldsReader *fr_clone(FieldsReader *orig)
|
|
1296
1532
|
memcpy(fr, orig, sizeof(FieldsReader));
|
1297
1533
|
fr->fdx_in = is_clone(orig->fdx_in);
|
1298
1534
|
fr->fdt_in = is_clone(orig->fdt_in);
|
1299
|
-
|
1535
|
+
|
1300
1536
|
return fr;
|
1301
1537
|
}
|
1302
1538
|
|
@@ -1307,25 +1543,36 @@ void fr_close(FieldsReader *fr)
|
|
1307
1543
|
free(fr);
|
1308
1544
|
}
|
1309
1545
|
|
1310
|
-
static DocField *fr_df_new(
|
1546
|
+
static DocField *fr_df_new(Symbol name, int size, bool is_compressed)
|
1311
1547
|
{
|
1312
1548
|
DocField *df = ALLOC(DocField);
|
1313
|
-
df->name =
|
1549
|
+
df->name = name;
|
1314
1550
|
df->capa = df->size = size;
|
1315
1551
|
df->data = ALLOC_N(char *, df->capa);
|
1316
1552
|
df->lengths = ALLOC_N(int, df->capa);
|
1317
1553
|
df->destroy_data = true;
|
1318
1554
|
df->boost = 1.0;
|
1555
|
+
df->is_compressed = is_compressed;
|
1319
1556
|
return df;
|
1320
1557
|
}
|
1321
1558
|
|
1559
|
+
static void fr_read_zipped_fields(FieldsReader *fr, DocField *df)
|
1560
|
+
{
|
1561
|
+
int i;
|
1562
|
+
const int df_size = df->size;
|
1563
|
+
InStream *fdt_in = fr->fdt_in;
|
1564
|
+
|
1565
|
+
for (i = 0; i < df_size; i++) {
|
1566
|
+
const int zip_len = df->lengths[i] + 1;
|
1567
|
+
df->data[i] = is_read_zipped_bytes(fdt_in, zip_len, &(df->lengths[i]));
|
1568
|
+
}
|
1569
|
+
}
|
1570
|
+
|
1322
1571
|
Document *fr_get_doc(FieldsReader *fr, int doc_num)
|
1323
1572
|
{
|
1324
1573
|
int i, j;
|
1325
|
-
FieldInfo *fi;
|
1326
1574
|
off_t pos;
|
1327
|
-
int stored_cnt
|
1328
|
-
DocField *df;
|
1575
|
+
int stored_cnt;
|
1329
1576
|
Document *doc = doc_new();
|
1330
1577
|
InStream *fdx_in = fr->fdx_in;
|
1331
1578
|
InStream *fdt_in = fr->fdt_in;
|
@@ -1336,34 +1583,42 @@ Document *fr_get_doc(FieldsReader *fr, int doc_num)
|
|
1336
1583
|
stored_cnt = is_read_vint(fdt_in);
|
1337
1584
|
|
1338
1585
|
for (i = 0; i < stored_cnt; i++) {
|
1339
|
-
field_num = is_read_vint(fdt_in);
|
1340
|
-
fi = fr->fis->fields[field_num];
|
1341
|
-
df_size = is_read_vint(fdt_in);
|
1342
|
-
df = fr_df_new(fi->name, df_size);
|
1586
|
+
const int field_num = is_read_vint(fdt_in);
|
1587
|
+
FieldInfo *fi = fr->fis->fields[field_num];
|
1588
|
+
const int df_size = is_read_vint(fdt_in);
|
1589
|
+
DocField *df = fr_df_new(fi->name, df_size, fi_is_compressed(fi));
|
1343
1590
|
|
1344
1591
|
for (j = 0; j < df_size; j++) {
|
1345
1592
|
df->lengths[j] = is_read_vint(fdt_in);
|
1346
1593
|
}
|
1347
1594
|
|
1348
|
-
for (j = 0; j < df_size; j++) {
|
1349
|
-
const int read_len = df->lengths[j] + 1;
|
1350
|
-
df->data[j] = ALLOC_N(char, read_len);
|
1351
|
-
is_read_bytes(fdt_in, (uchar *)df->data[j], read_len);
|
1352
|
-
df->data[j][read_len - 1] = '\0';
|
1353
|
-
}
|
1354
1595
|
doc_add_field(doc, df);
|
1355
1596
|
}
|
1597
|
+
for (i = 0; i < stored_cnt; i++) {
|
1598
|
+
DocField *df = doc->fields[i];
|
1599
|
+
if (df->is_compressed) {
|
1600
|
+
fr_read_zipped_fields(fr, df);
|
1601
|
+
}
|
1602
|
+
else {
|
1603
|
+
const int df_size = df->size;
|
1604
|
+
for (j = 0; j < df_size; j++) {
|
1605
|
+
const int read_len = df->lengths[j] + 1;
|
1606
|
+
df->data[j] = ALLOC_N(char, read_len);
|
1607
|
+
is_read_bytes(fdt_in, (uchar *)df->data[j], read_len);
|
1608
|
+
df->data[j][read_len - 1] = '\0';
|
1609
|
+
}
|
1610
|
+
}
|
1611
|
+
}
|
1356
1612
|
|
1357
1613
|
return doc;
|
1358
1614
|
}
|
1359
1615
|
|
1360
1616
|
LazyDoc *fr_get_lazy_doc(FieldsReader *fr, int doc_num)
|
1361
1617
|
{
|
1618
|
+
int start = 0;
|
1362
1619
|
int i, j;
|
1363
|
-
FieldInfo *fi;
|
1364
1620
|
off_t pos;
|
1365
|
-
int stored_cnt
|
1366
|
-
LazyDocField *lazy_df;
|
1621
|
+
int stored_cnt;
|
1367
1622
|
LazyDoc *lazy_doc;
|
1368
1623
|
InStream *fdx_in = fr->fdx_in;
|
1369
1624
|
InStream *fdt_in = fr->fdt_in;
|
@@ -1375,43 +1630,43 @@ LazyDoc *fr_get_lazy_doc(FieldsReader *fr, int doc_num)
|
|
1375
1630
|
lazy_doc = lazy_doc_new(stored_cnt, fdt_in);
|
1376
1631
|
|
1377
1632
|
for (i = 0; i < stored_cnt; i++) {
|
1378
|
-
|
1379
|
-
int data_cnt;
|
1380
|
-
|
1381
|
-
|
1382
|
-
|
1383
|
-
lazy_df = lazy_df_new(fi->name, data_cnt);
|
1633
|
+
FieldInfo *fi = fr->fis->fields[is_read_vint(fdt_in)];
|
1634
|
+
const int data_cnt = is_read_vint(fdt_in);
|
1635
|
+
LazyDocField *lazy_df = lazy_df_new(fi->name, data_cnt,
|
1636
|
+
fi_is_compressed(fi));
|
1637
|
+
const int field_start = start;
|
1384
1638
|
|
1385
1639
|
/* get the starts relative positions this time around */
|
1386
1640
|
for (j = 0; j < data_cnt; j++) {
|
1387
1641
|
lazy_df->data[j].start = start;
|
1388
1642
|
start += 1 + (lazy_df->data[j].length = is_read_vint(fdt_in));
|
1389
1643
|
}
|
1390
|
-
|
1391
|
-
lazy_df->len = start - 1;
|
1644
|
+
lazy_df->len = start - field_start - 1;
|
1392
1645
|
|
1393
|
-
|
1394
|
-
|
1646
|
+
lazy_doc_add_field(lazy_doc, lazy_df, i);
|
1647
|
+
}
|
1648
|
+
/* correct the starts to their correct absolute positions */
|
1649
|
+
for (i = 0; i < stored_cnt; i++) {
|
1650
|
+
LazyDocField *lazy_df = lazy_doc->fields[i];
|
1651
|
+
const int data_cnt = lazy_df->size;
|
1652
|
+
const int start = is_pos(fdt_in);
|
1395
1653
|
for (j = 0; j < data_cnt; j++) {
|
1396
1654
|
lazy_df->data[j].start += start;
|
1397
1655
|
}
|
1398
|
-
|
1399
|
-
lazy_doc_add_field(lazy_doc, lazy_df, i);
|
1400
|
-
is_seek(fdt_in, end);
|
1401
1656
|
}
|
1402
1657
|
|
1403
1658
|
return lazy_doc;
|
1404
1659
|
}
|
1405
1660
|
|
1406
|
-
TermVector *fr_read_term_vector(FieldsReader *fr, int field_num)
|
1661
|
+
static TermVector *fr_read_term_vector(FieldsReader *fr, int field_num)
|
1407
1662
|
{
|
1408
1663
|
TermVector *tv = ALLOC_AND_ZERO(TermVector);
|
1409
1664
|
InStream *fdt_in = fr->fdt_in;
|
1410
1665
|
FieldInfo *fi = fr->fis->fields[field_num];
|
1411
1666
|
const int num_terms = is_read_vint(fdt_in);
|
1412
|
-
|
1667
|
+
|
1413
1668
|
tv->field_num = field_num;
|
1414
|
-
tv->field =
|
1669
|
+
tv->field = fi->name;
|
1415
1670
|
|
1416
1671
|
if (num_terms > 0) {
|
1417
1672
|
int i, j, delta_start, delta_len, total_len, freq;
|
@@ -1431,7 +1686,8 @@ TermVector *fr_read_term_vector(FieldsReader *fr, int field_num)
|
|
1431
1686
|
total_len = delta_start + delta_len;
|
1432
1687
|
is_read_bytes(fdt_in, buffer + delta_start, delta_len);
|
1433
1688
|
buffer[total_len++] = '\0';
|
1434
|
-
term->text = memcpy(ALLOC_N(char, total_len),
|
1689
|
+
term->text = (char *)memcpy(ALLOC_N(char, total_len),
|
1690
|
+
buffer, total_len);
|
1435
1691
|
|
1436
1692
|
/* read freq */
|
1437
1693
|
freq = term->freq = is_read_vint(fdt_in);
|
@@ -1450,21 +1706,21 @@ TermVector *fr_read_term_vector(FieldsReader *fr, int field_num)
|
|
1450
1706
|
if (store_offsets) {
|
1451
1707
|
int num_positions = tv->offset_cnt = is_read_vint(fdt_in);
|
1452
1708
|
Offset *offsets = tv->offsets = ALLOC_N(Offset, num_positions);
|
1453
|
-
|
1709
|
+
i64 offset = 0;
|
1454
1710
|
for (i = 0; i < num_positions; i++) {
|
1455
1711
|
offsets[i].start =
|
1456
|
-
(off_t)(offset += (
|
1712
|
+
(off_t)(offset += (i64)is_read_vll(fdt_in));
|
1457
1713
|
offsets[i].end =
|
1458
|
-
(off_t)(offset += (
|
1714
|
+
(off_t)(offset += (i64)is_read_vll(fdt_in));
|
1459
1715
|
}
|
1460
1716
|
}
|
1461
1717
|
}
|
1462
1718
|
return tv;
|
1463
1719
|
}
|
1464
1720
|
|
1465
|
-
|
1721
|
+
Hash *fr_get_tv(FieldsReader *fr, int doc_num)
|
1466
1722
|
{
|
1467
|
-
|
1723
|
+
Hash *term_vectors = h_new_ptr((free_ft)&tv_destroy);
|
1468
1724
|
int i;
|
1469
1725
|
InStream *fdx_in = fr->fdx_in;
|
1470
1726
|
InStream *fdt_in = fr->fdt_in;
|
@@ -1556,6 +1812,8 @@ FieldsWriter *fw_open(Store *store, const char *segment, FieldInfos *fis)
|
|
1556
1812
|
strcpy(file_name + segment_len, ".fdx");
|
1557
1813
|
fw->fdx_out = store->new_output(store, file_name);
|
1558
1814
|
|
1815
|
+
fw->buffer = ram_new_buffer();
|
1816
|
+
|
1559
1817
|
fw->fis = fis;
|
1560
1818
|
fw->tv_fields = ary_new_type_capa(TVField, TV_FIELD_INIT_CAPA);
|
1561
1819
|
|
@@ -1566,15 +1824,68 @@ void fw_close(FieldsWriter *fw)
|
|
1566
1824
|
{
|
1567
1825
|
os_close(fw->fdt_out);
|
1568
1826
|
os_close(fw->fdx_out);
|
1827
|
+
ram_destroy_buffer(fw->buffer);
|
1569
1828
|
ary_free(fw->tv_fields);
|
1570
1829
|
free(fw);
|
1571
1830
|
}
|
1572
1831
|
|
1573
|
-
|
1832
|
+
#ifdef USE_ZLIB
|
1833
|
+
static int os_write_zipped_bytes(OutStream* out_stream, uchar *data, int length)
|
1574
1834
|
{
|
1575
|
-
|
1576
|
-
|
1835
|
+
int ret, buf_size, zip_len = 0;
|
1836
|
+
uchar out_buffer[ZIP_BUFFER_SIZE];
|
1837
|
+
z_stream zstrm;
|
1838
|
+
zstrm.zalloc = Z_NULL;
|
1839
|
+
zstrm.zfree = Z_NULL;
|
1840
|
+
zstrm.opaque = Z_NULL;
|
1841
|
+
if ((ret = deflateInit(&zstrm, ZIP_LEVEL)) != Z_OK) zraise(ret);
|
1842
|
+
|
1843
|
+
zstrm.avail_in = length;
|
1844
|
+
zstrm.next_in = data;
|
1845
|
+
zstrm.avail_out = ZIP_BUFFER_SIZE;
|
1846
|
+
zstrm.next_out = out_buffer;
|
1847
|
+
|
1848
|
+
do {
|
1849
|
+
ret = deflate(&zstrm, Z_FINISH); /* no bad return value */
|
1850
|
+
assert(ret != Z_STREAM_ERROR) ; /* state not clobbered */
|
1851
|
+
zip_len += buf_size = ZIP_BUFFER_SIZE - zstrm.avail_out;
|
1852
|
+
os_write_bytes(out_stream, out_buffer, buf_size);
|
1853
|
+
} while (zstrm.avail_out == 0);
|
1854
|
+
assert(zstrm.avail_in == 0); /* all input will be used */
|
1855
|
+
|
1856
|
+
/* clean up */
|
1857
|
+
(void)deflateEnd(&zstrm);
|
1858
|
+
return zip_len;
|
1859
|
+
}
|
1860
|
+
#else
|
1861
|
+
static int os_write_zipped_bytes(OutStream* out_stream, uchar *data, int length)
|
1862
|
+
{
|
1863
|
+
int ret, buf_size, zip_len = 0;
|
1864
|
+
char out_buffer[ZIP_BUFFER_SIZE];
|
1865
|
+
bz_stream zstrm;
|
1866
|
+
zstrm.bzalloc = NULL;
|
1867
|
+
zstrm.bzfree = NULL;
|
1868
|
+
zstrm.opaque = NULL;
|
1869
|
+
if ((ret = BZ2_bzCompressInit(&zstrm, ZIP_LEVEL, 0, 0)) != BZ_OK) zraise(ret);
|
1870
|
+
|
1871
|
+
zstrm.avail_in = length;
|
1872
|
+
zstrm.next_in = (char *)data;
|
1873
|
+
zstrm.avail_out = ZIP_BUFFER_SIZE;
|
1874
|
+
zstrm.next_out = out_buffer;
|
1875
|
+
|
1876
|
+
do {
|
1877
|
+
ret = BZ2_bzCompress(&zstrm, BZ_FINISH); /* no bad return value */
|
1878
|
+
assert(ret != BZ_SEQUENCE_ERROR); /* state not clobbered */
|
1879
|
+
zip_len += buf_size = ZIP_BUFFER_SIZE - zstrm.avail_out;
|
1880
|
+
os_write_bytes(out_stream, (uchar *)out_buffer, buf_size);
|
1881
|
+
} while (zstrm.avail_out == 0);
|
1882
|
+
assert(zstrm.avail_in == 0); /* all input will be used */
|
1883
|
+
|
1884
|
+
/* clean up */
|
1885
|
+
(void)BZ2_bzCompressEnd(&zstrm);
|
1886
|
+
return zip_len;
|
1577
1887
|
}
|
1888
|
+
#endif
|
1578
1889
|
|
1579
1890
|
void fw_add_doc(FieldsWriter *fw, Document *doc)
|
1580
1891
|
{
|
@@ -1595,6 +1906,7 @@ void fw_add_doc(FieldsWriter *fw, Document *doc)
|
|
1595
1906
|
ary_size(fw->tv_fields) = 0;
|
1596
1907
|
os_write_u64(fdx_out, fw->start_ptr);
|
1597
1908
|
os_write_vint(fdt_out, stored_cnt);
|
1909
|
+
ramo_reset(fw->buffer);
|
1598
1910
|
|
1599
1911
|
for (i = 0; i < doc_size; i++) {
|
1600
1912
|
df = doc->fields[i];
|
@@ -1602,21 +1914,29 @@ void fw_add_doc(FieldsWriter *fw, Document *doc)
|
|
1602
1914
|
if (fi_is_stored(fi)) {
|
1603
1915
|
const int df_size = df->size;
|
1604
1916
|
os_write_vint(fdt_out, fi->number);
|
1605
|
-
os_write_vint(fdt_out,
|
1606
|
-
|
1607
|
-
|
1608
|
-
|
1609
|
-
|
1610
|
-
|
1917
|
+
os_write_vint(fdt_out, df_size);
|
1918
|
+
if (fi_is_compressed(fi)) {
|
1919
|
+
for (j = 0; j < df_size; j++) {
|
1920
|
+
const int length = df->lengths[j];
|
1921
|
+
int zip_len = os_write_zipped_bytes(fw->buffer,
|
1922
|
+
(uchar*)df->data[j],
|
1923
|
+
length);
|
1924
|
+
os_write_vint(fdt_out, zip_len - 1);
|
1925
|
+
}
|
1611
1926
|
}
|
1612
|
-
|
1613
|
-
|
1614
|
-
|
1615
|
-
|
1616
|
-
|
1927
|
+
else {
|
1928
|
+
for (j = 0; j < df_size; j++) {
|
1929
|
+
const int length = df->lengths[j];
|
1930
|
+
os_write_vint(fdt_out, length);
|
1931
|
+
os_write_bytes(fw->buffer, (uchar*)df->data[j], length);
|
1932
|
+
/* leave a space between fields as that is how they are
|
1933
|
+
* analyzed */
|
1934
|
+
os_write_byte(fw->buffer, ' ');
|
1935
|
+
}
|
1617
1936
|
}
|
1618
1937
|
}
|
1619
1938
|
}
|
1939
|
+
ramo_write_to(fw->buffer, fdt_out);
|
1620
1940
|
}
|
1621
1941
|
|
1622
1942
|
void fw_write_tv_index(FieldsWriter *fw)
|
@@ -1624,7 +1944,7 @@ void fw_write_tv_index(FieldsWriter *fw)
|
|
1624
1944
|
int i;
|
1625
1945
|
const int tv_cnt = ary_size(fw->tv_fields);
|
1626
1946
|
OutStream *fdt_out = fw->fdt_out;
|
1627
|
-
os_write_u32(fw->fdx_out, (
|
1947
|
+
os_write_u32(fw->fdx_out, (u32)(os_pos(fdt_out) - fw->start_ptr));
|
1628
1948
|
os_write_vint(fdt_out, tv_cnt);
|
1629
1949
|
/* write in reverse order so we can count back from the start position to
|
1630
1950
|
* the beginning of the TermVector's data */
|
@@ -1683,13 +2003,13 @@ void fw_add_postings(FieldsWriter *fw,
|
|
1683
2003
|
|
1684
2004
|
if (fi_store_offsets(fi)) {
|
1685
2005
|
/* use delta encoding for offsets */
|
1686
|
-
|
2006
|
+
i64 last_end = 0;
|
1687
2007
|
os_write_vint(fdt_out, offset_count); /* write shared prefix length */
|
1688
2008
|
for (i = 0; i < offset_count; i++) {
|
1689
|
-
|
1690
|
-
|
1691
|
-
os_write_vll(fdt_out, (
|
1692
|
-
os_write_vll(fdt_out, (
|
2009
|
+
i64 start = (i64)offsets[i].start;
|
2010
|
+
i64 end = (i64)offsets[i].end;
|
2011
|
+
os_write_vll(fdt_out, (u64)(start - last_end));
|
2012
|
+
os_write_vll(fdt_out, (u64)(end - start));
|
1693
2013
|
last_end = end;
|
1694
2014
|
}
|
1695
2015
|
}
|
@@ -1706,16 +2026,16 @@ void fw_add_postings(FieldsWriter *fw,
|
|
1706
2026
|
|
1707
2027
|
char *te_get_term(TermEnum *te)
|
1708
2028
|
{
|
1709
|
-
return memcpy(ALLOC_N(char, te->curr_term_len + 1),
|
1710
|
-
|
2029
|
+
return (char *)memcpy(ALLOC_N(char, te->curr_term_len + 1),
|
2030
|
+
te->curr_term, te->curr_term_len + 1);
|
1711
2031
|
}
|
1712
2032
|
|
1713
2033
|
TermInfo *te_get_ti(TermEnum *te)
|
1714
2034
|
{
|
1715
|
-
return memcpy(ALLOC(TermInfo), &(te->curr_ti), sizeof(TermInfo));
|
2035
|
+
return (TermInfo*)memcpy(ALLOC(TermInfo), &(te->curr_ti), sizeof(TermInfo));
|
1716
2036
|
}
|
1717
2037
|
|
1718
|
-
char *te_skip_to(TermEnum *te, const char *term)
|
2038
|
+
static char *te_skip_to(TermEnum *te, const char *term)
|
1719
2039
|
{
|
1720
2040
|
char *curr_term = te->curr_term;
|
1721
2041
|
if (strcmp(curr_term, term) < 0) {
|
@@ -1742,8 +2062,8 @@ static void sti_destroy(SegmentTermIndex *sti)
|
|
1742
2062
|
{
|
1743
2063
|
if (sti->index_terms) {
|
1744
2064
|
int i;
|
1745
|
-
const int
|
1746
|
-
for (i = 0; i <
|
2065
|
+
const int sti_index_cnt = sti->index_cnt;
|
2066
|
+
for (i = 0; i < sti_index_cnt; i++) {
|
1747
2067
|
free(sti->index_terms[i]);
|
1748
2068
|
}
|
1749
2069
|
free(sti->index_terms);
|
@@ -1759,20 +2079,20 @@ static void sti_ensure_index_is_read(SegmentTermIndex *sti,
|
|
1759
2079
|
{
|
1760
2080
|
if (NULL == sti->index_terms) {
|
1761
2081
|
int i;
|
1762
|
-
int
|
2082
|
+
int index_cnt = sti->index_cnt;
|
1763
2083
|
off_t index_ptr = 0;
|
1764
2084
|
ste_reset(index_te);
|
1765
2085
|
is_seek(STE(index_te)->is, sti->index_ptr);
|
1766
|
-
STE(index_te)->size = sti->
|
1767
|
-
|
1768
|
-
sti->index_terms = ALLOC_N(char *,
|
1769
|
-
sti->index_term_lens = ALLOC_N(int,
|
1770
|
-
sti->index_term_infos = ALLOC_N(TermInfo,
|
1771
|
-
sti->index_ptrs = ALLOC_N(off_t,
|
1772
|
-
|
2086
|
+
STE(index_te)->size = sti->index_cnt;
|
2087
|
+
|
2088
|
+
sti->index_terms = ALLOC_N(char *, index_cnt);
|
2089
|
+
sti->index_term_lens = ALLOC_N(int, index_cnt);
|
2090
|
+
sti->index_term_infos = ALLOC_N(TermInfo, index_cnt);
|
2091
|
+
sti->index_ptrs = ALLOC_N(off_t, index_cnt);
|
2092
|
+
|
1773
2093
|
for (i = 0; NULL != ste_next(index_te); i++) {
|
1774
2094
|
#ifdef DEBUG
|
1775
|
-
if (i >=
|
2095
|
+
if (i >= index_cnt) {
|
1776
2096
|
RAISE(FERRET_ERROR, "index term enum read too many terms");
|
1777
2097
|
}
|
1778
2098
|
#endif
|
@@ -1788,7 +2108,7 @@ static void sti_ensure_index_is_read(SegmentTermIndex *sti,
|
|
1788
2108
|
static int sti_get_index_offset(SegmentTermIndex *sti, const char *term)
|
1789
2109
|
{
|
1790
2110
|
int lo = 0;
|
1791
|
-
int hi = sti->
|
2111
|
+
int hi = sti->index_cnt - 1;
|
1792
2112
|
int mid, delta;
|
1793
2113
|
char **index_terms = sti->index_terms;
|
1794
2114
|
|
@@ -1842,7 +2162,7 @@ SegmentFieldIndex *sfi_open(Store *store, const char *segment)
|
|
1842
2162
|
SegmentTermIndex *sti = ALLOC_AND_ZERO(SegmentTermIndex);
|
1843
2163
|
sti->index_ptr = is_read_voff_t(is);
|
1844
2164
|
sti->ptr = is_read_voff_t(is);
|
1845
|
-
sti->
|
2165
|
+
sti->index_cnt = is_read_vint(is);
|
1846
2166
|
sti->size = is_read_vint(is);
|
1847
2167
|
h_set_int(sfi->field_dict, field_num, sti);
|
1848
2168
|
}
|
@@ -1912,7 +2232,8 @@ static void ste_reset(TermEnum *te)
|
|
1912
2232
|
|
1913
2233
|
static TermEnum *ste_set_field(TermEnum *te, int field_num)
|
1914
2234
|
{
|
1915
|
-
SegmentTermIndex *sti
|
2235
|
+
SegmentTermIndex *sti
|
2236
|
+
= (SegmentTermIndex *)h_get_int(STE(te)->sfi->field_dict, field_num);
|
1916
2237
|
ste_reset(te);
|
1917
2238
|
te->field_num = field_num;
|
1918
2239
|
if (sti) {
|
@@ -1940,7 +2261,8 @@ static void ste_index_seek(TermEnum *te, SegmentTermIndex *sti, int idx_offset)
|
|
1940
2261
|
static char *ste_scan_to(TermEnum *te, const char *term)
|
1941
2262
|
{
|
1942
2263
|
SegmentFieldIndex *sfi = STE(te)->sfi;
|
1943
|
-
SegmentTermIndex *sti
|
2264
|
+
SegmentTermIndex *sti
|
2265
|
+
= (SegmentTermIndex *)h_get_int(sfi->field_dict, te->field_num);
|
1944
2266
|
if (sti && sti->size > 0) {
|
1945
2267
|
SFI_ENSURE_INDEX_IS_READ(sfi, sti);
|
1946
2268
|
if (term[0] == '\0') {
|
@@ -1952,8 +2274,8 @@ static char *ste_scan_to(TermEnum *te, const char *term)
|
|
1952
2274
|
int enum_offset = (int)(STE(te)->pos / sfi->index_interval) + 1;
|
1953
2275
|
/* if we are at the end of the index or before the next index
|
1954
2276
|
* ptr then a simple scan suffices */
|
1955
|
-
if (sti->
|
1956
|
-
strcmp(term, sti->index_terms[enum_offset]) < 0) {
|
2277
|
+
if (sti->index_cnt == enum_offset ||
|
2278
|
+
strcmp(term, sti->index_terms[enum_offset]) < 0) {
|
1957
2279
|
return te_skip_to(te, term);
|
1958
2280
|
}
|
1959
2281
|
}
|
@@ -2014,8 +2336,8 @@ static char *ste_get_term(TermEnum *te, int pos)
|
|
2014
2336
|
else if (pos != ste->pos) {
|
2015
2337
|
int idx_int = ste->sfi->index_interval;
|
2016
2338
|
if ((pos < ste->pos) || pos > (1 + ste->pos / idx_int) * idx_int) {
|
2017
|
-
SegmentTermIndex *sti = h_get_int(
|
2018
|
-
|
2339
|
+
SegmentTermIndex *sti = (SegmentTermIndex *)h_get_int(
|
2340
|
+
ste->sfi->field_dict, te->field_num);
|
2019
2341
|
SFI_ENSURE_INDEX_IS_READ(ste->sfi, sti);
|
2020
2342
|
ste_index_seek(te, sti, pos / idx_int);
|
2021
2343
|
}
|
@@ -2119,8 +2441,8 @@ static void tew_destroy(TermEnumWrapper *tew)
|
|
2119
2441
|
tew->te->close(tew->te);
|
2120
2442
|
}
|
2121
2443
|
|
2122
|
-
TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, TermEnum *te,
|
2123
|
-
|
2444
|
+
static TermEnumWrapper *tew_setup(TermEnumWrapper *tew, int index, TermEnum *te,
|
2445
|
+
IndexReader *ir)
|
2124
2446
|
{
|
2125
2447
|
tew->index = index;
|
2126
2448
|
tew->ir = ir;
|
@@ -2263,7 +2585,8 @@ TermEnum *mte_new(MultiReader *mr, int field_num, const char *term)
|
|
2263
2585
|
|| (tew->term && (tew->term[0] != '\0'))) {
|
2264
2586
|
pq_push(mte->tew_queue, tew); /* initialize queue */
|
2265
2587
|
}
|
2266
|
-
}
|
2588
|
+
}
|
2589
|
+
else {
|
2267
2590
|
/* add the term_enum_wrapper just in case */
|
2268
2591
|
sub_te = reader->terms(reader, 0);
|
2269
2592
|
sub_te->field_num = -1;
|
@@ -2303,7 +2626,7 @@ TermInfosReader *tir_open(Store *store,
|
|
2303
2626
|
static INLINE TermEnum *tir_enum(TermInfosReader *tir)
|
2304
2627
|
{
|
2305
2628
|
TermEnum *te;
|
2306
|
-
if (NULL == (te = thread_getspecific(tir->thread_te))) {
|
2629
|
+
if (NULL == (te = (TermEnum *)thread_getspecific(tir->thread_te))) {
|
2307
2630
|
te = ste_clone(tir->orig_te);
|
2308
2631
|
ste_set_field(te, tir->field_num);
|
2309
2632
|
ary_push(tir->te_bucket, te);
|
@@ -2333,8 +2656,8 @@ TermInfo *tir_get_ti(TermInfosReader *tir, const char *term)
|
|
2333
2656
|
return NULL;
|
2334
2657
|
}
|
2335
2658
|
|
2336
|
-
TermInfo *tir_get_ti_field(TermInfosReader *tir, int field_num,
|
2337
|
-
|
2659
|
+
static TermInfo *tir_get_ti_field(TermInfosReader *tir, int field_num,
|
2660
|
+
const char *term)
|
2338
2661
|
{
|
2339
2662
|
TermEnum *te = tir_enum(tir);
|
2340
2663
|
char *match;
|
@@ -2352,7 +2675,7 @@ TermInfo *tir_get_ti_field(TermInfosReader *tir, int field_num,
|
|
2352
2675
|
}
|
2353
2676
|
|
2354
2677
|
char *tir_get_term(TermInfosReader *tir, int pos)
|
2355
|
-
{
|
2678
|
+
{
|
2356
2679
|
if (pos < 0) {
|
2357
2680
|
return NULL;
|
2358
2681
|
}
|
@@ -2455,11 +2778,11 @@ static void tw_add(TermWriter *tw,
|
|
2455
2778
|
tw->last_term, term, *tw->last_term, *term);
|
2456
2779
|
}
|
2457
2780
|
if (ti->frq_ptr < tw->last_term_info.frq_ptr) {
|
2458
|
-
RAISE(STATE_ERROR, "%"
|
2781
|
+
RAISE(STATE_ERROR, "%"OFF_T_PFX"d > %"OFF_T_PFX"d", ti->frq_ptr,
|
2459
2782
|
tw->last_term_info.frq_ptr);
|
2460
2783
|
}
|
2461
2784
|
if (ti->prx_ptr < tw->last_term_info.prx_ptr) {
|
2462
|
-
RAISE(STATE_ERROR, "%"
|
2785
|
+
RAISE(STATE_ERROR, "%"OFF_T_PFX"d > %"OFF_T_PFX"d", ti->prx_ptr,
|
2463
2786
|
tw->last_term_info.prx_ptr);
|
2464
2787
|
}
|
2465
2788
|
#endif
|
@@ -2611,7 +2934,7 @@ static bool stde_next(TermDocEnum *tde)
|
|
2611
2934
|
int doc_code;
|
2612
2935
|
SegmentTermDocEnum *stde = STDE(tde);
|
2613
2936
|
|
2614
|
-
while (true) {
|
2937
|
+
while (true) {
|
2615
2938
|
if (stde->count >= stde->doc_freq) {
|
2616
2939
|
return false;
|
2617
2940
|
}
|
@@ -2726,7 +3049,7 @@ static bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
|
|
2726
3049
|
}
|
2727
3050
|
|
2728
3051
|
/* done skipping, now just scan */
|
2729
|
-
do {
|
3052
|
+
do {
|
2730
3053
|
if (!tde->next(tde)) {
|
2731
3054
|
return false;
|
2732
3055
|
}
|
@@ -2746,12 +3069,12 @@ static void stde_close(TermDocEnum *tde)
|
|
2746
3069
|
}
|
2747
3070
|
|
2748
3071
|
static void stde_skip_prox(SegmentTermDocEnum *stde)
|
2749
|
-
{
|
3072
|
+
{
|
2750
3073
|
(void)stde;
|
2751
3074
|
}
|
2752
3075
|
|
2753
3076
|
static void stde_seek_prox(SegmentTermDocEnum *stde, off_t prx_ptr)
|
2754
|
-
{
|
3077
|
+
{
|
2755
3078
|
(void)stde;
|
2756
3079
|
(void)prx_ptr;
|
2757
3080
|
}
|
@@ -2812,7 +3135,7 @@ static void stpe_seek(TermDocEnum *tde, int field_num, const char *term)
|
|
2812
3135
|
stde->prx_cnt = 0;
|
2813
3136
|
}
|
2814
3137
|
|
2815
|
-
bool stpe_next(TermDocEnum *tde)
|
3138
|
+
static bool stpe_next(TermDocEnum *tde)
|
2816
3139
|
{
|
2817
3140
|
SegmentTermDocEnum *stde = STDE(tde);
|
2818
3141
|
is_skip_vints(stde->prx_in, stde->prx_cnt);
|
@@ -2829,7 +3152,7 @@ bool stpe_next(TermDocEnum *tde)
|
|
2829
3152
|
}
|
2830
3153
|
}
|
2831
3154
|
|
2832
|
-
int stpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
|
3155
|
+
static int stpe_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
|
2833
3156
|
{
|
2834
3157
|
(void)tde; (void)docs; (void)freqs; (void)req_num;
|
2835
3158
|
RAISE(ARG_ERROR, "TermPosEnum does not handle processing multiple documents"
|
@@ -2944,9 +3267,11 @@ static void mtde_seek_te(TermDocEnum *tde, TermEnum *te)
|
|
2944
3267
|
mtde->state[index] = 1;
|
2945
3268
|
if (tde->close == stde_close) {
|
2946
3269
|
stde_seek_ti(STDE(tde), MTE(te)->tis + i);
|
2947
|
-
}
|
3270
|
+
}
|
3271
|
+
else if (tde->close == stpe_close) {
|
2948
3272
|
stpe_seek_ti(STDE(tde), MTE(te)->tis + i);
|
2949
|
-
}
|
3273
|
+
}
|
3274
|
+
else {
|
2950
3275
|
tde->seek(tde, MTE(te)->tews[index].te->field_num, te->curr_term);
|
2951
3276
|
}
|
2952
3277
|
}
|
@@ -2963,7 +3288,8 @@ static void mtde_seek(TermDocEnum *tde, int field_num, const char *term)
|
|
2963
3288
|
te->set_field(te, field_num);
|
2964
3289
|
if (NULL != (t = te->skip_to(te, term)) && 0 == strcmp(term, t)) {
|
2965
3290
|
mtde_seek_te(tde, te);
|
2966
|
-
}
|
3291
|
+
}
|
3292
|
+
else {
|
2967
3293
|
memset(mtde->state, 0, mtde->ir_cnt);
|
2968
3294
|
}
|
2969
3295
|
}
|
@@ -3051,7 +3377,7 @@ static void mtde_close(TermDocEnum *tde)
|
|
3051
3377
|
free(tde);
|
3052
3378
|
}
|
3053
3379
|
|
3054
|
-
TermDocEnum *mtxe_new(MultiReader *mr)
|
3380
|
+
static TermDocEnum *mtxe_new(MultiReader *mr)
|
3055
3381
|
{
|
3056
3382
|
MultiTermDocEnum *mtde = ALLOC_AND_ZERO(MultiTermDocEnum);
|
3057
3383
|
TermDocEnum *tde = TDE(mtde);
|
@@ -3074,7 +3400,7 @@ TermDocEnum *mtxe_new(MultiReader *mr)
|
|
3074
3400
|
return tde;
|
3075
3401
|
}
|
3076
3402
|
|
3077
|
-
TermDocEnum *mtde_new(MultiReader *mr)
|
3403
|
+
static TermDocEnum *mtde_new(MultiReader *mr)
|
3078
3404
|
{
|
3079
3405
|
int i;
|
3080
3406
|
TermDocEnum *tde = mtxe_new(mr);
|
@@ -3090,13 +3416,13 @@ TermDocEnum *mtde_new(MultiReader *mr)
|
|
3090
3416
|
* MultiTermPosEnum
|
3091
3417
|
****************************************************************************/
|
3092
3418
|
|
3093
|
-
int mtpe_next_position(TermDocEnum *tde)
|
3419
|
+
static int mtpe_next_position(TermDocEnum *tde)
|
3094
3420
|
{
|
3095
3421
|
CHECK_CURR_TDE("next_position");
|
3096
3422
|
return MTDE(tde)->curr_tde->next_position(MTDE(tde)->curr_tde);
|
3097
3423
|
}
|
3098
3424
|
|
3099
|
-
TermDocEnum *mtpe_new(MultiReader *mr)
|
3425
|
+
static TermDocEnum *mtpe_new(MultiReader *mr)
|
3100
3426
|
{
|
3101
3427
|
int i;
|
3102
3428
|
TermDocEnum *tde = mtxe_new(mr);
|
@@ -3184,7 +3510,7 @@ static bool mtdpe_next(TermDocEnum *tde)
|
|
3184
3510
|
pq_down(mtdpe->pq);
|
3185
3511
|
}
|
3186
3512
|
else {
|
3187
|
-
sub_tde = pq_pop(mtdpe->pq);
|
3513
|
+
sub_tde = (TermDocEnum *)pq_pop(mtdpe->pq);
|
3188
3514
|
sub_tde->close(sub_tde);
|
3189
3515
|
}
|
3190
3516
|
sub_tde = (TermDocEnum *)pq_top(mtdpe->pq);
|
@@ -3199,12 +3525,12 @@ static bool mtdpe_next(TermDocEnum *tde)
|
|
3199
3525
|
return true;
|
3200
3526
|
}
|
3201
3527
|
|
3202
|
-
bool tdpe_less_than(TermDocEnum *p1, TermDocEnum *p2)
|
3528
|
+
static bool tdpe_less_than(TermDocEnum *p1, TermDocEnum *p2)
|
3203
3529
|
{
|
3204
3530
|
return p1->doc_num(p1) < p2->doc_num(p2);
|
3205
3531
|
}
|
3206
3532
|
|
3207
|
-
bool mtdpe_skip_to(TermDocEnum *tde, int target_doc_num)
|
3533
|
+
static bool mtdpe_skip_to(TermDocEnum *tde, int target_doc_num)
|
3208
3534
|
{
|
3209
3535
|
TermDocEnum *sub_tde;
|
3210
3536
|
PriorityQueue *mtdpe_pq = MTDPE(tde)->pq;
|
@@ -3215,7 +3541,7 @@ bool mtdpe_skip_to(TermDocEnum *tde, int target_doc_num)
|
|
3215
3541
|
pq_down(mtdpe_pq);
|
3216
3542
|
}
|
3217
3543
|
else {
|
3218
|
-
sub_tde = pq_pop(mtdpe_pq);
|
3544
|
+
sub_tde = (TermDocEnum *)pq_pop(mtdpe_pq);
|
3219
3545
|
sub_tde->close(sub_tde);
|
3220
3546
|
}
|
3221
3547
|
}
|
@@ -3284,22 +3610,21 @@ TermDocEnum *mtdpe_new(IndexReader *ir, int field_num, char **terms, int t_cnt)
|
|
3284
3610
|
*
|
3285
3611
|
****************************************************************************/
|
3286
3612
|
|
3287
|
-
static
|
3613
|
+
static Hash *fn_extensions = NULL;
|
3288
3614
|
static void file_name_filter_init()
|
3289
3615
|
{
|
3290
|
-
|
3291
|
-
|
3292
|
-
|
3293
|
-
|
3294
|
-
h_set(fn_extensions, INDEX_EXTENSIONS[i], (char *)INDEX_EXTENSIONS[i]);
|
3295
|
-
}
|
3296
|
-
register_for_cleanup(fn_extensions, (free_ft)&h_destroy);
|
3616
|
+
int i;
|
3617
|
+
fn_extensions = h_new_str((free_ft)NULL, (free_ft)NULL);
|
3618
|
+
for (i = 0; i < NELEMS(INDEX_EXTENSIONS); i++) {
|
3619
|
+
h_set(fn_extensions, INDEX_EXTENSIONS[i], (char *)INDEX_EXTENSIONS[i]);
|
3297
3620
|
}
|
3621
|
+
register_for_cleanup(fn_extensions, (free_ft)&h_destroy);
|
3298
3622
|
}
|
3299
3623
|
|
3300
|
-
|
3624
|
+
bool file_name_filter_is_index_file(const char *file_name, bool include_locks)
|
3301
3625
|
{
|
3302
3626
|
char *p = strrchr(file_name, '.');
|
3627
|
+
if (NULL == fn_extensions) file_name_filter_init();
|
3303
3628
|
if (NULL != p) {
|
3304
3629
|
char *extension = p + 1;
|
3305
3630
|
if (NULL != h_get(fn_extensions, extension)) {
|
@@ -3310,6 +3635,10 @@ static bool file_name_filter_accept(char *file_name)
|
|
3310
3635
|
&& *(extension + 1) <= '9') {
|
3311
3636
|
return true;
|
3312
3637
|
}
|
3638
|
+
else if (include_locks && (strcmp(extension, "lck") == 0)
|
3639
|
+
&& (strncmp(file_name, "ferret", 6) == 0)) {
|
3640
|
+
return true;
|
3641
|
+
}
|
3313
3642
|
}
|
3314
3643
|
else if (0 == strncmp(SEGMENTS_FILE_NAME, file_name,
|
3315
3644
|
sizeof(SEGMENTS_FILE_NAME) - 1)) {
|
@@ -3323,7 +3652,7 @@ static bool file_name_filter_accept(char *file_name)
|
|
3323
3652
|
* function should only be called on files that pass the above "accept" (ie,
|
3324
3653
|
* are already known to be a Lucene index file).
|
3325
3654
|
*/
|
3326
|
-
static bool file_name_filter_is_cfs_file(char *file_name) {
|
3655
|
+
static bool file_name_filter_is_cfs_file(const char *file_name) {
|
3327
3656
|
char *p = strrchr(file_name, '.');
|
3328
3657
|
if (NULL != p) {
|
3329
3658
|
char *extension = p + 1;
|
@@ -3364,7 +3693,7 @@ void deleter_destroy(Deleter *dlr)
|
|
3364
3693
|
free(dlr);
|
3365
3694
|
}
|
3366
3695
|
|
3367
|
-
void deleter_queue_file(Deleter *dlr, char *file_name)
|
3696
|
+
static void deleter_queue_file(Deleter *dlr, const char *file_name)
|
3368
3697
|
{
|
3369
3698
|
hs_add(dlr->pending, estrdup(file_name));
|
3370
3699
|
}
|
@@ -3382,12 +3711,12 @@ void deleter_delete_file(Deleter *dlr, char *file_name)
|
|
3382
3711
|
XENDTRY
|
3383
3712
|
}
|
3384
3713
|
|
3385
|
-
void deleter_commit_pending_deletions(Deleter *dlr)
|
3714
|
+
static void deleter_commit_pending_deletions(Deleter *dlr)
|
3386
3715
|
{
|
3387
|
-
|
3388
|
-
|
3389
|
-
|
3390
|
-
deleter_delete_file(dlr,
|
3716
|
+
HashSetEntry *hse, *hse_next = dlr->pending->first;
|
3717
|
+
while ((hse = hse_next) != NULL) {
|
3718
|
+
hse_next = hse->next;
|
3719
|
+
deleter_delete_file(dlr, (char *)hse->elem);
|
3391
3720
|
}
|
3392
3721
|
}
|
3393
3722
|
|
@@ -3403,15 +3732,15 @@ void deleter_delete_files(Deleter *dlr, char **files, int file_cnt)
|
|
3403
3732
|
struct DelFilesArg {
|
3404
3733
|
char curr_seg_file_name[SEGMENT_NAME_MAX_LENGTH];
|
3405
3734
|
Deleter *dlr;
|
3406
|
-
|
3735
|
+
Hash *current;
|
3407
3736
|
};
|
3408
3737
|
|
3409
|
-
static void deleter_find_deletable_files_i(char *file_name, void *arg)
|
3738
|
+
static void deleter_find_deletable_files_i(const char *file_name, void *arg)
|
3410
3739
|
{
|
3411
3740
|
struct DelFilesArg *dfa = (struct DelFilesArg *)arg;
|
3412
3741
|
Deleter *dlr = dfa->dlr;
|
3413
3742
|
|
3414
|
-
if (
|
3743
|
+
if (file_name_filter_is_index_file(file_name, false)
|
3415
3744
|
&& 0 != strcmp(file_name, dfa->curr_seg_file_name)
|
3416
3745
|
&& 0 != strcmp(file_name, SEGMENTS_GEN_FILE_NAME)) {
|
3417
3746
|
|
@@ -3427,7 +3756,8 @@ static void deleter_find_deletable_files_i(char *file_name, void *arg)
|
|
3427
3756
|
if (NULL != p) {
|
3428
3757
|
*p = '\0';
|
3429
3758
|
extension = p + 1;
|
3430
|
-
}
|
3759
|
+
}
|
3760
|
+
else {
|
3431
3761
|
extension = NULL;
|
3432
3762
|
}
|
3433
3763
|
|
@@ -3439,7 +3769,7 @@ static void deleter_find_deletable_files_i(char *file_name, void *arg)
|
|
3439
3769
|
|
3440
3770
|
/* Delete this file if it's not a "current" segment, or, it is a
|
3441
3771
|
* single index file but there is now a corresponding compound file: */
|
3442
|
-
if (NULL == (si = h_get(dfa->current, segment_name))) {
|
3772
|
+
if (NULL == (si = (SegmentInfo *)h_get(dfa->current, segment_name))) {
|
3443
3773
|
/* Delete if segment is not referenced: */
|
3444
3774
|
do_delete = true;
|
3445
3775
|
}
|
@@ -3497,7 +3827,7 @@ void deleter_find_deletable_files(Deleter *dlr)
|
|
3497
3827
|
SegmentInfos *sis = dlr->sis;
|
3498
3828
|
Store *store = dlr->store;
|
3499
3829
|
struct DelFilesArg dfa;
|
3500
|
-
|
3830
|
+
Hash *current = dfa.current
|
3501
3831
|
= h_new_str((free_ft)NULL, (free_ft)si_deref);
|
3502
3832
|
dfa.dlr = dlr;
|
3503
3833
|
|
@@ -3511,22 +3841,24 @@ void deleter_find_deletable_files(Deleter *dlr)
|
|
3511
3841
|
* and add to deletable if they are not referenced by the current segments
|
3512
3842
|
* info: */
|
3513
3843
|
sis_curr_seg_file_name(dfa.curr_seg_file_name, store);
|
3514
|
-
file_name_filter_init();
|
3515
3844
|
|
3516
3845
|
store->each(store, &deleter_find_deletable_files_i, &dfa);
|
3517
3846
|
h_destroy(dfa.current);
|
3518
3847
|
}
|
3519
3848
|
|
3520
|
-
void deleter_delete_deletable_files(Deleter *dlr)
|
3849
|
+
static void deleter_delete_deletable_files(Deleter *dlr)
|
3521
3850
|
{
|
3522
3851
|
deleter_find_deletable_files(dlr);
|
3523
3852
|
deleter_commit_pending_deletions(dlr);
|
3524
3853
|
}
|
3525
3854
|
|
3526
|
-
|
3855
|
+
/*
|
3856
|
+
TODO: currently not used. Why not?
|
3857
|
+
static void deleter_clear_pending_deletions(Deleter *dlr)
|
3527
3858
|
{
|
3528
3859
|
hs_clear(dlr->pending);
|
3529
3860
|
}
|
3861
|
+
*/
|
3530
3862
|
|
3531
3863
|
/****************************************************************************
|
3532
3864
|
*
|
@@ -3534,13 +3866,13 @@ void deleter_clear_pending_deletions(Deleter *dlr)
|
|
3534
3866
|
*
|
3535
3867
|
****************************************************************************/
|
3536
3868
|
|
3537
|
-
void ir_acquire_not_necessary(IndexReader *ir)
|
3869
|
+
static void ir_acquire_not_necessary(IndexReader *ir)
|
3538
3870
|
{
|
3539
3871
|
(void)ir;
|
3540
3872
|
}
|
3541
3873
|
|
3542
3874
|
#define I64_PFX POSH_I64_PRINTF_PREFIX
|
3543
|
-
void ir_acquire_write_lock(IndexReader *ir)
|
3875
|
+
static void ir_acquire_write_lock(IndexReader *ir)
|
3544
3876
|
{
|
3545
3877
|
if (ir->is_stale) {
|
3546
3878
|
RAISE(STATE_ERROR, "IndexReader out of date and no longer valid for "
|
@@ -3579,10 +3911,11 @@ void ir_acquire_write_lock(IndexReader *ir)
|
|
3579
3911
|
}
|
3580
3912
|
}
|
3581
3913
|
|
3582
|
-
IndexReader *ir_setup(IndexReader *ir, Store *store, SegmentInfos *sis,
|
3914
|
+
static IndexReader *ir_setup(IndexReader *ir, Store *store, SegmentInfos *sis,
|
3583
3915
|
FieldInfos *fis, int is_owner)
|
3584
3916
|
{
|
3585
3917
|
mutex_init(&ir->mutex, NULL);
|
3918
|
+
mutex_init(&ir->field_index_mutex, NULL);
|
3586
3919
|
|
3587
3920
|
if (store) {
|
3588
3921
|
ir->store = store;
|
@@ -3608,16 +3941,17 @@ bool ir_index_exists(Store *store)
|
|
3608
3941
|
return sis_current_segment_generation(store) != 1;
|
3609
3942
|
}
|
3610
3943
|
|
3611
|
-
int ir_get_field_num(IndexReader *ir,
|
3944
|
+
int ir_get_field_num(IndexReader *ir, Symbol field)
|
3612
3945
|
{
|
3613
3946
|
int field_num = fis_get_field_num(ir->fis, field);
|
3614
3947
|
if (field_num < 0) {
|
3615
|
-
RAISE(ARG_ERROR,
|
3948
|
+
RAISE(ARG_ERROR,
|
3949
|
+
"Field :%s does not exist in this index", (char *)field);
|
3616
3950
|
}
|
3617
3951
|
return field_num;
|
3618
3952
|
}
|
3619
3953
|
|
3620
|
-
int ir_doc_freq(IndexReader *ir,
|
3954
|
+
int ir_doc_freq(IndexReader *ir, Symbol field, const char *term)
|
3621
3955
|
{
|
3622
3956
|
int field_num = fis_get_field_num(ir->fis, field);
|
3623
3957
|
if (field_num >= 0) {
|
@@ -3637,7 +3971,7 @@ static void ir_set_norm_i(IndexReader *ir, int doc_num, int field_num, uchar val
|
|
3637
3971
|
mutex_unlock(&ir->mutex);
|
3638
3972
|
}
|
3639
3973
|
|
3640
|
-
void ir_set_norm(IndexReader *ir, int doc_num,
|
3974
|
+
void ir_set_norm(IndexReader *ir, int doc_num, Symbol field, uchar val)
|
3641
3975
|
{
|
3642
3976
|
int field_num = fis_get_field_num(ir->fis, field);
|
3643
3977
|
if (field_num >= 0) {
|
@@ -3653,20 +3987,20 @@ uchar *ir_get_norms_i(IndexReader *ir, int field_num)
|
|
3653
3987
|
}
|
3654
3988
|
if (!norms) {
|
3655
3989
|
if (NULL == ir->fake_norms) {
|
3656
|
-
ir->fake_norms = (uchar
|
3990
|
+
ir->fake_norms = ALLOC_AND_ZERO_N(uchar, ir->max_doc(ir));
|
3657
3991
|
}
|
3658
3992
|
norms = ir->fake_norms;
|
3659
3993
|
}
|
3660
3994
|
return norms;
|
3661
3995
|
}
|
3662
3996
|
|
3663
|
-
uchar *ir_get_norms(IndexReader *ir,
|
3997
|
+
uchar *ir_get_norms(IndexReader *ir, Symbol field)
|
3664
3998
|
{
|
3665
3999
|
int field_num = fis_get_field_num(ir->fis, field);
|
3666
4000
|
return ir_get_norms_i(ir, field_num);
|
3667
4001
|
}
|
3668
4002
|
|
3669
|
-
uchar *ir_get_norms_into(IndexReader *ir,
|
4003
|
+
uchar *ir_get_norms_into(IndexReader *ir, Symbol field, uchar *buf)
|
3670
4004
|
{
|
3671
4005
|
int field_num = fis_get_field_num(ir->fis, field);
|
3672
4006
|
if (field_num >= 0) {
|
@@ -3698,7 +4032,7 @@ void ir_delete_doc(IndexReader *ir, int doc_num)
|
|
3698
4032
|
}
|
3699
4033
|
}
|
3700
4034
|
|
3701
|
-
Document *ir_get_doc_with_term(IndexReader *ir,
|
4035
|
+
Document *ir_get_doc_with_term(IndexReader *ir, Symbol field,
|
3702
4036
|
const char *term)
|
3703
4037
|
{
|
3704
4038
|
TermDocEnum *tde = ir_term_docs_for(ir, field, term);
|
@@ -3713,7 +4047,7 @@ Document *ir_get_doc_with_term(IndexReader *ir, const char *field,
|
|
3713
4047
|
return doc;
|
3714
4048
|
}
|
3715
4049
|
|
3716
|
-
TermEnum *ir_terms(IndexReader *ir,
|
4050
|
+
TermEnum *ir_terms(IndexReader *ir, Symbol field)
|
3717
4051
|
{
|
3718
4052
|
TermEnum *te = NULL;
|
3719
4053
|
int field_num = fis_get_field_num(ir->fis, field);
|
@@ -3723,7 +4057,7 @@ TermEnum *ir_terms(IndexReader *ir, const char *field)
|
|
3723
4057
|
return te;
|
3724
4058
|
}
|
3725
4059
|
|
3726
|
-
TermEnum *ir_terms_from(IndexReader *ir,
|
4060
|
+
TermEnum *ir_terms_from(IndexReader *ir, Symbol field,
|
3727
4061
|
const char *term)
|
3728
4062
|
{
|
3729
4063
|
TermEnum *te = NULL;
|
@@ -3734,7 +4068,7 @@ TermEnum *ir_terms_from(IndexReader *ir, const char *field,
|
|
3734
4068
|
return te;
|
3735
4069
|
}
|
3736
4070
|
|
3737
|
-
TermDocEnum *ir_term_docs_for(IndexReader *ir,
|
4071
|
+
TermDocEnum *ir_term_docs_for(IndexReader *ir, Symbol field,
|
3738
4072
|
const char *term)
|
3739
4073
|
{
|
3740
4074
|
int field_num = fis_get_field_num(ir->fis, field);
|
@@ -3745,7 +4079,7 @@ TermDocEnum *ir_term_docs_for(IndexReader *ir, const char *field,
|
|
3745
4079
|
return tde;
|
3746
4080
|
}
|
3747
4081
|
|
3748
|
-
TermDocEnum *ir_term_positions_for(IndexReader *ir,
|
4082
|
+
TermDocEnum *ir_term_positions_for(IndexReader *ir, Symbol field,
|
3749
4083
|
const char *term)
|
3750
4084
|
{
|
3751
4085
|
int field_num = fis_get_field_num(ir->fis, field);
|
@@ -3756,7 +4090,7 @@ TermDocEnum *ir_term_positions_for(IndexReader *ir, const char *field,
|
|
3756
4090
|
return tde;
|
3757
4091
|
}
|
3758
4092
|
|
3759
|
-
void ir_commit_i(IndexReader *ir)
|
4093
|
+
static void ir_commit_i(IndexReader *ir)
|
3760
4094
|
{
|
3761
4095
|
if (ir->has_changes) {
|
3762
4096
|
if (NULL == ir->deleter && NULL != ir->store) {
|
@@ -3769,7 +4103,7 @@ void ir_commit_i(IndexReader *ir)
|
|
3769
4103
|
mutex_lock(&ir->store->mutex);
|
3770
4104
|
|
3771
4105
|
sis_curr_seg_file_name(curr_seg_fn, ir->store);
|
3772
|
-
|
4106
|
+
|
3773
4107
|
ir->commit_i(ir);
|
3774
4108
|
sis_write(ir->sis, ir->store, ir->deleter);
|
3775
4109
|
|
@@ -3813,8 +4147,8 @@ void ir_close(IndexReader *ir)
|
|
3813
4147
|
if (ir->cache) {
|
3814
4148
|
h_destroy(ir->cache);
|
3815
4149
|
}
|
3816
|
-
if (ir->
|
3817
|
-
h_destroy(ir->
|
4150
|
+
if (ir->field_index_cache) {
|
4151
|
+
h_destroy(ir->field_index_cache);
|
3818
4152
|
}
|
3819
4153
|
if (ir->deleter && ir->is_owner) {
|
3820
4154
|
deleter_destroy(ir->deleter);
|
@@ -3822,8 +4156,10 @@ void ir_close(IndexReader *ir)
|
|
3822
4156
|
free(ir->fake_norms);
|
3823
4157
|
|
3824
4158
|
mutex_destroy(&ir->mutex);
|
4159
|
+
mutex_destroy(&ir->field_index_mutex);
|
3825
4160
|
free(ir);
|
3826
|
-
}
|
4161
|
+
}
|
4162
|
+
else {
|
3827
4163
|
mutex_unlock(&ir->mutex);
|
3828
4164
|
}
|
3829
4165
|
|
@@ -3910,7 +4246,7 @@ typedef struct SegmentReader {
|
|
3910
4246
|
TermInfosReader *tir;
|
3911
4247
|
thread_key_t thread_fr;
|
3912
4248
|
void **fr_bucket;
|
3913
|
-
|
4249
|
+
Hash *norms;
|
3914
4250
|
Store *cfs_store;
|
3915
4251
|
bool deleted_docs_dirty : 1;
|
3916
4252
|
bool undelete_all : 1;
|
@@ -3926,7 +4262,7 @@ static INLINE FieldsReader *sr_fr(SegmentReader *sr)
|
|
3926
4262
|
{
|
3927
4263
|
FieldsReader *fr;
|
3928
4264
|
|
3929
|
-
if (NULL == (fr = thread_getspecific(sr->thread_fr))) {
|
4265
|
+
if (NULL == (fr = (FieldsReader *)thread_getspecific(sr->thread_fr))) {
|
3930
4266
|
fr = fr_clone(sr->fr);
|
3931
4267
|
ary_push(sr->fr_bucket, fr);
|
3932
4268
|
thread_setspecific(sr->thread_fr, fr);
|
@@ -3942,7 +4278,7 @@ static INLINE bool sr_is_deleted_i(SegmentReader *sr, int doc_num)
|
|
3942
4278
|
static INLINE void sr_get_norms_into_i(SegmentReader *sr, int field_num,
|
3943
4279
|
uchar *buf)
|
3944
4280
|
{
|
3945
|
-
Norm *norm = h_get_int(sr->norms, field_num);
|
4281
|
+
Norm *norm = (Norm *)h_get_int(sr->norms, field_num);
|
3946
4282
|
if (NULL == norm) {
|
3947
4283
|
memset(buf, 0, SR_SIZE(sr));
|
3948
4284
|
}
|
@@ -3960,7 +4296,7 @@ static INLINE void sr_get_norms_into_i(SegmentReader *sr, int field_num,
|
|
3960
4296
|
|
3961
4297
|
static INLINE uchar *sr_get_norms_i(SegmentReader *sr, int field_num)
|
3962
4298
|
{
|
3963
|
-
Norm *norm = h_get_int(sr->norms, field_num);
|
4299
|
+
Norm *norm = (Norm *)h_get_int(sr->norms, field_num);
|
3964
4300
|
if (NULL == norm) { /* not an indexed field */
|
3965
4301
|
return NULL;
|
3966
4302
|
}
|
@@ -3975,7 +4311,7 @@ static INLINE uchar *sr_get_norms_i(SegmentReader *sr, int field_num)
|
|
3975
4311
|
|
3976
4312
|
static void sr_set_norm_i(IndexReader *ir, int doc_num, int field_num, uchar b)
|
3977
4313
|
{
|
3978
|
-
Norm *norm = h_get_int(SR(ir)->norms, field_num);
|
4314
|
+
Norm *norm = (Norm *)h_get_int(SR(ir)->norms, field_num);
|
3979
4315
|
if (NULL != norm) { /* has_norms */
|
3980
4316
|
ir->has_changes = true;
|
3981
4317
|
norm->is_dirty = true; /* mark it dirty */
|
@@ -3984,7 +4320,7 @@ static void sr_set_norm_i(IndexReader *ir, int doc_num, int field_num, uchar b)
|
|
3984
4320
|
}
|
3985
4321
|
}
|
3986
4322
|
|
3987
|
-
static void sr_delete_doc_i(IndexReader *ir, int doc_num)
|
4323
|
+
static void sr_delete_doc_i(IndexReader *ir, int doc_num)
|
3988
4324
|
{
|
3989
4325
|
if (NULL == SR(ir)->deleted_docs) {
|
3990
4326
|
SR(ir)->deleted_docs = bv_new();
|
@@ -4017,7 +4353,7 @@ static void bv_write(BitVector *bv, Store *store, char *name)
|
|
4017
4353
|
int i;
|
4018
4354
|
OutStream *os = store->new_output(store, name);
|
4019
4355
|
os_write_vint(os, bv->size);
|
4020
|
-
for (i = (bv->size >> 5); i >= 0; i--) {
|
4356
|
+
for (i = ((bv->size-1) >> 5); i >= 0; i--) {
|
4021
4357
|
os_write_u32(os, bv->bits[i]);
|
4022
4358
|
}
|
4023
4359
|
os_close(os);
|
@@ -4031,10 +4367,10 @@ static BitVector *bv_read(Store *store, char *name)
|
|
4031
4367
|
BitVector *volatile bv = ALLOC_AND_ZERO(BitVector);
|
4032
4368
|
bv->size = (int)is_read_vint(is);
|
4033
4369
|
bv->capa = (bv->size >> 5) + 1;
|
4034
|
-
bv->bits = ALLOC_AND_ZERO_N(
|
4370
|
+
bv->bits = ALLOC_AND_ZERO_N(u32, bv->capa);
|
4035
4371
|
bv->ref_cnt = 1;
|
4036
4372
|
TRY
|
4037
|
-
for (i = (bv->size >> 5); i >= 0; i--) {
|
4373
|
+
for (i = ((bv->size-1) >> 5); i >= 0; i--) {
|
4038
4374
|
bv->bits[i] = is_read_u32(is);
|
4039
4375
|
}
|
4040
4376
|
bv_recount(bv);
|
@@ -4065,7 +4401,8 @@ static void sr_commit_i(IndexReader *ir)
|
|
4065
4401
|
if (SR(ir)->undelete_all) {
|
4066
4402
|
si->del_gen = -1;
|
4067
4403
|
SR(ir)->undelete_all = false;
|
4068
|
-
}
|
4404
|
+
}
|
4405
|
+
else {
|
4069
4406
|
/* (SR(ir)->deleted_docs_dirty) re-write deleted */
|
4070
4407
|
si->del_gen++;
|
4071
4408
|
fn_for_generation(tmp_file_name, segment, "del", si->del_gen);
|
@@ -4080,7 +4417,7 @@ static void sr_commit_i(IndexReader *ir)
|
|
4080
4417
|
for (i = field_cnt - 1; i >= 0; i--) {
|
4081
4418
|
fi = ir->fis->fields[i];
|
4082
4419
|
if (fi_is_indexed(fi)) {
|
4083
|
-
Norm *norm = h_get_int(SR(ir)->norms, fi->number);
|
4420
|
+
Norm *norm = (Norm *)h_get_int(SR(ir)->norms, fi->number);
|
4084
4421
|
if (norm && norm->is_dirty) {
|
4085
4422
|
norm_rewrite(norm, ir->store, ir->deleter, SR(ir)->si,
|
4086
4423
|
SR_SIZE(ir));
|
@@ -4208,9 +4545,9 @@ static TermDocEnum *sr_term_positions(IndexReader *ir)
|
|
4208
4545
|
}
|
4209
4546
|
|
4210
4547
|
static TermVector *sr_term_vector(IndexReader *ir, int doc_num,
|
4211
|
-
|
4548
|
+
Symbol field)
|
4212
4549
|
{
|
4213
|
-
FieldInfo *fi = h_get(ir->fis->field_dict, field);
|
4550
|
+
FieldInfo *fi = (FieldInfo *)h_get(ir->fis->field_dict, field);
|
4214
4551
|
FieldsReader *fr;
|
4215
4552
|
|
4216
4553
|
if (!fi || !fi_store_term_vector(fi) || !SR(ir)->fr ||
|
@@ -4221,7 +4558,7 @@ static TermVector *sr_term_vector(IndexReader *ir, int doc_num,
|
|
4221
4558
|
return fr_get_field_tv(fr, doc_num, fi->number);
|
4222
4559
|
}
|
4223
4560
|
|
4224
|
-
static
|
4561
|
+
static Hash *sr_term_vectors(IndexReader *ir, int doc_num)
|
4225
4562
|
{
|
4226
4563
|
FieldsReader *fr;
|
4227
4564
|
if (!SR(ir)->fr || NULL == (fr = sr_fr(SR(ir)))) {
|
@@ -4266,7 +4603,7 @@ static void sr_open_norms(IndexReader *ir, Store *cfs_store)
|
|
4266
4603
|
|
4267
4604
|
static IndexReader *sr_setup_i(SegmentReader *sr)
|
4268
4605
|
{
|
4269
|
-
Store *store = sr->si->store;
|
4606
|
+
Store *volatile store = sr->si->store;
|
4270
4607
|
IndexReader *ir = IR(sr);
|
4271
4608
|
char file_name[SEGMENT_NAME_MAX_LENGTH];
|
4272
4609
|
char *sr_segment = sr->si->name;
|
@@ -4375,7 +4712,7 @@ static int mr_reader_index_i(MultiReader *mr, int doc_num)
|
|
4375
4712
|
return hi;
|
4376
4713
|
}
|
4377
4714
|
|
4378
|
-
int mr_num_docs(IndexReader *ir)
|
4715
|
+
static int mr_num_docs(IndexReader *ir)
|
4379
4716
|
{
|
4380
4717
|
int i, num_docs;
|
4381
4718
|
mutex_lock(&ir->mutex);
|
@@ -4429,7 +4766,7 @@ static uchar *mr_get_norms(IndexReader *ir, int field_num)
|
|
4429
4766
|
uchar *bytes;
|
4430
4767
|
|
4431
4768
|
mutex_lock(&ir->mutex);
|
4432
|
-
bytes = h_get_int(MR(ir)->norms_cache, field_num);
|
4769
|
+
bytes = (uchar *)h_get_int(MR(ir)->norms_cache, field_num);
|
4433
4770
|
if (NULL == bytes) {
|
4434
4771
|
int i;
|
4435
4772
|
const int mr_reader_cnt = MR(ir)->r_cnt;
|
@@ -4455,7 +4792,7 @@ static uchar *mr_get_norms_into(IndexReader *ir, int field_num, uchar *buf)
|
|
4455
4792
|
uchar *bytes;
|
4456
4793
|
|
4457
4794
|
mutex_lock(&ir->mutex);
|
4458
|
-
bytes = h_get_int(MR(ir)->norms_cache, field_num);
|
4795
|
+
bytes = (uchar *)h_get_int(MR(ir)->norms_cache, field_num);
|
4459
4796
|
if (NULL != bytes) {
|
4460
4797
|
memcpy(buf, bytes, MR(ir)->max_doc);
|
4461
4798
|
}
|
@@ -4509,13 +4846,13 @@ static TermDocEnum *mr_term_positions(IndexReader *ir)
|
|
4509
4846
|
}
|
4510
4847
|
|
4511
4848
|
static TermVector *mr_term_vector(IndexReader *ir, int doc_num,
|
4512
|
-
|
4849
|
+
Symbol field)
|
4513
4850
|
{
|
4514
4851
|
GET_READER();
|
4515
4852
|
return reader->term_vector(reader, doc_num - MR(ir)->starts[i], field);
|
4516
4853
|
}
|
4517
4854
|
|
4518
|
-
static
|
4855
|
+
static Hash *mr_term_vectors(IndexReader *ir, int doc_num)
|
4519
4856
|
{
|
4520
4857
|
GET_READER();
|
4521
4858
|
return reader->term_vectors(reader, doc_num - MR(ir)->starts[i]);
|
@@ -4667,7 +5004,7 @@ static IndexReader *mr_new(IndexReader **sub_readers, const int r_cnt)
|
|
4667
5004
|
return ir;
|
4668
5005
|
}
|
4669
5006
|
|
4670
|
-
IndexReader *mr_open_i(Store *store,
|
5007
|
+
static IndexReader *mr_open_i(Store *store,
|
4671
5008
|
SegmentInfos *sis,
|
4672
5009
|
FieldInfos *fis,
|
4673
5010
|
IndexReader **sub_readers,
|
@@ -4696,7 +5033,7 @@ IndexReader *mr_open(IndexReader **sub_readers, const int r_cnt)
|
|
4696
5033
|
IndexReader *ir = mr_new(sub_readers, r_cnt);
|
4697
5034
|
MultiReader *mr = MR(ir);
|
4698
5035
|
/* defaults don't matter, this is just for reading fields, not adding */
|
4699
|
-
FieldInfos *fis = fis_new(
|
5036
|
+
FieldInfos *fis = fis_new(STORE_NO, INDEX_NO, TERM_VECTOR_NO);
|
4700
5037
|
int i, j;
|
4701
5038
|
bool need_field_map = false;
|
4702
5039
|
|
@@ -4731,7 +5068,7 @@ IndexReader *mr_open(IndexReader **sub_readers, const int r_cnt)
|
|
4731
5068
|
mr->field_num_map[i][j] = fi_sub ? fi_sub->number : -1;
|
4732
5069
|
}
|
4733
5070
|
}
|
4734
|
-
/* print out the field map
|
5071
|
+
/* print out the field map
|
4735
5072
|
for (i = 0; i < r_cnt; i++) {
|
4736
5073
|
for (j = 0; j < fis->size; j++) {
|
4737
5074
|
printf("%d ", mr->field_num_map[i][j]);
|
@@ -4766,14 +5103,14 @@ static void ir_open_i(Store *store, FindSegmentsFile *fsf)
|
|
4766
5103
|
|
4767
5104
|
mutex_lock(&store->mutex);
|
4768
5105
|
sis_read_i(store, fsf);
|
4769
|
-
sis = fsf->
|
5106
|
+
sis = fsf->ret.sis;
|
4770
5107
|
fis = sis->fis;
|
4771
5108
|
|
4772
5109
|
if (sis->size == 1) {
|
4773
5110
|
ir = sr_open(sis, fis, 0, true);
|
4774
5111
|
}
|
4775
5112
|
else {
|
4776
|
-
int i;
|
5113
|
+
volatile int i;
|
4777
5114
|
IndexReader **readers = ALLOC_N(IndexReader *, sis->size);
|
4778
5115
|
int num_segments = sis->size;
|
4779
5116
|
for (i = num_segments - 1; i >= 0; i--) {
|
@@ -4788,7 +5125,7 @@ static void ir_open_i(Store *store, FindSegmentsFile *fsf)
|
|
4788
5125
|
}
|
4789
5126
|
ir = mr_open_i(store, sis, fis, readers, sis->size);
|
4790
5127
|
}
|
4791
|
-
fsf->
|
5128
|
+
fsf->ret.ir = ir;
|
4792
5129
|
success = true;
|
4793
5130
|
} while (0);
|
4794
5131
|
XFINALLY
|
@@ -4812,7 +5149,7 @@ IndexReader *ir_open(Store *store)
|
|
4812
5149
|
{
|
4813
5150
|
FindSegmentsFile fsf;
|
4814
5151
|
sis_find_segments_file(store, &fsf, &ir_open_i);
|
4815
|
-
return
|
5152
|
+
return fsf.ret.ir;
|
4816
5153
|
}
|
4817
5154
|
|
4818
5155
|
/****************************************************************************
|
@@ -4865,10 +5202,11 @@ Posting *p_new(MemoryPool *mp, int doc_num, int pos)
|
|
4865
5202
|
*
|
4866
5203
|
****************************************************************************/
|
4867
5204
|
|
4868
|
-
PostingList *pl_new(MemoryPool *mp, const char *term,
|
5205
|
+
PostingList *pl_new(MemoryPool *mp, const char *term,
|
5206
|
+
int term_len, Posting *p)
|
4869
5207
|
{
|
4870
5208
|
PostingList *pl = MP_ALLOC(mp, PostingList);
|
4871
|
-
pl->term = mp_memdup(mp, term, term_len + 1);
|
5209
|
+
pl->term = (char *)mp_memdup(mp, term, term_len + 1);
|
4872
5210
|
pl->term_len = term_len;
|
4873
5211
|
pl->first = pl->last = p;
|
4874
5212
|
pl->last_occ = p->first_occ;
|
@@ -4881,7 +5219,7 @@ void pl_add_occ(MemoryPool *mp, PostingList *pl, int pos)
|
|
4881
5219
|
pl->last->freq++;
|
4882
5220
|
}
|
4883
5221
|
|
4884
|
-
void pl_add_posting(PostingList *pl, Posting *p)
|
5222
|
+
static void pl_add_posting(PostingList *pl, Posting *p)
|
4885
5223
|
{
|
4886
5224
|
pl->last = pl->last->next = p;
|
4887
5225
|
pl->last_occ = p->first_occ;
|
@@ -4905,7 +5243,8 @@ static FieldInverter *fld_inv_new(DocWriter *dw, FieldInfo *fi)
|
|
4905
5243
|
fld_inv->store_term_vector = fi_store_term_vector(fi);
|
4906
5244
|
fld_inv->store_offsets = fi_store_offsets(fi);
|
4907
5245
|
if ((fld_inv->has_norms = fi_has_norms(fi)) == true) {
|
4908
|
-
fld_inv->norms = MP_ALLOC_AND_ZERO_N(dw->mp, uchar,
|
5246
|
+
fld_inv->norms = MP_ALLOC_AND_ZERO_N(dw->mp, uchar,
|
5247
|
+
dw->max_buffered_docs);
|
4909
5248
|
}
|
4910
5249
|
fld_inv->fi = fi;
|
4911
5250
|
|
@@ -4991,15 +5330,15 @@ static void dw_write_norms(DocWriter *dw, FieldInverter *fld_inv)
|
|
4991
5330
|
char file_name[SEGMENT_NAME_MAX_LENGTH];
|
4992
5331
|
OutStream *norms_out;
|
4993
5332
|
si_advance_norm_gen(dw->si, fld_inv->fi->number);
|
4994
|
-
si_norm_file_name(dw->si, file_name, fld_inv->fi->number);
|
5333
|
+
si_norm_file_name(dw->si, file_name, fld_inv->fi->number);
|
4995
5334
|
norms_out = dw->store->new_output(dw->store, file_name);
|
4996
5335
|
os_write_bytes(norms_out, fld_inv->norms, dw->doc_num);
|
4997
5336
|
os_close(norms_out);
|
4998
5337
|
}
|
4999
5338
|
|
5000
|
-
/* we'll use the postings
|
5339
|
+
/* we'll use the postings Hash's table area to sort the postings as it is
|
5001
5340
|
* going to be zeroset soon anyway */
|
5002
|
-
static PostingList **dw_sort_postings(
|
5341
|
+
static PostingList **dw_sort_postings(Hash *plists_ht)
|
5003
5342
|
{
|
5004
5343
|
int i, j;
|
5005
5344
|
HashEntry *he;
|
@@ -5054,8 +5393,8 @@ static void dw_flush(DocWriter *dw)
|
|
5054
5393
|
|
5055
5394
|
for (i = 0; i < fields_count; i++) {
|
5056
5395
|
fi = fis->fields[i];
|
5057
|
-
if (!fi_is_indexed(fi)
|
5058
|
-
|
5396
|
+
if (!fi_is_indexed(fi) || NULL ==
|
5397
|
+
(fld_inv = (FieldInverter*)h_get_int(dw->fields, fi->number))) {
|
5059
5398
|
continue;
|
5060
5399
|
}
|
5061
5400
|
if (!fi_omit_norms(fi)) {
|
@@ -5105,7 +5444,7 @@ static void dw_flush(DocWriter *dw)
|
|
5105
5444
|
tiw_close(tiw);
|
5106
5445
|
skip_buf_destroy(skip_buf);
|
5107
5446
|
dw_flush_streams(dw);
|
5108
|
-
}
|
5447
|
+
}
|
5109
5448
|
|
5110
5449
|
DocWriter *dw_open(IndexWriter *iw, SegmentInfo *si)
|
5111
5450
|
{
|
@@ -5130,7 +5469,7 @@ DocWriter *dw_open(IndexWriter *iw, SegmentInfo *si)
|
|
5130
5469
|
dw->skip_interval = iw->config.skip_interval;
|
5131
5470
|
dw->max_field_length = iw->config.max_field_length;
|
5132
5471
|
dw->max_buffered_docs = iw->config.max_buffered_docs;
|
5133
|
-
|
5472
|
+
|
5134
5473
|
dw->offsets = ALLOC_AND_ZERO_N(Offset, DW_OFFSET_INIT_CAPA);
|
5135
5474
|
dw->offsets_size = 0;
|
5136
5475
|
dw->offsets_capa = DW_OFFSET_INIT_CAPA;
|
@@ -5147,7 +5486,7 @@ void dw_new_segment(DocWriter *dw, SegmentInfo *si)
|
|
5147
5486
|
|
5148
5487
|
void dw_close(DocWriter *dw)
|
5149
5488
|
{
|
5150
|
-
if (dw->doc_num) {
|
5489
|
+
if (dw->doc_num) {
|
5151
5490
|
dw_flush(dw);
|
5152
5491
|
}
|
5153
5492
|
if (dw->fw) {
|
@@ -5162,7 +5501,7 @@ void dw_close(DocWriter *dw)
|
|
5162
5501
|
|
5163
5502
|
FieldInverter *dw_get_fld_inv(DocWriter *dw, FieldInfo *fi)
|
5164
5503
|
{
|
5165
|
-
FieldInverter *fld_inv = h_get_int(dw->fields, fi->number);
|
5504
|
+
FieldInverter *fld_inv = (FieldInverter*)h_get_int(dw->fields, fi->number);
|
5166
5505
|
|
5167
5506
|
if (!fld_inv) {
|
5168
5507
|
fld_inv = fld_inv_new(dw, fi);
|
@@ -5172,31 +5511,33 @@ FieldInverter *dw_get_fld_inv(DocWriter *dw, FieldInfo *fi)
|
|
5172
5511
|
}
|
5173
5512
|
|
5174
5513
|
static void dw_add_posting(MemoryPool *mp,
|
5175
|
-
|
5176
|
-
|
5514
|
+
Hash *curr_plists,
|
5515
|
+
Hash *fld_plists,
|
5177
5516
|
int doc_num,
|
5178
5517
|
const char *text,
|
5179
5518
|
int len,
|
5180
5519
|
int pos)
|
5181
5520
|
{
|
5182
|
-
HashEntry *pl_he
|
5183
|
-
if (pl_he
|
5184
|
-
pl_add_occ(mp, pl_he->value, pos);
|
5185
|
-
}
|
5186
|
-
else {
|
5187
|
-
HashEntry *fld_pl_he = h_set_ext(fld_plists, text);
|
5188
|
-
PostingList *pl = fld_pl_he->value;
|
5521
|
+
HashEntry *pl_he;
|
5522
|
+
if (h_set_ext(curr_plists, text, &pl_he)) {
|
5189
5523
|
Posting *p = p_new(mp, doc_num, pos);
|
5190
|
-
|
5191
|
-
|
5524
|
+
HashEntry *fld_pl_he;
|
5525
|
+
PostingList *pl;
|
5526
|
+
|
5527
|
+
if (h_set_ext(fld_plists, text, &fld_pl_he)) {
|
5528
|
+
fld_pl_he->value = pl = pl_new(mp, text, len, p);
|
5192
5529
|
pl_he->key = fld_pl_he->key = (char *)pl->term;
|
5193
5530
|
}
|
5194
5531
|
else {
|
5532
|
+
pl = (PostingList *)fld_pl_he->value;
|
5195
5533
|
pl_add_posting(pl, p);
|
5196
5534
|
pl_he->key = (char *)pl->term;
|
5197
5535
|
}
|
5198
5536
|
pl_he->value = pl;
|
5199
5537
|
}
|
5538
|
+
else {
|
5539
|
+
pl_add_occ(mp, (PostingList *)pl_he->value, pos);
|
5540
|
+
}
|
5200
5541
|
}
|
5201
5542
|
|
5202
5543
|
static INLINE void dw_add_offsets(DocWriter *dw, int pos, off_t start, off_t end)
|
@@ -5214,14 +5555,14 @@ static INLINE void dw_add_offsets(DocWriter *dw, int pos, off_t start, off_t end
|
|
5214
5555
|
dw->offsets_size = pos + 1;
|
5215
5556
|
}
|
5216
5557
|
|
5217
|
-
|
5558
|
+
Hash *dw_invert_field(DocWriter *dw,
|
5218
5559
|
FieldInverter *fld_inv,
|
5219
5560
|
DocField *df)
|
5220
5561
|
{
|
5221
5562
|
MemoryPool *mp = dw->mp;
|
5222
5563
|
Analyzer *a = dw->analyzer;
|
5223
|
-
|
5224
|
-
|
5564
|
+
Hash *curr_plists = dw->curr_plists;
|
5565
|
+
Hash *fld_plists = fld_inv->plists;
|
5225
5566
|
const bool store_offsets = fld_inv->store_offsets;
|
5226
5567
|
int doc_num = dw->doc_num;
|
5227
5568
|
int i;
|
@@ -5238,6 +5579,11 @@ HashTable *dw_invert_field(DocWriter *dw,
|
|
5238
5579
|
if (store_offsets) {
|
5239
5580
|
while (NULL != (tk = ts->next(ts))) {
|
5240
5581
|
pos += tk->pos_inc;
|
5582
|
+
/* if for some reason pos gets set to some number less
|
5583
|
+
* than 0 then we'll start pos at 0 */
|
5584
|
+
if (pos < 0) {
|
5585
|
+
pos = 0;
|
5586
|
+
}
|
5241
5587
|
dw_add_posting(mp, curr_plists, fld_plists, doc_num,
|
5242
5588
|
tk->text, tk->len, pos);
|
5243
5589
|
dw_add_offsets(dw, pos,
|
@@ -5271,7 +5617,7 @@ HashTable *dw_invert_field(DocWriter *dw,
|
|
5271
5617
|
char *data_ptr = df->data[i];
|
5272
5618
|
if (len > MAX_WORD_SIZE) {
|
5273
5619
|
len = MAX_WORD_SIZE - 1;
|
5274
|
-
data_ptr = memcpy(buf, df->data[i], len);
|
5620
|
+
data_ptr = (char *)memcpy(buf, df->data[i], len);
|
5275
5621
|
}
|
5276
5622
|
dw_add_posting(mp, curr_plists, fld_plists, doc_num, data_ptr,
|
5277
5623
|
len, i);
|
@@ -5286,7 +5632,7 @@ HashTable *dw_invert_field(DocWriter *dw,
|
|
5286
5632
|
return curr_plists;
|
5287
5633
|
}
|
5288
5634
|
|
5289
|
-
void dw_reset_postings(
|
5635
|
+
void dw_reset_postings(Hash *postings)
|
5290
5636
|
{
|
5291
5637
|
ZEROSET_N(postings->table, HashEntry, postings->mask + 1);
|
5292
5638
|
postings->fill = postings->size = 0;
|
@@ -5298,7 +5644,7 @@ void dw_add_doc(DocWriter *dw, Document *doc)
|
|
5298
5644
|
float boost;
|
5299
5645
|
DocField *df;
|
5300
5646
|
FieldInverter *fld_inv;
|
5301
|
-
|
5647
|
+
Hash *postings;
|
5302
5648
|
FieldInfo *fi;
|
5303
5649
|
const int doc_size = doc->size;
|
5304
5650
|
|
@@ -5543,7 +5889,7 @@ static void sm_merge_fields(SegmentMerger *sm)
|
|
5543
5889
|
end = (off_t)is_read_u64(fdx_in);
|
5544
5890
|
}
|
5545
5891
|
for (j = 0; j < max_doc; j++) {
|
5546
|
-
|
5892
|
+
u32 tv_idx_offset = is_read_u32(fdx_in);
|
5547
5893
|
start = end;
|
5548
5894
|
if (j == max_doc - 1) {
|
5549
5895
|
end = is_length(fdt_in);
|
@@ -5594,14 +5940,9 @@ static int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **matches,
|
|
5594
5940
|
doc = doc_map[doc]; /* work around deletions */
|
5595
5941
|
}
|
5596
5942
|
doc += base; /* convert to merged space */
|
5943
|
+
assert(doc == 0 || doc > last_doc);
|
5597
5944
|
|
5598
|
-
#ifdef DEBUG
|
5599
|
-
if (doc && doc <= last_doc) {
|
5600
|
-
RAISE(STATE_ERROR, "Docs not ordered, %d < %d", doc, last_doc);
|
5601
|
-
}
|
5602
|
-
#endif
|
5603
5945
|
df++;
|
5604
|
-
|
5605
5946
|
if (0 == (df % skip_interval)) {
|
5606
5947
|
skip_buf_add(skip_buf, last_doc);
|
5607
5948
|
}
|
@@ -5627,7 +5968,7 @@ static int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **matches,
|
|
5627
5968
|
|
5628
5969
|
static char *sm_cache_term(SegmentMerger *sm, char *term, int term_len)
|
5629
5970
|
{
|
5630
|
-
term = memcpy(sm->term_buf + sm->term_buf_ptr, term, term_len + 1);
|
5971
|
+
term = (char *)memcpy(sm->term_buf + sm->term_buf_ptr, term, term_len + 1);
|
5631
5972
|
sm->term_buf_ptr += term_len + 1;
|
5632
5973
|
if (sm->term_buf_ptr > sm->term_buf_size) {
|
5633
5974
|
sm->term_buf_ptr = 0;
|
@@ -5688,14 +6029,14 @@ static void sm_merge_term_infos(SegmentMerger *sm)
|
|
5688
6029
|
}printf("\n\n");
|
5689
6030
|
*/
|
5690
6031
|
match_size = 0; /* pop matching terms */
|
5691
|
-
matches[0] = pq_pop(sm->queue);
|
6032
|
+
matches[0] = (SegmentMergeInfo *)pq_pop(sm->queue);
|
5692
6033
|
match_size++;
|
5693
6034
|
term = matches[0]->term;
|
5694
|
-
top = pq_top(sm->queue);
|
6035
|
+
top = (SegmentMergeInfo *)pq_top(sm->queue);
|
5695
6036
|
while ((NULL != top) && (0 == strcmp(term, top->term))) {
|
5696
|
-
matches[match_size] = pq_pop(sm->queue);
|
6037
|
+
matches[match_size] = (SegmentMergeInfo *)pq_pop(sm->queue);
|
5697
6038
|
match_size++;
|
5698
|
-
top = pq_top(sm->queue);
|
6039
|
+
top = (SegmentMergeInfo *)pq_top(sm->queue);
|
5699
6040
|
}
|
5700
6041
|
|
5701
6042
|
/* printf(">%s:%s<\n", matches[0]->tb->field, matches[0]->tb->text); */
|
@@ -5849,7 +6190,7 @@ int iw_doc_count(IndexWriter *iw)
|
|
5849
6190
|
#define MOVE_TO_COMPOUND_DIR(file_name)\
|
5850
6191
|
deleter_queue_file(dlr, file_name);\
|
5851
6192
|
cw_add_file(cw, file_name)
|
5852
|
-
|
6193
|
+
|
5853
6194
|
static void iw_create_compound_file(Store *store, FieldInfos *fis,
|
5854
6195
|
SegmentInfo *si, char *cfs_file_name,
|
5855
6196
|
Deleter *dlr)
|
@@ -6015,7 +6356,7 @@ void iw_commit(IndexWriter *iw)
|
|
6015
6356
|
mutex_unlock(&iw->mutex);
|
6016
6357
|
}
|
6017
6358
|
|
6018
|
-
void iw_delete_term(IndexWriter *iw,
|
6359
|
+
void iw_delete_term(IndexWriter *iw, Symbol field, const char *term)
|
6019
6360
|
{
|
6020
6361
|
int field_num = fis_get_field_num(iw->fis, field);
|
6021
6362
|
if (field_num >= 0) {
|
@@ -6049,6 +6390,45 @@ void iw_delete_term(IndexWriter *iw, const char *field, const char *term)
|
|
6049
6390
|
}
|
6050
6391
|
}
|
6051
6392
|
|
6393
|
+
void iw_delete_terms(IndexWriter *iw, Symbol field,
|
6394
|
+
char **terms, const int term_cnt)
|
6395
|
+
{
|
6396
|
+
int field_num = fis_get_field_num(iw->fis, field);
|
6397
|
+
if (field_num >= 0) {
|
6398
|
+
int i;
|
6399
|
+
mutex_lock(&iw->mutex);
|
6400
|
+
iw_commit_i(iw);
|
6401
|
+
do {
|
6402
|
+
SegmentInfos *sis = iw->sis;
|
6403
|
+
const int seg_cnt = sis->size;
|
6404
|
+
bool did_delete = false;
|
6405
|
+
for (i = 0; i < seg_cnt; i++) {
|
6406
|
+
IndexReader *ir = sr_open(sis, iw->fis, i, false);
|
6407
|
+
TermDocEnum *tde = ir->term_docs(ir);
|
6408
|
+
int j;
|
6409
|
+
for (j = 0 ; j < term_cnt; j++) {
|
6410
|
+
const char *term = terms[j];
|
6411
|
+
ir->deleter = iw->deleter;
|
6412
|
+
stde_seek(tde, field_num, term);
|
6413
|
+
while (tde->next(tde)) {
|
6414
|
+
did_delete = true;
|
6415
|
+
sr_delete_doc_i(ir, STDE(tde)->doc_num);
|
6416
|
+
}
|
6417
|
+
}
|
6418
|
+
tde_destroy(tde);
|
6419
|
+
sr_commit_i(ir);
|
6420
|
+
ir_close(ir);
|
6421
|
+
}
|
6422
|
+
if (did_delete) {
|
6423
|
+
mutex_lock(&iw->store->mutex);
|
6424
|
+
sis_write(iw->sis, iw->store, iw->deleter);
|
6425
|
+
mutex_unlock(&iw->store->mutex);
|
6426
|
+
}
|
6427
|
+
} while (0);
|
6428
|
+
mutex_unlock(&iw->mutex);
|
6429
|
+
}
|
6430
|
+
}
|
6431
|
+
|
6052
6432
|
static void iw_optimize_i(IndexWriter *iw)
|
6053
6433
|
{
|
6054
6434
|
int min_segment;
|
@@ -6070,7 +6450,7 @@ void iw_optimize(IndexWriter *iw)
|
|
6070
6450
|
mutex_lock(&iw->mutex);
|
6071
6451
|
iw_optimize_i(iw);
|
6072
6452
|
mutex_unlock(&iw->mutex);
|
6073
|
-
}
|
6453
|
+
}
|
6074
6454
|
|
6075
6455
|
void iw_close(IndexWriter *iw)
|
6076
6456
|
{
|
@@ -6094,7 +6474,7 @@ void iw_close(IndexWriter *iw)
|
|
6094
6474
|
free(iw);
|
6095
6475
|
}
|
6096
6476
|
|
6097
|
-
IndexWriter *iw_open(Store *store,
|
6477
|
+
IndexWriter *iw_open(Store *store, Analyzer *volatile analyzer,
|
6098
6478
|
const Config *config)
|
6099
6479
|
{
|
6100
6480
|
IndexWriter *iw = ALLOC_AND_ZERO(IndexWriter);
|
@@ -6174,7 +6554,7 @@ static void iw_cp_fields(IndexWriter *iw, SegmentReader *sr,
|
|
6174
6554
|
int i;
|
6175
6555
|
const int max_doc = sr_max_doc(IR(sr));
|
6176
6556
|
for (i = 0; i < max_doc; i++) {
|
6177
|
-
int j;
|
6557
|
+
int j, data_len = 0;
|
6178
6558
|
const int field_cnt = is_read_vint(fdt_in);
|
6179
6559
|
int tv_cnt;
|
6180
6560
|
off_t doc_start_ptr = os_pos(fdt_out);
|
@@ -6186,7 +6566,6 @@ static void iw_cp_fields(IndexWriter *iw, SegmentReader *sr,
|
|
6186
6566
|
int k;
|
6187
6567
|
const int field_num = map[is_read_vint(fdt_in)];
|
6188
6568
|
const int df_size = is_read_vint(fdt_in);
|
6189
|
-
int data_len = 0;
|
6190
6569
|
os_write_vint(fdt_out, field_num);
|
6191
6570
|
os_write_vint(fdt_out, df_size);
|
6192
6571
|
/* sum total lengths of DocField */
|
@@ -6196,18 +6575,18 @@ static void iw_cp_fields(IndexWriter *iw, SegmentReader *sr,
|
|
6196
6575
|
os_write_vint(fdt_out, flen);
|
6197
6576
|
data_len += flen + 1;
|
6198
6577
|
}
|
6199
|
-
is2os_copy_bytes(fdt_in, fdt_out, data_len);
|
6200
6578
|
}
|
6579
|
+
is2os_copy_bytes(fdt_in, fdt_out, data_len);
|
6201
6580
|
|
6202
6581
|
/* Write TermVectors */
|
6203
6582
|
/* write TVs up to TV index */
|
6204
6583
|
is2os_copy_bytes(fdt_in, fdt_out,
|
6205
6584
|
(int)(is_read_u64(fdx_in)
|
6206
|
-
+ (
|
6207
|
-
- (
|
6585
|
+
+ (u64)is_read_u32(fdx_in)
|
6586
|
+
- (u64)is_pos(fdt_in)));
|
6208
6587
|
|
6209
6588
|
/* Write TV index pos */
|
6210
|
-
os_write_u32(fdx_out, (
|
6589
|
+
os_write_u32(fdx_out, (u32)(os_pos(fdt_out) - doc_start_ptr));
|
6211
6590
|
tv_cnt = is_read_vint(fdt_in);
|
6212
6591
|
os_write_vint(fdt_out, tv_cnt);
|
6213
6592
|
for (j = 0; j < tv_cnt; j++) {
|
@@ -6242,7 +6621,7 @@ static void iw_cp_terms(IndexWriter *iw, SegmentReader *sr,
|
|
6242
6621
|
tix_out = store_out->new_output(store_out, file_name);
|
6243
6622
|
sprintf(file_name, "%s.tix", sr_segment);
|
6244
6623
|
tix_in = store_in->open_input(store_in, file_name);
|
6245
|
-
|
6624
|
+
|
6246
6625
|
sprintf(file_name, "%s.tis", segment);
|
6247
6626
|
tis_out = store_out->new_output(store_out, file_name);
|
6248
6627
|
sprintf(file_name, "%s.tis", sr_segment);
|
@@ -6371,7 +6750,7 @@ static void iw_add_segment(IndexWriter *iw, SegmentReader *sr)
|
|
6371
6750
|
FieldInfo *fi = sub_fis->fields[j];
|
6372
6751
|
FieldInfo *new_fi = fis_get_field(fis, fi->name);
|
6373
6752
|
if (NULL == new_fi) {
|
6374
|
-
new_fi = fi_new(fi->name,
|
6753
|
+
new_fi = fi_new(fi->name, STORE_NO, INDEX_NO, TERM_VECTOR_NO);
|
6375
6754
|
new_fi->bits = fi->bits;
|
6376
6755
|
fis_add_field(fis, new_fi);
|
6377
6756
|
}
|