ferret 0.3.2 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/store.c
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#include "store.h"
|
2
|
+
|
3
|
+
/*
 * Run func(arg) between obtaining and releasing +lock+.
 *
 * lock - lock whose obtain/release callbacks bracket the call
 * func - callback to invoke
 * arg  - opaque pointer passed to func unchanged
 *
 * A failed obtain is reported through eprintf with IO_ERROR.
 * NOTE(review): whether eprintf returns is not visible here; if it does,
 * func is still invoked and release is still called.
 */
void with_lock(Lock *lock, void (*func)(void *arg), void *arg)
{
  const int obtained = lock->obtain(lock);

  if (!obtained) {
    eprintf(IO_ERROR, "Could not obtain lock: <%s>", lock->name);
  }
  func(arg);
  lock->release(lock);
}
|
10
|
+
|
11
|
+
/*
 * Open the lock called +lock_name+ in +store+, run func(arg) between
 * obtaining and releasing it, then close the lock again.
 *
 * store     - store providing open_lock/close_lock
 * lock_name - name handed to store->open_lock
 * func      - callback to invoke
 * arg       - opaque pointer passed to func unchanged
 *
 * A failed obtain is reported through eprintf with IO_ERROR.
 */
void with_lock_name(Store *store, char *lock_name,
                    void (*func)(void *arg), void *arg)
{
  Lock *named_lock = store->open_lock(store, lock_name);

  if (!named_lock->obtain(named_lock)) {
    eprintf(IO_ERROR, "Could not obtain lock: <%s>", named_lock->name);
  }
  func(arg);
  named_lock->release(named_lock);
  store->close_lock(named_lock);
}
|
21
|
+
|
22
|
+
Store *store_create()
|
23
|
+
{
|
24
|
+
Store *store = ALLOC(Store);
|
25
|
+
mutex_init(&store->mutex, NULL);
|
26
|
+
mutex_init(&store->ext_mutex, NULL);
|
27
|
+
return store;
|
28
|
+
}
|
29
|
+
|
30
|
+
/*
 * Counterpart of store_create: destroy both mutexes of +store+ and free
 * the struct itself. Nothing else referenced by the store is released here.
 */
void store_destroy(Store *store)
{
  Store *doomed = store;

  mutex_destroy(&doomed->mutex);
  mutex_destroy(&doomed->ext_mutex);
  free(doomed);
}
|
data/ext/store.h
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
#include "global.h"
|
2
|
+
#include "hash.h"
|
3
|
+
|
4
|
+
#ifndef FRT_STORE_H
|
5
|
+
#define FRT_STORE_H
|
6
|
+
|
7
|
+
/* Size in bytes of the buffer embedded in every stream (Buffer.buf). */
#define BUFFER_SIZE 1024
/* Prefix string for lock names. */
#define LOCK_PREFIX "ferret-"

/* Longest encoding, in bytes, reserved for one variable-length integer. */
#define VINT_MAX_LEN 10
/*
 * BUFFER_SIZE minus room for one maximal vint. Parenthesised so the
 * macro can be used safely inside larger expressions (e.g. 2 * VINT_END).
 */
#define VINT_END (BUFFER_SIZE - VINT_MAX_LEN)
|
12
|
+
|
13
|
+
/*
 * Byte buffer of BUFFER_SIZE bytes embedded by value in OutStream and
 * InStream.
 * NOTE(review): start/pos/len look like "stream offset of buf[0]",
 * "cursor inside buf" and "number of valid bytes" - confirm against the
 * stream implementation, which is not part of this header.
 */
typedef struct Buffer {
|
14
|
+
uchar buf[BUFFER_SIZE];
|
15
|
+
int start;
|
16
|
+
int pos;
|
17
|
+
int len;
|
18
|
+
} Buffer;
|
19
|
+
|
20
|
+
/*
 * Output stream: an embedded Buffer, an opaque backend handle (file) and
 * three backend callbacks (flush_internal, seek_internal, close_internal)
 * that each receive the stream itself.
 */
typedef struct OutStream {
|
21
|
+
Buffer buf;
|
22
|
+
void *file;
|
23
|
+
int pointer; // only used by RAMOut
|
24
|
+
void (*flush_internal)(struct OutStream *os, uchar *buf, int len);
|
25
|
+
void (*seek_internal)(struct OutStream *os, int pos);
|
26
|
+
void (*close_internal)(struct OutStream *os);
|
27
|
+
} OutStream;
|
28
|
+
|
29
|
+
typedef struct CompoundInStream CompoundInStream;
|
30
|
+
|
31
|
+
/*
 * Input stream: an embedded Buffer, an opaque backend handle (file), a
 * backend-specific union (d) and five backend callbacks for reading,
 * seeking, closing, cloning and querying the length.
 * NOTE(review): is_clone presumably marks streams produced by is_clone();
 * confirm against the implementation.
 */
typedef struct InStream {
|
32
|
+
int is_clone;
|
33
|
+
Buffer buf;
|
34
|
+
void *file;
|
35
|
+
union {
|
36
|
+
int pointer; // only used by RAMIn
|
37
|
+
char *path; // only used by FSIn
|
38
|
+
CompoundInStream *cis;
|
39
|
+
} d;
|
40
|
+
void (*read_internal)(struct InStream *is, uchar *buf, int offset, int len);
|
41
|
+
void (*seek_internal)(struct InStream *is, int pos);
|
42
|
+
void (*close_internal)(struct InStream *is);
|
43
|
+
void (*clone_internal)(struct InStream *is, struct InStream *new_index_i);
|
44
|
+
int (*length_internal)(struct InStream *is);
|
45
|
+
} InStream;
|
46
|
+
|
47
|
+
/*
 * Payload referenced by InStream.d.cis: another InStream (sub) plus an
 * offset and a length.
 * NOTE(review): presumably a window [offset, offset + length) inside
 * sub - confirm in compound_io.c.
 */
struct CompoundInStream {
|
48
|
+
InStream *sub;
|
49
|
+
int offset;
|
50
|
+
int length;
|
51
|
+
};
|
52
|
+
|
53
|
+
/*
 * Length of an InStream via its length_internal callback. The argument
 * and the whole expansion are parenthesised so expressions such as
 * is_length(streams + 1) expand correctly. mis is evaluated twice.
 */
#define is_length(mis) ((mis)->length_internal(mis))
|
54
|
+
|
55
|
+
typedef struct Store Store;
|
56
|
+
typedef struct Lock Lock;
|
57
|
+
/*
 * A named lock belonging to a Store. obtain returns non-zero on success
 * (with_lock treats a zero return as failure); is_locked and release
 * receive the lock itself.
 */
struct Lock {
|
58
|
+
char *name;
|
59
|
+
Store *store;
|
60
|
+
int (*obtain)(Lock *lock);
|
61
|
+
int (*is_locked)(Lock *lock);
|
62
|
+
void (*release)(Lock *lock);
|
63
|
+
};
|
64
|
+
|
65
|
+
/*
 * Payload referenced by Store.dir.cmpd: a backing Store, a name, a hash
 * table of entries and an InStream.
 * NOTE(review): presumably the compound file opened by open_cmpd_store -
 * confirm in compound_io.c.
 */
typedef struct CompoundStore {
|
66
|
+
Store *store;
|
67
|
+
const char *name;
|
68
|
+
HshTable *entries;
|
69
|
+
InStream *stream;
|
70
|
+
} CompoundStore;
|
71
|
+
|
72
|
+
/*
 * Storage backend interface. Holds two mutexes (initialised by
 * store_create), a backend-specific union (dir) and a table of callbacks
 * each backend fills in: file management (touch, exists, remove, rename,
 * count, length, each), clearing (clear, clear_all, clear_locks), stream
 * factories (create_output, open_input), lock factories (open_lock,
 * close_lock) and close.
 */
struct Store {
|
73
|
+
int ref_cnt; /* for fs_store only */
|
74
|
+
mutex_t mutex;
|
75
|
+
mutex_t ext_mutex;
|
76
|
+
union {
|
77
|
+
char *path; /* for fs_store only */
|
78
|
+
HshTable *ht; /* for ram_store only */
|
79
|
+
CompoundStore *cmpd; /* for compound_store only */
|
80
|
+
} dir;
|
81
|
+
void (*touch)(Store *store, char *filename);
|
82
|
+
int (*exists)(Store *store, char *filename);
|
83
|
+
int (*remove)(Store *store, char *filename);
|
84
|
+
int (*rename)(Store *store, char *from, char *to);
|
85
|
+
int (*count)(Store *store);
|
86
|
+
void (*close)(Store *store);
|
87
|
+
void (*clear)(Store *store);
|
88
|
+
void (*clear_all)(Store *store);
|
89
|
+
void (*clear_locks)(Store *store);
|
90
|
+
int (*length)(Store *store, char *filename);
|
91
|
+
void (*each)(Store *store, void (*func)(char *fname, void *arg), void *arg);
|
92
|
+
OutStream *(*create_output)(Store *store, const char *filename);
|
93
|
+
InStream *(*open_input)(Store *store, const char *filename);
|
94
|
+
Lock *(*open_lock)(Store *store, char *lockname);
|
95
|
+
void (*close_lock)(Lock *lock);
|
96
|
+
};
|
97
|
+
|
98
|
+
/*
 * Close a Store through its own close callback. The argument and the
 * whole expansion are parenthesised so that non-trivial argument
 * expressions expand correctly. mstore is evaluated twice.
 */
#define store_close(mstore) ((mstore)->close(mstore))
|
99
|
+
|
100
|
+
Store *store_create();
|
101
|
+
void store_destroy(Store *store);
|
102
|
+
Store *open_fs_store(const char *pathname);
|
103
|
+
Store *open_ram_store();
|
104
|
+
Store *open_ram_store_and_copy(Store *store, bool close_dir);
|
105
|
+
Store *open_cmpd_store(Store *sub, const char *filename);
|
106
|
+
void ram_close(Store *store);
|
107
|
+
Buffer *buf_create();
|
108
|
+
void os_flush(OutStream *os);
|
109
|
+
void os_close(OutStream *os);
|
110
|
+
int os_pos(OutStream *os);
|
111
|
+
void os_seek(OutStream *os, int new_pos);
|
112
|
+
void os_write_byte(OutStream *os, uchar b);
|
113
|
+
void os_write_bytes(OutStream *os, uchar *b, int len);
|
114
|
+
uchar is_read_byte(InStream *is);
|
115
|
+
int is_pos(InStream *is);
|
116
|
+
uchar *is_read_bytes(InStream *is, uchar *b, int offset, int len);
|
117
|
+
void is_seek(InStream *is, int pos);
|
118
|
+
InStream *is_clone(InStream *is);
|
119
|
+
void is_close(InStream *is);
|
120
|
+
int is_read_int(InStream *is);
|
121
|
+
long long is_read_long(InStream *is);
|
122
|
+
unsigned int is_read_uint(InStream *is);
|
123
|
+
unsigned long long is_read_ulong(InStream *is);
|
124
|
+
unsigned long long is_read_vint(InStream *is);
|
125
|
+
void is_read_chars(InStream *is, char* buffer, int off, int len) ;
|
126
|
+
char *is_read_string(InStream *is);
|
127
|
+
void os_write_int(OutStream *os, int l);
|
128
|
+
void os_write_long(OutStream *os, long long l);
|
129
|
+
void os_write_uint(OutStream *os, unsigned int l);
|
130
|
+
void os_write_ulong(OutStream *os, unsigned long long l);
|
131
|
+
void os_write_vint(OutStream *os, register unsigned long long i);
|
132
|
+
void os_write_chars(OutStream *os, char *buf, int start, int length);
|
133
|
+
void os_write_string(OutStream *os, char *str);
|
134
|
+
OutStream *os_create();
|
135
|
+
InStream *is_create();
|
136
|
+
void buf_destroy(Buffer *buf);
|
137
|
+
|
138
|
+
// RamStore functions
|
139
|
+
int ramo_length(OutStream *os);
|
140
|
+
void ramo_reset(OutStream *os);
|
141
|
+
int rami_length(InStream *is);
|
142
|
+
void ramo_write_to(OutStream *os, OutStream *other_o);
|
143
|
+
OutStream *ram_create_buffer();
|
144
|
+
void ram_destroy_buffer(OutStream *os);
|
145
|
+
|
146
|
+
int file_is_lock(char *filename);
|
147
|
+
|
148
|
+
void with_lock(Lock *lock, void (*func)(void *arg), void *arg);
|
149
|
+
void with_lock_name(Store *store, char *lock_name,
|
150
|
+
void (*func)(void *arg), void *arg);
|
151
|
+
|
152
|
+
#endif
|
data/ext/term.c
CHANGED
@@ -1,222 +1,783 @@
|
|
1
|
-
#include
|
1
|
+
#include <index.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <helper.h>
|
4
|
+
#include <hash.h>
|
2
5
|
|
6
|
+
/****************************************************************************
|
7
|
+
*
|
8
|
+
* Term
|
9
|
+
*
|
10
|
+
****************************************************************************/
|
11
|
+
|
12
|
+
/*
 * Return a newly allocated copy of +term+. The text is duplicated with
 * estrdup; the field pointer is shared with the original, not copied.
 * Release the result with term_destroy.
 */
Term *term_clone(Term *term)
{
  Term *copy = ALLOC(Term);

  copy->text = estrdup(term->text);
  copy->field = term->field;
  return copy;
}
|
20
|
+
|
21
|
+
Term *term_create(const char *field, char *text)
|
22
|
+
{
|
23
|
+
Term *t = ALLOC(Term);
|
24
|
+
|
25
|
+
t->field = (char *)field;
|
26
|
+
t->text = estrdup(text);
|
27
|
+
return t;
|
28
|
+
}
|
29
|
+
|
30
|
+
void term_destroy(void *p)
|
31
|
+
{
|
32
|
+
Term *t = (Term *)p;
|
33
|
+
free(t->text);
|
34
|
+
free(t);
|
35
|
+
}
|
36
|
+
|
37
|
+
int term_cmp(void *t1, void *t2)
|
38
|
+
{
|
39
|
+
int res = strcmp(((Term *)t1)->field, ((Term *)t2)->field);
|
40
|
+
if (res != 0) {
|
41
|
+
return res;
|
42
|
+
} else {
|
43
|
+
return strcmp(((Term *)t1)->text, ((Term *)t2)->text);
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
int term_eq(const void *t1, const void *t2)
|
48
|
+
{
|
49
|
+
return (strcmp(((Term *)t1)->text, ((Term *)t2)->text)) == 0 &&
|
50
|
+
(strcmp(((Term *)t1)->field, ((Term *)t2)->field) == 0);
|
51
|
+
}
|
52
|
+
|
53
|
+
unsigned int term_hash(const void *t)
|
54
|
+
{
|
55
|
+
return str_hash(((Term *)t)->text) * str_hash(((Term *)t)->field);
|
56
|
+
}
|
57
|
+
|
58
|
+
/*
 * Render +term+ as "field:text" in a newly allocated string that the
 * caller must free.
 */
char *term_to_s(Term *term)
{
  /* field + ':' + text + terminating NUL */
  char *rendered =
    ALLOC_N(char, strlen(term->field) + strlen(term->text) + 2);

  sprintf(rendered, "%s:%s", term->field, term->text);
  return rendered;
}
|
3
64
|
|
4
65
|
/****************************************************************************
|
5
66
|
*
|
6
|
-
*
|
67
|
+
* TermBuffer
|
7
68
|
*
|
8
69
|
****************************************************************************/
|
9
70
|
|
10
|
-
void
|
11
|
-
|
71
|
+
/*
 * Put +tb+ back into the "no term" state: empty text and the shared
 * EMPTY_STRING sentinel as field.
 */
void tb_reset(TermBuffer *tb)
{
  tb->text[0] = '\0';
  tb->field = (char *)EMPTY_STRING;
}
|
76
|
+
|
77
|
+
TermBuffer *tb_create()
|
78
|
+
{
|
79
|
+
TermBuffer *tb = ALLOC(TermBuffer);
|
80
|
+
tb->field = (char *)EMPTY_STRING;
|
81
|
+
tb->text[0] = '\0';
|
82
|
+
return tb;
|
83
|
+
}
|
84
|
+
|
85
|
+
void tb_destroy(void *p)
|
12
86
|
{
|
13
|
-
Term *term = (Term *)p;
|
14
|
-
free(term->text);
|
15
87
|
free(p);
|
16
88
|
}
|
17
89
|
|
18
|
-
|
19
|
-
|
90
|
+
/*
 * Load +t+ into +tb+: the field pointer is shared, the text is copied
 * into the buffer's own storage. Returns tb.
 *
 * The copy is bounded by the size of tb->text. A text that does not fit
 * is reported through eprintf(ERROR, ...) and, should eprintf return,
 * truncated rather than written past the end of the buffer.
 * NOTE(review): relies on TermBuffer.text being an inline char array
 * (tb_create writes text[0] without allocating it) - confirm in index.h.
 */
TermBuffer *tb_set_term(TermBuffer *tb, Term *t)
{
  size_t text_len = strlen(t->text);

  if (text_len >= sizeof(tb->text)) {
    eprintf(ERROR, "Term text too long for term buffer: %d bytes",
            (int)text_len);
    text_len = sizeof(tb->text) - 1;
  }

  tb->field = t->field;
  memcpy(tb->text, t->text, text_len);
  tb->text[text_len] = '\0';
  return tb;
}
|
96
|
+
|
97
|
+
/*
 * Build a new Term from the buffer's current field and text via
 * term_create. The caller owns the result (release with term_destroy).
 */
Term *tb_get_term(TermBuffer *tb)
{
  Term *snapshot = term_create(tb->field, tb->text);

  return snapshot;
}
|
24
101
|
|
25
|
-
|
26
|
-
frt_term_alloc(VALUE klass)
|
102
|
+
/*
 * Three-way comparison of two TermBuffers: by field first, then by text,
 * both with strcmp. Returns <0, 0 or >0.
 */
int tb_cmp(TermBuffer *tb1, TermBuffer *tb2)
{
  const int field_order = strcmp(tb1->field, tb2->field);

  return (field_order != 0) ? field_order : strcmp(tb1->text, tb2->text);
}
|
33
111
|
|
34
|
-
|
35
|
-
VALUE
|
36
|
-
frt_term_set(VALUE self, VALUE rfield, VALUE rtext)
|
112
|
+
/*
 * Three-way comparison of a TermBuffer against a Term: by field first,
 * then by text, both with strcmp. Returns <0, 0 or >0.
 */
int tb_term_cmp(TermBuffer *tb, Term *t)
{
  const int field_order = strcmp(tb->field, t->field);

  if (field_order != 0) {
    return field_order;
  }
  return strcmp(tb->text, t->text);
}
|
40
121
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
return Qnil;
|
122
|
+
/*
 * Copy the contents of tb2 into tb1 (destination first): the field
 * pointer is shared, the text is copied with strcpy. Returns tb1.
 */
TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2)
{
  strcpy(tb1->text, tb2->text);
  tb1->field = tb2->field;
  return tb1;
}
|
49
128
|
|
50
|
-
|
51
|
-
frt_term_init(VALUE self, VALUE rfield, VALUE rtext)
|
129
|
+
/*
 * Read the next prefix-compressed term from +is+ into +tb+.
 *
 * The stream holds, in order: the length of the prefix shared with the
 * text already in tb, the length of the new suffix, the suffix bytes and
 * the field number (looked up in fis->by_number). A negative field number
 * selects the EMPTY_STRING sentinel. Returns tb.
 *
 * The two lengths come from the index file, so they are validated against
 * the capacity of tb->text before anything is written. An entry that does
 * not fit is reported through eprintf(ERROR, ...); should eprintf return,
 * tb is reset and returned without touching the buffer.
 * NOTE(review): relies on TermBuffer.text being an inline char array -
 * confirm in index.h. The upper bound of the field number is not checked
 * here because the size of fis->by_number is not visible from this file.
 */
TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis)
{
  const int capacity = (int)sizeof(tb->text);
  int start = is_read_vint(is);
  int length = is_read_vint(is);
  int fnum;

  /* start + length + terminating NUL must fit; written to avoid overflow */
  if (start < 0 || length < 0 || length >= capacity - start) {
    eprintf(ERROR, "Corrupt term entry: prefix %d + suffix %d exceeds %d",
            start, length, capacity);
    tb->field = (char *)EMPTY_STRING;
    tb->text[0] = '\0';
    return tb;
  }

  is_read_bytes(is, (uchar *)tb->text, start, length);
  tb->text[start + length] = '\0';

  fnum = is_read_vint(is);
  if (fnum < 0) {
    tb->field = (char *)EMPTY_STRING;
  } else {
    tb->field = fis->by_number[fnum]->name;
  }
  return tb;
}
|
56
143
|
|
57
|
-
|
58
|
-
|
144
|
+
/****************************************************************************
|
145
|
+
*
|
146
|
+
* TermInfo
|
147
|
+
*
|
148
|
+
****************************************************************************/
|
149
|
+
|
150
|
+
TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
|
59
151
|
{
|
60
|
-
|
61
|
-
|
152
|
+
TermInfo *ti = ALLOC(TermInfo);
|
153
|
+
ti->doc_freq = doc_freq;
|
154
|
+
ti->freq_pointer = freq_pointer;
|
155
|
+
ti->prox_pointer = prox_pointer;
|
156
|
+
ti->skip_offset = skip_offset;
|
157
|
+
return ti;
|
62
158
|
}
|
63
159
|
|
64
|
-
|
65
|
-
frt_term_set_text(VALUE self, VALUE rtext)
|
160
|
+
/*
 * Overwrite the four values of an existing TermInfo. Returns ti.
 */
TermInfo *ti_set(TermInfo *ti, int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
{
  ti->skip_offset  = skip_offset;
  ti->prox_pointer = prox_pointer;
  ti->freq_pointer = freq_pointer;
  ti->doc_freq     = doc_freq;
  return ti;
}
|
72
168
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
term->tlen = tlen;
|
77
|
-
|
78
|
-
return Qnil;
|
169
|
+
void ti_destroy(void *p)
|
170
|
+
{
|
171
|
+
free(p);
|
79
172
|
}
|
80
173
|
|
81
|
-
|
82
|
-
frt_term_get_field(VALUE self)
|
174
|
+
/*
 * Copy the whole of +other+ into +ti+ byte for byte (destination first).
 * Returns ti.
 */
TermInfo *ti_cpy(TermInfo *ti, TermInfo *other)
{
  const size_t nbytes = sizeof(TermInfo);

  memcpy(ti, other, nbytes);
  return ti;
}
|
87
179
|
|
88
|
-
|
89
|
-
frt_term_set_field(VALUE self, VALUE rfield)
|
180
|
+
/*
 * Return a newly allocated TermInfo carrying the same four values as
 * +other+. Release with ti_destroy.
 */
TermInfo *ti_clone(TermInfo *other)
{
  const int doc_freq = other->doc_freq;
  const int freq_pointer = other->freq_pointer;
  const int prox_pointer = other->prox_pointer;
  const int skip_offset = other->skip_offset;

  return ti_create(doc_freq, freq_pointer, prox_pointer, skip_offset);
}
|
95
185
|
|
96
|
-
|
97
|
-
frt_term_to_s(VALUE self)
|
186
|
+
/*
 * Return 1 when both TermInfos carry the same doc_freq, freq_pointer,
 * prox_pointer and skip_offset, else 0.
 *
 * Fields are compared individually rather than with memcmp over the
 * whole struct: memcmp would also compare padding bytes and any storage
 * that ti_create never assigns, so two logically equal infos could
 * compare unequal.
 */
int ti_eq(TermInfo *ti, TermInfo *other)
{
  return (ti->doc_freq == other->doc_freq)
      && (ti->freq_pointer == other->freq_pointer)
      && (ti->prox_pointer == other->prox_pointer)
      && (ti->skip_offset == other->skip_offset);
}
|
190
|
+
|
191
|
+
/****************************************************************************
|
192
|
+
*
|
193
|
+
* TermEnum
|
194
|
+
*
|
195
|
+
****************************************************************************/
|
196
|
+
|
197
|
+
TermEnum *te_create()
|
198
|
+
{
|
199
|
+
TermEnum *te = ALLOC(TermEnum);
|
200
|
+
te->tb_curr = tb_create();
|
201
|
+
te->tb_prev = tb_create();
|
202
|
+
te->ti_curr = ti_create(0, 0, 0, 0);
|
203
|
+
return te;
|
204
|
+
}
|
205
|
+
|
206
|
+
void te_destroy(void *p)
|
207
|
+
{
|
208
|
+
TermEnum *te = (TermEnum *)p;
|
209
|
+
tb_destroy(te->tb_curr);
|
210
|
+
tb_destroy(te->tb_prev);
|
211
|
+
ti_destroy(te->ti_curr);
|
212
|
+
free(p);
|
213
|
+
}
|
214
|
+
|
215
|
+
/*
 * Return a newly allocated Term for the enumerator's current position
 * (built from tb_curr). The caller owns the result.
 */
Term *te_get_term(TermEnum *te)
{
  TermBuffer *current = te->tb_curr;

  return tb_get_term(current);
}
|
219
|
+
|
220
|
+
/*
 * Return a newly allocated copy of the enumerator's current TermInfo.
 * The caller owns the result (release with ti_destroy).
 */
TermInfo *te_get_ti(TermEnum *te)
{
  const TermInfo *current = te->ti_curr;

  return ti_create(current->doc_freq,
                   current->freq_pointer,
                   current->prox_pointer,
                   current->skip_offset);
}
|
225
|
+
|
226
|
+
/*
 * Advance +te+ until its current term is >= +t+.
 *
 * If the current term already equals t nothing is read. Otherwise
 * te->next is called repeatedly until it yields a term that does not
 * compare below t, or until it returns NULL.
 *
 * Returns the buffer holding that term, or NULL when the enumeration
 * ran out first.
 */
TermBuffer *te_skip_to(TermEnum *te, Term *t)
{
  TermBuffer *cursor = te->tb_curr;

  if (tb_term_cmp(cursor, t) == 0) {
    return cursor;
  }

  for (;;) {
    cursor = te->next(te);
    if (cursor == NULL || tb_term_cmp(cursor, t) >= 0) {
      return cursor;
    }
  }
}
|
237
|
+
|
238
|
+
/****************************************************************************
|
239
|
+
*
|
240
|
+
* SegmentTermEnum
|
241
|
+
*
|
242
|
+
****************************************************************************/
|
243
|
+
|
244
|
+
/*
 * Declares a local `ste` pointing at the SegmentTermEnum stored in
 * te->data; expects a TermEnum *te in scope. The expansion already ends
 * in ';', so the usual `GET_STE;` leaves a harmless empty statement.
 */
#define GET_STE SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
|
245
|
+
|
246
|
+
/*
 * Advance a segment term enumerator by one entry.
 *
 * Returns te->tb_curr holding the new term, or NULL (after resetting
 * tb_curr) once pos has moved past the last of ste->size entries.
 *
 * Each entry is read in this order: the term itself (tb_read), doc freq,
 * freq pointer delta, prox pointer delta, an optional skip offset and,
 * for index files, the index pointer delta.
 */
TermBuffer *ste_next(TermEnum *te)
{
  GET_STE;
  InStream *in = ste->is;
  TermInfo *info = te->ti_curr;

  ste->pos++;
  if (ste->pos > ste->size - 1) {
    tb_reset(te->tb_curr);
    return NULL;
  }

  tb_cpy(te->tb_prev, te->tb_curr);
  tb_read(te->tb_curr, in, ste->fis);

  info->doc_freq = is_read_vint(in);       /* absolute value */
  info->freq_pointer += is_read_vint(in);  /* stored as delta */
  info->prox_pointer += is_read_vint(in);  /* stored as delta */

  if (ste->format != -1) {
    if (info->doc_freq >= ste->skip_interval) {
      info->skip_offset = is_read_vint(in);
    }
  } else if (!ste->is_index
             && info->doc_freq > ste->format_m1skip_interval) {
    /* format -1: the value is read only to advance the file pointer;
     * it is never used because skip_to is switched off */
    info->skip_offset = is_read_vint(in);
  }

  if (ste->is_index) {
    ste->index_pointer += is_read_vint(in);  /* stored as delta */
  }

  return te->tb_curr;
}
|
112
281
|
|
113
|
-
|
114
|
-
|
282
|
+
TermEnum *ste_clone(TermEnum *other_te);
|
283
|
+
TermEnum *ste_allocate()
|
115
284
|
{
|
116
|
-
|
285
|
+
TermEnum *te = te_create();
|
286
|
+
te->next = &ste_next;
|
287
|
+
te->close = &ste_close;
|
288
|
+
te->clone = &ste_clone;
|
289
|
+
SegmentTermEnum *ste =
|
290
|
+
ALLOC(SegmentTermEnum);
|
291
|
+
te->data = ste;
|
292
|
+
return te;
|
293
|
+
}
|
294
|
+
|
295
|
+
/*
 * Return an independent copy of a segment term enumerator: the
 * SegmentTermEnum state is copied byte for byte, the input stream is
 * replaced by a clone (is_clone), and both term buffers and the current
 * TermInfo are copied.
 */
TermEnum *ste_clone(TermEnum *other_te)
{
  TermEnum *copy = ste_allocate();
  SegmentTermEnum *src = (SegmentTermEnum *)other_te->data;
  SegmentTermEnum *dst = (SegmentTermEnum *)copy->data;

  memcpy(dst, src, sizeof(SegmentTermEnum));
  dst->is = is_clone(src->is);  /* the copy gets its own stream */

  tb_cpy(copy->tb_curr, other_te->tb_curr);
  tb_cpy(copy->tb_prev, other_te->tb_prev);
  ti_cpy(copy->ti_curr, other_te->ti_curr);
  return copy;
}
|
307
|
+
|
308
|
+
/*
 * Close a segment term enumerator: close its input stream, free the
 * SegmentTermEnum state and then release the TermEnum itself.
 */
void ste_close(TermEnum *te)
{
  SegmentTermEnum *segment_state = (SegmentTermEnum *)te->data;

  is_close(segment_state->is);
  free(segment_state);
  te->data = NULL;
  te_destroy(te);
}
|
316
|
+
|
317
|
+
/*
 * Create a segment term enumerator reading from +is+.
 *
 * is       - stream positioned at the start of a term infos file
 * fis      - field infos used to resolve field numbers
 * is_index - non-zero when the stream is the index (.tii) file
 *
 * The header is parsed according to its first int:
 *   >= 0  original format without a version number; the int is the size,
 *         index_interval is 128 and skip_to is switched off.
 *   <  0  a format version. Versions below TERM_INFO_FORMAT are reported
 *         through eprintf. The size follows as a long, then the
 *         intervals; format -1 keeps skip_to switched off.
 */
TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index)
{
  TermEnum *te = ste_allocate();
  GET_STE;
  int first_int;

  ste->fis = fis;
  ste->is_index = is_index;
  ste->is = is;
  ste->pos = -1;
  ste->index_pointer = 0;
  ste->format_m1skip_interval = -1;

  first_int = is_read_int(is);

  if (first_int >= 0) {
    /* original-format file, without explicit format version number */
    ste->format = 0;
    ste->size = first_int;

    /* back-compatible settings */
    ste->index_interval = 128;
    ste->skip_interval = INT_MAX;  /* switch off skip_to optimization */
    return te;
  }

  /* check that it is a format we can understand */
  if (first_int < TERM_INFO_FORMAT) {
    eprintf(ERROR, "Unknown format version:%d", first_int);
  }

  /* we have a format version number */
  ste->format = first_int;
  ste->size = is_read_long(is);  /* read the size */

  if (ste->format != -1) {
    ste->index_interval = is_read_int(is);
    ste->skip_interval = is_read_int(is);
  } else {
    if (!ste->is_index) {
      ste->index_interval = is_read_int(is);
      ste->format_m1skip_interval = is_read_int(is);
    }
    /* switch off skip_to optimization for file format prior to
     * 1.4rc2 in order to avoid a bug in skip_to implementation
     * of these versions */
    ste->skip_interval = INT_MAX;
  }
  return te;
}
|
366
|
+
|
367
|
+
/*
 * Reposition a segment term enumerator.
 *
 * pointer - stream offset to seek to
 * pos     - entry number that corresponds to that offset
 * t       - term at that entry; copied into tb_curr
 * ti      - term info at that entry; copied into ti_curr
 *
 * tb_prev is reset because the previous term is unknown after a seek.
 */
void ste_seek(TermEnum *te, int pointer, int pos, Term *t, TermInfo *ti)
{
  SegmentTermEnum *segment_state = (SegmentTermEnum *)te->data;

  is_seek(segment_state->is, pointer);
  segment_state->pos = pos;

  tb_set_term(te->tb_curr, t);
  tb_reset(te->tb_prev);
  ti_cpy(te->ti_curr, ti);
}
|
376
|
+
|
377
|
+
/*
 * Scan forward to +t+ and return a newly allocated copy of its TermInfo,
 * or NULL when the enumerator does not end up exactly on t.
 * The caller owns a non-NULL result.
 */
TermInfo *ste_scan_for_term_info(TermEnum *te, Term *t)
{
  te_skip_to(te, t);

  if (tb_term_cmp(te->tb_curr, t) != 0) {
    return NULL;
  }
  return te_get_ti(te);
}
|
136
387
|
|
137
|
-
int
|
138
|
-
frt_term_compare_to_int(VALUE self, VALUE rother)
|
388
|
+
/*
 * Step the enumerator forward until its position reaches +pos+ and
 * return a newly allocated Term for that entry, or NULL when the
 * enumeration ends first. The caller owns a non-NULL result.
 */
Term *ste_scan_for_term(TermEnum *te, int pos)
{
  SegmentTermEnum *segment_state = (SegmentTermEnum *)te->data;

  for (;;) {
    if (!(segment_state->pos < pos)) {
      break;
    }
    if (ste_next(te) == NULL) {
      return NULL;
    }
  }
  return te_get_term(te);
}
|
145
398
|
|
146
|
-
|
147
|
-
|
399
|
+
/****************************************************************************
|
400
|
+
*
|
401
|
+
* MultiTermEnum
|
402
|
+
*
|
403
|
+
****************************************************************************/
|
404
|
+
|
405
|
+
/*
 * Declares a local `mte` pointing at the MultiTermEnum stored in
 * te->data; expects a TermEnum *te in scope. The expansion already ends
 * in ';', so the usual `GET_MTE;` leaves a harmless empty statement.
 */
#define GET_MTE MultiTermEnum *mte = (MultiTermEnum *)te->data;
|
406
|
+
|
407
|
+
/*
 * Advance a multi-segment term enumerator to the next distinct term.
 *
 * The smallest term across all segments sits on top of smi_queue. It is
 * copied into tb_curr, then every segment positioned on that same term
 * is popped, its doc_freq is added to ti_curr->doc_freq, and it is
 * advanced; a segment that still has terms is pushed back, an exhausted
 * one is destroyed.
 *
 * Returns te->tb_curr, or NULL (after resetting tb_curr) when the queue
 * is empty. NULL is returned explicitly instead of `false`, matching the
 * pointer return type and ste_next.
 */
TermBuffer *mte_next(TermEnum *te)
{
  GET_MTE;
  SegmentMergeInfo *top = (SegmentMergeInfo *)pq_top(mte->smi_queue);

  if (top == NULL) {
    tb_reset(te->tb_curr);
    return NULL;
  }

  tb_cpy(te->tb_prev, te->tb_curr);
  tb_cpy(te->tb_curr, top->tb);

  te->ti_curr->doc_freq = 0;

  while ((top != NULL) && (tb_cmp(te->tb_curr, top->tb) == 0)) {
    pq_pop(mte->smi_queue);
    te->ti_curr->doc_freq += top->te->ti_curr->doc_freq; /* increment freq */
    if (smi_next(top)) {
      pq_push(mte->smi_queue, top);   /* restore queue */
    } else {
      smi_destroy(top);               /* done with a segment */
    }
    top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
  }
  return te->tb_curr;
}
|
151
434
|
|
152
|
-
|
153
|
-
frt_term_gt(VALUE self, VALUE rother)
|
435
|
+
/*
 * Close a multi-segment term enumerator: clear and destroy the segment
 * queue, free the MultiTermEnum state and release the TermEnum itself.
 */
void mte_close(TermEnum *te)
{
  MultiTermEnum *multi_state = (MultiTermEnum *)te->data;

  pq_clear(multi_state->smi_queue);
  pq_destroy(multi_state->smi_queue);
  free(multi_state);
  te_destroy(te);
}
|
157
443
|
|
158
|
-
|
159
|
-
frt_term_le(VALUE self, VALUE rother)
|
444
|
+
/*
 * Clone callback for multi-segment enumerators. Cloning is not
 * supported: the call reports an ERROR through eprintf and yields NULL.
 * The argument is unused.
 */
TermEnum *mte_clone(TermEnum *te)
{
  TermEnum *no_clone = NULL;

  eprintf(ERROR, "MultiTermEnum does not support cloning");
  return no_clone;
}
|
163
449
|
|
164
|
-
|
165
|
-
frt_term_ge(VALUE self, VALUE rother)
|
450
|
+
/*
 * Create a term enumerator that merges the terms of several readers.
 *
 * readers - array of rcnt index readers
 * starts  - array of rcnt values, one per reader, passed to smi_create
 * rcnt    - number of readers
 * t       - term to start from, or NULL to enumerate from the beginning
 *
 * Each reader contributes a sub-enumerator wrapped in a SegmentMergeInfo.
 * It is queued when it has a term to offer: with t == NULL that means
 * smi_next succeeds, otherwise that the sub-enumerator's current field is
 * not the EMPTY_STRING sentinel. When t is given and the queue is not
 * empty, the enumerator is advanced once so it is positioned on a term.
 */
TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *t)
{
  int i;
  TermEnum *te = te_create();
  MultiTermEnum *mte = ALLOC(MultiTermEnum);

  te->next = &mte_next;
  te->clone = &mte_clone;
  te->close = &mte_close;
  te->data = mte;

  mte->smi_queue = pq_create(rcnt, &smi_lt);
  mte->smi_queue->free_elem = &smi_destroy;

  for (i = 0; i < rcnt; i++) {
    IndexReader *reader = readers[i];
    TermEnum *sub_te = (t != NULL) ? reader->terms_from(reader, t)
                                   : reader->terms(reader);
    SegmentMergeInfo *smi = smi_create(starts[i], sub_te, reader);
    const int has_term =
      ((t == NULL) && smi_next(smi)) ||
      (sub_te->tb_curr->field != (char *)EMPTY_STRING);

    if (has_term) {
      pq_push(mte->smi_queue, smi);  /* initialize queue */
    } else {
      smi_destroy(smi);
    }
  }

  if ((t != NULL) && (mte->smi_queue->count > 0)) {
    mte_next(te);
  }

  return te;
}
|
169
491
|
|
170
|
-
|
171
|
-
|
492
|
+
/****************************************************************************
|
493
|
+
*
|
494
|
+
* TermInfosWriter
|
495
|
+
*
|
496
|
+
****************************************************************************/
|
497
|
+
|
498
|
+
/* Term whose two members are both "" ; tiw_open_internal uses it as the
 * writer's initial last_term. */
const Term EmptyTerm = {"", ""};
|
499
|
+
|
500
|
+
TermInfosWriter *tiw_open_internal(Store *store,
|
501
|
+
char *segment,
|
502
|
+
FieldInfos *fis,
|
503
|
+
int interval,
|
504
|
+
int is_index)
|
172
505
|
{
|
173
|
-
|
174
|
-
|
175
|
-
|
506
|
+
TermInfosWriter *tiw = ALLOC(TermInfosWriter);
|
507
|
+
tiw->index_interval = interval;
|
508
|
+
tiw->skip_interval = 16;
|
509
|
+
tiw->last_index_pointer = 0;
|
510
|
+
tiw->last_term = (Term *)&EmptyTerm;
|
511
|
+
tiw->last_term_info = ti_create(0,0,0,0);
|
512
|
+
tiw->size = 0;
|
513
|
+
tiw->is_index = is_index;
|
514
|
+
tiw->fis = fis;
|
515
|
+
tiw->curr_field = NULL;
|
516
|
+
tiw->curr_field_num = -1;
|
517
|
+
|
518
|
+
char fname[SEGMENT_NAME_MAX_LENGTH];
|
519
|
+
strcpy(fname, segment);
|
520
|
+
strcat(fname, (is_index ? ".tii" : ".tis"));
|
521
|
+
OutStream *os = tiw->os = store->create_output(store, fname);
|
522
|
+
os_write_int(os, TERM_INFO_FORMAT); // write format
|
523
|
+
os_write_long(os, 0); // leave space for size
|
524
|
+
os_write_int(os, tiw->index_interval); // write index_interval
|
525
|
+
os_write_int(os, tiw->skip_interval); // write skip_interval
|
526
|
+
if (!is_index) {
|
527
|
+
tiw->other = tiw_open_internal(store, segment, fis, interval, true);
|
528
|
+
tiw->other->other = tiw;
|
529
|
+
}
|
530
|
+
return tiw;
|
176
531
|
}
|
177
532
|
|
533
|
+
/*
 * Open a term infos writer for +segment+. This opens the terms (.tis)
 * writer, which in turn opens and links its index (.tii) writer.
 * Close with tiw_close.
 */
TermInfosWriter *tiw_open(Store *store, char *segment, FieldInfos *fis, int interval)
{
  TermInfosWriter *terms_writer =
    tiw_open_internal(store, segment, fis, interval, false);

  return terms_writer;
}
|
178
537
|
|
179
|
-
|
180
|
-
frt_term_compare_to(VALUE self, VALUE other)
|
538
|
+
/*
 * Write +t+ to +os+ prefix-compressed against the writer's last term:
 * shared prefix length, suffix length, suffix characters, field number.
 *
 * The field number is looked up only when t->field is a different
 * pointer from the cached curr_field. Afterwards tiw->last_term points
 * at t itself (no copy is made), so t must stay valid until the next
 * term is written.
 */
void tiw_write_term(TermInfosWriter *tiw, OutStream *os, Term *t)
{
  const int shared_len = hlp_string_diff(tiw->last_term->text, t->text);
  const int suffix_len = strlen(t->text) - shared_len;

  os_write_vint(os, shared_len);                         /* shared prefix length */
  os_write_vint(os, suffix_len);                         /* delta length */
  os_write_chars(os, t->text, shared_len, suffix_len);   /* delta chars */

  if (tiw->curr_field != t->field) {
    tiw->curr_field = t->field;
    tiw->curr_field_num = fis_get_number(tiw->fis, t->field);
  }
  os_write_vint(os, tiw->curr_field_num);

  tiw->last_term = t;
}
|
184
554
|
|
185
|
-
|
186
|
-
frt_term_hash(VALUE self)
|
555
|
+
/*
 * Append the entry (+t+, +ti+) to the writer.
 *
 * Reports STATE_ERROR through eprintf when the term or either file pointer
 * goes backwards relative to the previously added entry. A non-index writer
 * first pushes its previous entry to the paired index writer on every
 * index_interval-th call. The entry is then written as: term, doc_freq,
 * freq/prox pointer deltas, and the skip offset when doc_freq reaches
 * skip_interval. An index writer additionally writes the delta of the
 * paired writer's current output position.
 *
 * NOTE(review): the term-order check only runs when is_index is set;
 * confirm that gating is intended rather than the main-writer case.
 */
void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
{
    OutStream *out = tiw->os;
    TermInfo *prev_ti = tiw->last_term_info;

    if (tiw->is_index && term_cmp(tiw->last_term, t) > 0) {
        eprintf(STATE_ERROR,
                "term out of order %s < %s", t->text, tiw->last_term->text);
    }
    if (ti->freq_pointer < prev_ti->freq_pointer) {
        eprintf(STATE_ERROR, "freq pointer out of order");
    }
    if (ti->prox_pointer < prev_ti->prox_pointer) {
        eprintf(STATE_ERROR, "prox pointer out of order");
    }

    if (!tiw->is_index && (tiw->size % tiw->index_interval) == 0) {
        tiw_add(tiw->other, tiw->last_term, prev_ti);   // add an index term
    }

    tiw_write_term(tiw, out, t);                        // write term
    os_write_vint(out, ti->doc_freq);                   // write doc freq
    os_write_vint(out, ti->freq_pointer - prev_ti->freq_pointer);
    os_write_vint(out, ti->prox_pointer - prev_ti->prox_pointer);
    if (ti->doc_freq >= tiw->skip_interval) {
        os_write_vint(out, ti->skip_offset);
    }

    if (tiw->is_index) {
        // record how far the paired writer's stream advanced since the
        // previous index entry
        int paired_pos = os_pos(tiw->other->os);
        os_write_vint(out, paired_pos - tiw->last_index_pointer);
        tiw->last_index_pointer = paired_pos;           // write pointer
    }

    ti_cpy(prev_ti, ti);
    tiw->size++;
}
|
588
|
+
|
589
|
+
/*
 * Finish and free the writer.
 *
 * Seeks back to offset 4 (just past the format int written on open), writes
 * the final entry count into the space reserved there, and closes the
 * output stream. A non-index writer then closes its paired index writer the
 * same way. Finally the last_term_info copy and the writer itself are freed.
 */
void tiw_close(TermInfosWriter *tiw)
{
    OutStream *out = tiw->os;

    os_seek(out, 4);                // write @size after format
    os_write_long(out, tiw->size);
    os_close(out);

    if (!tiw->is_index) {
        tiw_close(tiw->other);
    }

    ti_destroy(tiw->last_term_info);
    free(tiw);
}
|
192
602
|
|
193
603
|
/****************************************************************************
|
194
604
|
*
|
195
|
-
*
|
605
|
+
* TermInfosReader
|
196
606
|
*
|
197
607
|
****************************************************************************/
|
198
608
|
|
199
|
-
void
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
609
|
+
/*
 * Release everything owned by the reader and free it.
 *
 * If the index has been loaded (index_terms is non-NULL) each cached term
 * and term info is destroyed and the three index arrays are freed. The
 * original term enum, the thread key, the bucket of per-thread enum clones,
 * the index enum (if still open) and the mutex are then torn down, in that
 * order, before the reader struct itself is freed.
 */
void tir_close(TermInfosReader *tir)
{
    TermEnum *orig_te;
    TermEnum *index_te;

    if (tir->index_terms != NULL) {
        int idx;
        for (idx = 0; idx < tir->index_size; idx++) {
            term_destroy(tir->index_terms[idx]);
            ti_destroy(tir->index_term_infos[idx]);
        }
        free(tir->index_terms);
        free(tir->index_term_infos);
        free(tir->index_pointers);
    }

    orig_te = tir->orig_te;
    if (orig_te) {
        orig_te->close(orig_te);
    }

    thread_key_delete(tir->thread_te);
    ary_destroy(tir->te_bucket);

    index_te = tir->index_te;
    if (index_te) {
        index_te->close(index_te);
    }

    mutex_destroy(&tir->mutex);
    free(tir);
}
|
628
|
+
|
629
|
+
/*
 * Open a TermInfosReader for +segment+ in +store+.
 *
 * Opens "<segment>.tis" and wraps it in the reader's original term enum,
 * copying that enum's size and skip_interval into the reader. Opens
 * "<segment>.tii" as the index enum but does not read it yet: the three
 * index arrays start out NULL and are filled by tir_ensure_index_is_read.
 *
 * The file name is assembled in a SEGMENT_NAME_MAX_LENGTH buffer with
 * strcpy, so +segment+ plus the 4-character extension must fit in it.
 */
TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis)
{
    TermInfosReader *tir = ALLOC(TermInfosReader);
    char fname[SEGMENT_NAME_MAX_LENGTH];
    char *ext;
    InStream *is;
    SegmentTermEnum *ste;

    mutex_init(&tir->mutex, NULL);

    strcpy(fname, segment);
    ext = fname + strlen(segment);      // where the extension gets written

    // main term infos file
    strcpy(ext, ".tis");
    is = store->open_input(store, fname);
    tir->orig_te = ste_create(is, fis, false);
    thread_key_create(&tir->thread_te, NULL);
    tir->te_bucket = ary_create(1, (destroy_func_t)tir->orig_te->close);

    ste = tir->orig_te->data;
    tir->size = ste->size;
    tir->skip_interval = ste->skip_interval;

    // term infos index file; contents are loaded on demand
    strcpy(ext, ".tii");
    is = store->open_input(store, fname);
    tir->index_te = ste_create(is, fis, true);

    tir->index_terms = NULL;
    tir->index_term_infos = NULL;
    tir->index_pointers = NULL;
    return tir;
}
|
653
|
+
|
654
|
+
/*
 * Load the term index into memory if that has not happened yet.
 *
 * Runs under tir->mutex. When index_terms is still NULL, the arrays are
 * allocated with the index enum's size, every entry the index enum yields
 * is stored (term, term info, index pointer), and the index enum is then
 * closed and tir->index_te cleared. Later calls find index_terms non-NULL
 * and do nothing.
 */
void tir_ensure_index_is_read(TermInfosReader *tir)
{
    mutex_lock(&tir->mutex);

    if (tir->index_terms == NULL) {
        TermEnum *index_te = tir->index_te;
        SegmentTermEnum *index_ste = index_te->data;
        int count = index_ste->size;
        int slot;

        tir->index_size = count;
        tir->index_terms = ALLOC_N(Term *, count);
        tir->index_term_infos = ALLOC_N(TermInfo *, count);
        tir->index_pointers = ALLOC_N(int, count);

        for (slot = 0; ste_next(index_te) != NULL; slot++) {
            tir->index_terms[slot] = te_get_term(index_te);
            tir->index_term_infos[slot] = te_get_ti(index_te);
            tir->index_pointers[slot] = index_ste->index_pointer;
        }

        // the enum is no longer needed once its entries are cached
        index_te->close(index_te);
        tir->index_te = NULL;
    }

    mutex_unlock(&tir->mutex);
}
|
681
|
+
|
682
|
+
/*
 * Return the calling thread's term enum for this reader.
 *
 * The enum is looked up under the reader's thread key. When none is stored
 * yet, the original enum is cloned, the clone is appended to te_bucket
 * (which tir_close destroys), and it is stored under the thread key before
 * being returned.
 *
 * NOTE(review): the append to te_bucket is not done under tir->mutex here;
 * confirm whether concurrent first-time callers are possible.
 */
static inline TermEnum *tir_enum(TermInfosReader *tir)
{
    TermEnum *te = thread_getspecific(tir->thread_te);

    if (te != NULL) {
        return te;
    }

    te = tir->orig_te->clone(tir->orig_te);
    ary_append(tir->te_bucket, te);
    thread_setspecific(tir->thread_te, te);
    return te;
}
|
692
|
+
|
693
|
+
/*
 * Position the calling thread's term enum at index entry +ind_offset+.
 *
 * Passes ste_seek the cached index pointer, the position
 * (ind_offset * index_interval) - 1, and the cached term and term info for
 * that entry. The index arrays must already be loaded and +ind_offset+ must
 * be a valid index into them; neither is checked here.
 */
void tir_seek_enum(TermInfosReader *tir, int ind_offset)
{
    TermEnum *te = tir_enum(tir);
    SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
    int pointer = tir->index_pointers[ind_offset];
    Term *index_term = tir->index_terms[ind_offset];
    TermInfo *index_ti = tir->index_term_infos[ind_offset];

    ste_seek(te, pointer,
             (ind_offset * ste->index_interval) - 1,
             index_term,
             index_ti);
}
|
702
|
+
|
703
|
+
/*
 * Binary-search tir->index_terms[] for +t+.
 *
 * Returns the slot of an index term that compares equal to +t+. Otherwise
 * returns the upper bound left by the search, which is the slot of the last
 * index term ordered before +t+, or -1 when +t+ orders before every index
 * term (or the index is empty).
 */
int tir_get_index_offset(TermInfosReader *tir, Term *t)
{
    Term **terms = tir->index_terms;
    int low = 0;
    int high = tir->index_size - 1;

    while (low <= high) {
        int middle = (low + high) >> 1;
        int cmp = term_cmp(t, terms[middle]);

        if (cmp == 0) {
            return middle;
        }
        if (cmp < 0) {
            high = middle - 1;
        } else {
            low = middle + 1;
        }
    }
    return high;
}
|
723
|
+
|
724
|
+
/*
 * Look up the TermInfo for term +t+.
 *
 * Returns NULL straight away when the reader holds no terms. Otherwise the
 * index is loaded if necessary and the calling thread's enum is used:
 *
 *  - sequential case: if the enum is not exhausted, its current term is not
 *    after +t+, and +t+ lies before the next index term (or there is no
 *    next index term), the enum is scanned forward from where it stands;
 *  - otherwise the enum is first repositioned at the index entry found by
 *    tir_get_index_offset and then scanned.
 *
 * The result is whatever ste_scan_for_term_info returns for +t+.
 */
TermInfo *tir_get_ti(TermInfosReader *tir, Term *t)
{
    if (tir->size == 0) {
        return NULL;
    }

    tir_ensure_index_is_read(tir);

    // optimize sequential access: first try scanning cached enum w/o seeking
    TermEnum *te = tir_enum(tir);
    SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
    if (ste->pos < ste->size && tb_term_cmp(te->tb_curr, t) <= 0) {
        // slot of the index term that starts the block after the current one
        int enum_offset = (int)(ste->pos / ste->index_interval) + 1;
        if (tir->index_size == enum_offset ||
            term_cmp(t, tir->index_terms[enum_offset]) < 0) { // but before end of block
            return ste_scan_for_term_info(te, t);             // no need to seek
        }
    }

    // random-access: must seek
    tir_seek_enum(tir, tir_get_index_offset(tir, t));
    return ste_scan_for_term_info(te, t);
}
|
747
|
+
|
748
|
+
/*
 * Return the term at ordinal position +pos+.
 *
 * Returns NULL when the reader holds no terms. If +pos+ lies within
 * index_interval entries at or after the calling thread's enum position,
 * the enum is simply scanned forward. Otherwise the enum is repositioned at
 * index entry pos / index_interval and then scanned.
 *
 * The seek reads the in-memory index arrays, which stay NULL until
 * tir_ensure_index_is_read has run, so the index is loaded before seeking.
 */
Term *tir_get_term(TermInfosReader *tir, int pos)
{
    if (tir->size == 0) {
        return NULL;
    }

    TermEnum *te = tir_enum(tir);
    SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
    if (pos >= ste->pos &&
        pos < (ste->pos + ste->index_interval)) {
        return ste_scan_for_term(te, pos);                // can avoid seek
    }

    // tir_seek_enum indexes index_pointers/index_terms/index_term_infos;
    // make sure they have been loaded first
    tir_ensure_index_is_read(tir);

    tir_seek_enum(tir, (int)(pos / ste->index_interval)); // must seek
    return ste_scan_for_term(te, pos);
}
|
763
|
+
|
764
|
+
/*
 * Return the ordinal position of term +t+, or -1 if it is not found.
 *
 * Returns -1 when the reader holds no terms. Otherwise the index is loaded
 * if necessary, the calling thread's enum is positioned at the index entry
 * chosen by tir_get_index_offset, and the enum is advanced while its
 * current term orders before +t+ and more terms remain. The enum's pos is
 * returned only if the term it stops on compares equal to +t+.
 */
int tir_get_term_pos(TermInfosReader *tir, Term *t)
{
    int ind_offset;
    TermEnum *te;

    if (tir->size == 0) {
        return -1;
    }

    tir_ensure_index_is_read(tir);

    ind_offset = tir_get_index_offset(tir, t);
    tir_seek_enum(tir, ind_offset);

    te = tir_enum(tir);
    while (tb_term_cmp(te->tb_curr, t) < 0) {
        if (ste_next(te) == NULL) {
            break;      // ran out of terms
        }
    }

    if (tb_term_cmp(te->tb_curr, t) != 0) {
        return -1;
    }
    return ((SegmentTermEnum *)te->data)->pos;
}
|
783
|
+
|