ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/store.c
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#include "store.h"
|
2
|
+
|
3
|
+
/*
 * Run func(arg) while holding lock.
 *
 * lock - lock to obtain before the callback and release after it
 * func - callback executed while the lock is held
 * arg  - opaque pointer passed to func unchanged
 *
 * When the lock cannot be obtained an IO_ERROR is reported through
 * eprintf(). If eprintf() returns to this function, the callback is skipped
 * and the lock is not released, because it was never held.
 */
void with_lock(Lock *lock, void (*func)(void *arg), void *arg)
{
    if (!lock->obtain(lock)) {
        eprintf(IO_ERROR, "Could not obtain lock: <%s>", lock->name);
    } else {
        func(arg);
        lock->release(lock);
    }
}
|
10
|
+
|
11
|
+
/*
 * Open the lock called lock_name in store, run func(arg) while holding it,
 * then release and close the lock.
 *
 * store     - store whose open_lock/close_lock callbacks are used
 * lock_name - name passed to store->open_lock
 * func      - callback executed while the lock is held
 * arg       - opaque pointer passed to func unchanged
 *
 * When the lock cannot be obtained an IO_ERROR is reported through
 * eprintf(). If eprintf() returns to this function, the callback is skipped
 * and the lock is not released (it was never held), but the lock handle is
 * still closed.
 */
void with_lock_name(Store *store, char *lock_name,
                    void (*func)(void *arg), void *arg)
{
    Lock *lock = store->open_lock(store, lock_name);

    if (!lock->obtain(lock)) {
        eprintf(IO_ERROR, "Could not obtain lock: <%s>", lock->name);
    } else {
        func(arg);
        lock->release(lock);
    }
    store->close_lock(lock);
}
|
21
|
+
|
22
|
+
Store *store_create()
|
23
|
+
{
|
24
|
+
Store *store = ALLOC(Store);
|
25
|
+
mutex_init(&store->mutex, NULL);
|
26
|
+
mutex_init(&store->ext_mutex, NULL);
|
27
|
+
return store;
|
28
|
+
}
|
29
|
+
|
30
|
+
/*
 * Tear down a Store made by store_create(): destroy both mutexes, then
 * free the structure itself. Nothing else the store refers to is released
 * here.
 */
void store_destroy(Store *store)
{
    mutex_destroy(&(store->mutex));
    mutex_destroy(&(store->ext_mutex));

    free(store);
}
|
data/ext/store.h
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
#include "global.h"
|
2
|
+
#include "hash.h"
|
3
|
+
|
4
|
+
#ifndef FRT_STORE_H
|
5
|
+
#define FRT_STORE_H
|
6
|
+
|
7
|
+
/* Number of bytes held by a stream Buffer. */
#define BUFFER_SIZE 1024
/* Prefix used for lock names. NOTE(review): confirm against the store implementations. */
#define LOCK_PREFIX "ferret-"

/* Presumably the largest encoded size of a vint, in bytes; TODO confirm. */
#define VINT_MAX_LEN 10
/*
 * BUFFER_SIZE less VINT_MAX_LEN. Parenthesised so the subtraction is not
 * re-associated by operators around the macro, e.g. 2 * VINT_END.
 */
#define VINT_END (BUFFER_SIZE - VINT_MAX_LEN)
|
12
|
+
|
13
|
+
typedef struct Buffer {
|
14
|
+
uchar buf[BUFFER_SIZE];
|
15
|
+
int start;
|
16
|
+
int pos;
|
17
|
+
int len;
|
18
|
+
} Buffer;
|
19
|
+
|
20
|
+
typedef struct OutStream {
|
21
|
+
Buffer buf;
|
22
|
+
void *file;
|
23
|
+
int pointer; // only used by RAMOut
|
24
|
+
void (*flush_internal)(struct OutStream *os, uchar *buf, int len);
|
25
|
+
void (*seek_internal)(struct OutStream *os, int pos);
|
26
|
+
void (*close_internal)(struct OutStream *os);
|
27
|
+
} OutStream;
|
28
|
+
|
29
|
+
typedef struct CompoundInStream CompoundInStream;
|
30
|
+
|
31
|
+
typedef struct InStream {
|
32
|
+
int is_clone;
|
33
|
+
Buffer buf;
|
34
|
+
void *file;
|
35
|
+
union {
|
36
|
+
int pointer; // only used by RAMIn
|
37
|
+
char *path; // only used by FSIn
|
38
|
+
CompoundInStream *cis;
|
39
|
+
} d;
|
40
|
+
void (*read_internal)(struct InStream *is, uchar *buf, int offset, int len);
|
41
|
+
void (*seek_internal)(struct InStream *is, int pos);
|
42
|
+
void (*close_internal)(struct InStream *is);
|
43
|
+
void (*clone_internal)(struct InStream *is, struct InStream *new_index_i);
|
44
|
+
int (*length_internal)(struct InStream *is);
|
45
|
+
} InStream;
|
46
|
+
|
47
|
+
struct CompoundInStream {
|
48
|
+
InStream *sub;
|
49
|
+
int offset;
|
50
|
+
int length;
|
51
|
+
};
|
52
|
+
|
53
|
+
/*
 * Call the stream's length_internal callback on itself. The argument and
 * the whole expansion are parenthesised so that any pointer expression can
 * be passed. Note that mis is evaluated twice.
 */
#define is_length(mis) ((mis)->length_internal(mis))
|
54
|
+
|
55
|
+
typedef struct Store Store;
|
56
|
+
typedef struct Lock Lock;
|
57
|
+
struct Lock {
|
58
|
+
char *name;
|
59
|
+
Store *store;
|
60
|
+
int (*obtain)(Lock *lock);
|
61
|
+
int (*is_locked)(Lock *lock);
|
62
|
+
void (*release)(Lock *lock);
|
63
|
+
};
|
64
|
+
|
65
|
+
typedef struct CompoundStore {
|
66
|
+
Store *store;
|
67
|
+
const char *name;
|
68
|
+
HshTable *entries;
|
69
|
+
InStream *stream;
|
70
|
+
} CompoundStore;
|
71
|
+
|
72
|
+
struct Store {
|
73
|
+
int ref_cnt; /* for fs_store only */
|
74
|
+
mutex_t mutex;
|
75
|
+
mutex_t ext_mutex;
|
76
|
+
union {
|
77
|
+
char *path; /* for fs_store only */
|
78
|
+
HshTable *ht; /* for ram_store only */
|
79
|
+
CompoundStore *cmpd; /* for compound_store only */
|
80
|
+
} dir;
|
81
|
+
void (*touch)(Store *store, char *filename);
|
82
|
+
int (*exists)(Store *store, char *filename);
|
83
|
+
int (*remove)(Store *store, char *filename);
|
84
|
+
int (*rename)(Store *store, char *from, char *to);
|
85
|
+
int (*count)(Store *store);
|
86
|
+
void (*close)(Store *store);
|
87
|
+
void (*clear)(Store *store);
|
88
|
+
void (*clear_all)(Store *store);
|
89
|
+
void (*clear_locks)(Store *store);
|
90
|
+
int (*length)(Store *store, char *filename);
|
91
|
+
void (*each)(Store *store, void (*func)(char *fname, void *arg), void *arg);
|
92
|
+
OutStream *(*create_output)(Store *store, const char *filename);
|
93
|
+
InStream *(*open_input)(Store *store, const char *filename);
|
94
|
+
Lock *(*open_lock)(Store *store, char *lockname);
|
95
|
+
void (*close_lock)(Lock *lock);
|
96
|
+
};
|
97
|
+
|
98
|
+
/*
 * Call the store's close callback on itself. The argument and the whole
 * expansion are parenthesised so that any pointer expression can be passed.
 * Note that mstore is evaluated twice.
 */
#define store_close(mstore) ((mstore)->close(mstore))
|
99
|
+
|
100
|
+
Store *store_create();
|
101
|
+
void store_destroy(Store *store);
|
102
|
+
Store *open_fs_store(const char *pathname);
|
103
|
+
Store *open_ram_store();
|
104
|
+
Store *open_ram_store_and_copy(Store *store, bool close_dir);
|
105
|
+
Store *open_cmpd_store(Store *sub, const char *filename);
|
106
|
+
void ram_close(Store *store);
|
107
|
+
Buffer *buf_create();
|
108
|
+
void os_flush(OutStream *os);
|
109
|
+
void os_close(OutStream *os);
|
110
|
+
int os_pos(OutStream *os);
|
111
|
+
void os_seek(OutStream *os, int new_pos);
|
112
|
+
void os_write_byte(OutStream *os, uchar b);
|
113
|
+
void os_write_bytes(OutStream *os, uchar *b, int len);
|
114
|
+
uchar is_read_byte(InStream *is);
|
115
|
+
int is_pos(InStream *is);
|
116
|
+
uchar *is_read_bytes(InStream *is, uchar *b, int offset, int len);
|
117
|
+
void is_seek(InStream *is, int pos);
|
118
|
+
InStream *is_clone(InStream *is);
|
119
|
+
void is_close(InStream *is);
|
120
|
+
int is_read_int(InStream *is);
|
121
|
+
long long is_read_long(InStream *is);
|
122
|
+
unsigned int is_read_uint(InStream *is);
|
123
|
+
unsigned long long is_read_ulong(InStream *is);
|
124
|
+
unsigned long long is_read_vint(InStream *is);
|
125
|
+
void is_read_chars(InStream *is, char* buffer, int off, int len) ;
|
126
|
+
char *is_read_string(InStream *is);
|
127
|
+
void os_write_int(OutStream *os, int l);
|
128
|
+
void os_write_long(OutStream *os, long long l);
|
129
|
+
void os_write_uint(OutStream *os, unsigned int l);
|
130
|
+
void os_write_ulong(OutStream *os, unsigned long long l);
|
131
|
+
void os_write_vint(OutStream *os, register unsigned long long i);
|
132
|
+
void os_write_chars(OutStream *os, char *buf, int start, int length);
|
133
|
+
void os_write_string(OutStream *os, char *str);
|
134
|
+
OutStream *os_create();
|
135
|
+
InStream *is_create();
|
136
|
+
void buf_destroy(Buffer *buf);
|
137
|
+
|
138
|
+
// RamStore functions
|
139
|
+
int ramo_length(OutStream *os);
|
140
|
+
void ramo_reset(OutStream *os);
|
141
|
+
int rami_length(InStream *is);
|
142
|
+
void ramo_write_to(OutStream *os, OutStream *other_o);
|
143
|
+
OutStream *ram_create_buffer();
|
144
|
+
void ram_destroy_buffer(OutStream *os);
|
145
|
+
|
146
|
+
int file_is_lock(char *filename);
|
147
|
+
|
148
|
+
void with_lock(Lock *lock, void (*func)(void *arg), void *arg);
|
149
|
+
void with_lock_name(Store *store, char *lock_name,
|
150
|
+
void (*func)(void *arg), void *arg);
|
151
|
+
|
152
|
+
#endif
|
data/ext/term.c
CHANGED
@@ -1,222 +1,783 @@
|
|
1
|
-
#include
|
1
|
+
#include <index.h>
|
2
|
+
#include <string.h>
|
3
|
+
#include <helper.h>
|
4
|
+
#include <hash.h>
|
2
5
|
|
6
|
+
/****************************************************************************
|
7
|
+
*
|
8
|
+
* Term
|
9
|
+
*
|
10
|
+
****************************************************************************/
|
11
|
+
|
12
|
+
/*
 * Make a new Term equal to term.
 *
 * The text is duplicated with estrdup(); the field pointer is shared with
 * the original, not copied.
 */
Term *term_clone(Term *term)
{
    Term *copy = ALLOC(Term);

    copy->text = estrdup(term->text);
    copy->field = term->field;

    return copy;
}
|
20
|
+
|
21
|
+
Term *term_create(const char *field, char *text)
|
22
|
+
{
|
23
|
+
Term *t = ALLOC(Term);
|
24
|
+
|
25
|
+
t->field = (char *)field;
|
26
|
+
t->text = estrdup(text);
|
27
|
+
return t;
|
28
|
+
}
|
29
|
+
|
30
|
+
void term_destroy(void *p)
|
31
|
+
{
|
32
|
+
Term *t = (Term *)p;
|
33
|
+
free(t->text);
|
34
|
+
free(t);
|
35
|
+
}
|
36
|
+
|
37
|
+
int term_cmp(void *t1, void *t2)
|
38
|
+
{
|
39
|
+
int res = strcmp(((Term *)t1)->field, ((Term *)t2)->field);
|
40
|
+
if (res != 0) {
|
41
|
+
return res;
|
42
|
+
} else {
|
43
|
+
return strcmp(((Term *)t1)->text, ((Term *)t2)->text);
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
int term_eq(const void *t1, const void *t2)
|
48
|
+
{
|
49
|
+
return (strcmp(((Term *)t1)->text, ((Term *)t2)->text)) == 0 &&
|
50
|
+
(strcmp(((Term *)t1)->field, ((Term *)t2)->field) == 0);
|
51
|
+
}
|
52
|
+
|
53
|
+
unsigned int term_hash(const void *t)
|
54
|
+
{
|
55
|
+
return str_hash(((Term *)t)->text) * str_hash(((Term *)t)->field);
|
56
|
+
}
|
57
|
+
|
58
|
+
/*
 * Format a Term as "field:text".
 *
 * Returns a newly allocated string; the caller owns it.
 */
char *term_to_s(Term *term)
{
    /* +2 leaves room for the ':' separator and the terminating NUL */
    char *out = ALLOC_N(char, strlen(term->field) + strlen(term->text) + 2);

    sprintf(out, "%s:%s", term->field, term->text);
    return out;
}
|
3
64
|
|
4
65
|
/****************************************************************************
|
5
66
|
*
|
6
|
-
*
|
67
|
+
* TermBuffer
|
7
68
|
*
|
8
69
|
****************************************************************************/
|
9
70
|
|
10
|
-
void
|
11
|
-
|
71
|
+
/*
 * Put a TermBuffer back into its empty state: field points at
 * EMPTY_STRING and text is the empty string.
 */
void tb_reset(TermBuffer *tb)
{
    tb->text[0] = '\0';
    tb->field = (char *)EMPTY_STRING;
}
|
76
|
+
|
77
|
+
TermBuffer *tb_create()
|
78
|
+
{
|
79
|
+
TermBuffer *tb = ALLOC(TermBuffer);
|
80
|
+
tb->field = (char *)EMPTY_STRING;
|
81
|
+
tb->text[0] = '\0';
|
82
|
+
return tb;
|
83
|
+
}
|
84
|
+
|
85
|
+
/*
 * Free a TermBuffer. Only the buffer itself is freed; the field string it
 * points to is not.
 *
 * p - pointer to a TermBuffer (void * so it can be used as a generic
 *     destructor)
 */
void tb_destroy(void *p)
{
    free(p);
}
|
17
89
|
|
18
|
-
|
19
|
-
|
90
|
+
/*
 * Load a Term into a TermBuffer: the field pointer is shared and the text
 * is copied with strcpy.
 *
 * NOTE(review): the copy is unbounded; t->text must fit in tb->text.
 *
 * Returns tb.
 */
TermBuffer *tb_set_term(TermBuffer *tb, Term *t)
{
    strcpy(tb->text, t->text);
    tb->field = t->field;

    return tb;
}
|
96
|
+
|
97
|
+
/*
 * Build a new Term from the buffer's current field and text using
 * term_create(). The caller owns the returned Term.
 */
Term *tb_get_term(TermBuffer *tb)
{
    Term *snapshot = term_create(tb->field, tb->text);

    return snapshot;
}
|
24
101
|
|
25
|
-
|
26
|
-
frt_term_alloc(VALUE klass)
|
102
|
+
/*
 * Order two TermBuffers: by field first, then by text (both via strcmp).
 *
 * Returns a value <0, 0 or >0 in the manner of strcmp.
 */
int tb_cmp(TermBuffer *tb1, TermBuffer *tb2)
{
    int by_field = strcmp(tb1->field, tb2->field);

    return (by_field != 0) ? by_field : strcmp(tb1->text, tb2->text);
}
|
33
111
|
|
34
|
-
|
35
|
-
VALUE
|
36
|
-
frt_term_set(VALUE self, VALUE rfield, VALUE rtext)
|
112
|
+
/*
 * Order a TermBuffer against a Term: by field first, then by text (both
 * via strcmp).
 *
 * Returns a value <0, 0 or >0 in the manner of strcmp.
 */
int tb_term_cmp(TermBuffer *tb, Term *t)
{
    int by_field = strcmp(tb->field, t->field);

    return (by_field != 0) ? by_field : strcmp(tb->text, t->text);
}
|
40
121
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
return Qnil;
|
122
|
+
/*
 * Copy tb2 into tb1: the field pointer is shared and the text is copied
 * with strcpy.
 *
 * Returns tb1.
 */
TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2)
{
    strcpy(tb1->text, tb2->text);
    tb1->field = tb2->field;

    return tb1;
}
|
49
128
|
|
50
|
-
|
51
|
-
frt_term_init(VALUE self, VALUE rfield, VALUE rtext)
|
129
|
+
/*
 * Read the next term from the stream into tb.
 *
 * The stream supplies, in order: a start offset, a byte count, that many
 * text bytes, and a field number. The bytes are stored into tb->text at
 * the start offset, so whatever the buffer already holds before that
 * offset is kept; the text is then NUL-terminated at start + count.
 * A negative field number selects EMPTY_STRING, anything else is looked up
 * in fis->by_number.
 *
 * NOTE(review): neither start + count nor the field number is range
 * checked here.
 *
 * Returns tb.
 */
TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis)
{
    int prefix_len = is_read_vint(is);
    int suffix_len = is_read_vint(is);
    int end = prefix_len + suffix_len;
    int field_num;

    is_read_bytes(is, (uchar *)tb->text, prefix_len, suffix_len);
    tb->text[end] = '\0';

    field_num = is_read_vint(is);
    tb->field = (field_num < 0) ? (char *)EMPTY_STRING
                                : fis->by_number[field_num]->name;
    return tb;
}
|
56
143
|
|
57
|
-
|
58
|
-
|
144
|
+
/****************************************************************************
|
145
|
+
*
|
146
|
+
* TermInfo
|
147
|
+
*
|
148
|
+
****************************************************************************/
|
149
|
+
|
150
|
+
TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
|
59
151
|
{
|
60
|
-
|
61
|
-
|
152
|
+
TermInfo *ti = ALLOC(TermInfo);
|
153
|
+
ti->doc_freq = doc_freq;
|
154
|
+
ti->freq_pointer = freq_pointer;
|
155
|
+
ti->prox_pointer = prox_pointer;
|
156
|
+
ti->skip_offset = skip_offset;
|
157
|
+
return ti;
|
62
158
|
}
|
63
159
|
|
64
|
-
|
65
|
-
frt_term_set_text(VALUE self, VALUE rtext)
|
160
|
+
/*
 * Overwrite the four values of an existing TermInfo.
 *
 * Returns ti.
 */
TermInfo *ti_set(TermInfo *ti, int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
{
    ti->skip_offset = skip_offset;
    ti->prox_pointer = prox_pointer;
    ti->freq_pointer = freq_pointer;
    ti->doc_freq = doc_freq;

    return ti;
}
|
72
168
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
term->tlen = tlen;
|
77
|
-
|
78
|
-
return Qnil;
|
169
|
+
/*
 * Free a TermInfo.
 *
 * p - pointer to a TermInfo (void * so it can be used as a generic
 *     destructor)
 */
void ti_destroy(void *p)
{
    free(p);
}
|
80
173
|
|
81
|
-
|
82
|
-
frt_term_get_field(VALUE self)
|
174
|
+
/*
 * Copy the whole of other into ti, byte for byte.
 *
 * Returns ti.
 */
TermInfo *ti_cpy(TermInfo *ti, TermInfo *other)
{
    memcpy(ti, other, sizeof(*ti));

    return ti;
}
|
87
179
|
|
88
|
-
|
89
|
-
frt_term_set_field(VALUE self, VALUE rfield)
|
180
|
+
/*
 * Allocate a new TermInfo carrying the same doc_freq, freq_pointer,
 * prox_pointer and skip_offset as other. The caller owns the result.
 */
TermInfo *ti_clone(TermInfo *other)
{
    int doc_freq = other->doc_freq;
    int freq_pointer = other->freq_pointer;
    int prox_pointer = other->prox_pointer;
    int skip_offset = other->skip_offset;

    return ti_create(doc_freq, freq_pointer, prox_pointer, skip_offset);
}
|
95
185
|
|
96
|
-
|
97
|
-
frt_term_to_s(VALUE self)
|
186
|
+
/*
 * Test two TermInfos for equality.
 *
 * The four values set by ti_create() are compared member by member. A raw
 * memcmp() over the struct would also compare padding bytes, and
 * ti_create() never initialises those, so it could report a difference
 * between logically equal values.
 *
 * Returns 1 when doc_freq, freq_pointer, prox_pointer and skip_offset all
 * match, else 0.
 */
int ti_eq(TermInfo *ti, TermInfo *other)
{
    return (ti->doc_freq == other->doc_freq) &&
           (ti->freq_pointer == other->freq_pointer) &&
           (ti->prox_pointer == other->prox_pointer) &&
           (ti->skip_offset == other->skip_offset);
}
|
190
|
+
|
191
|
+
/****************************************************************************
|
192
|
+
*
|
193
|
+
* TermEnum
|
194
|
+
*
|
195
|
+
****************************************************************************/
|
196
|
+
|
197
|
+
TermEnum *te_create()
|
198
|
+
{
|
199
|
+
TermEnum *te = ALLOC(TermEnum);
|
200
|
+
te->tb_curr = tb_create();
|
201
|
+
te->tb_prev = tb_create();
|
202
|
+
te->ti_curr = ti_create(0, 0, 0, 0);
|
203
|
+
return te;
|
204
|
+
}
|
205
|
+
|
206
|
+
void te_destroy(void *p)
|
207
|
+
{
|
208
|
+
TermEnum *te = (TermEnum *)p;
|
209
|
+
tb_destroy(te->tb_curr);
|
210
|
+
tb_destroy(te->tb_prev);
|
211
|
+
ti_destroy(te->ti_curr);
|
212
|
+
free(p);
|
213
|
+
}
|
214
|
+
|
215
|
+
/*
 * Return a newly allocated Term built from the enum's current TermBuffer.
 * The caller owns the result.
 */
Term *te_get_term(TermEnum *te)
{
    TermBuffer *current = te->tb_curr;

    return tb_get_term(current);
}
|
219
|
+
|
220
|
+
/*
 * Return a newly allocated copy of the enum's current TermInfo
 * (doc_freq, freq_pointer, prox_pointer and skip_offset). The caller owns
 * the result.
 */
TermInfo *te_get_ti(TermEnum *te)
{
    return ti_clone(te->ti_curr);
}
|
225
|
+
|
226
|
+
/*
 * Advance the enum until its current term is not less than t.
 *
 * If the current term already equals t nothing is consumed. Otherwise
 * te->next() is called at least once and repeatedly until it returns NULL
 * or a term that compares >= t.
 *
 * Returns the TermBuffer the enum stopped on, or NULL when the enum ran
 * out.
 */
TermBuffer *te_skip_to(TermEnum *te, Term *t)
{
    TermBuffer *candidate;

    if (tb_term_cmp(te->tb_curr, t) == 0) {
        return te->tb_curr;
    }

    do {
        candidate = te->next(te);
    } while ((candidate != NULL) && (tb_term_cmp(candidate, t) < 0));

    return candidate;
}
|
237
|
+
|
238
|
+
/****************************************************************************
|
239
|
+
*
|
240
|
+
* SegmentTermEnum
|
241
|
+
*
|
242
|
+
****************************************************************************/
|
243
|
+
|
244
|
+
#define GET_STE SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
|
245
|
+
|
246
|
+
/*
 * Advance a segment term enum by one entry.
 *
 * Past the last entry the current TermBuffer is reset and NULL is
 * returned. Otherwise the current term becomes the previous one, the next
 * term is read from the stream, and the TermInfo is updated: doc_freq is
 * read outright, while the freq and prox pointers are stored as deltas and
 * added to the running values. A skip offset and, for index files, an
 * index pointer delta follow when present.
 *
 * Returns the enum's current TermBuffer, or NULL at the end.
 */
TermBuffer *ste_next(TermEnum *te)
{
    SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
    InStream *in = ste->is;
    TermInfo *info;

    ste->pos++;
    if (ste->pos > ste->size - 1) {
        tb_reset(te->tb_curr);
        return NULL;
    }

    tb_cpy(te->tb_prev, te->tb_curr);
    tb_read(te->tb_curr, in, ste->fis);

    info = te->ti_curr;
    info->doc_freq = is_read_vint(in);       /* read doc freq */
    info->freq_pointer += is_read_vint(in);  /* read freq pointer */
    info->prox_pointer += is_read_vint(in);  /* read prox pointer */

    if (ste->format == -1) {
        /* just read skip_offset in order to increment file pointer;
         * value is never used since skip_to is switched off */
        if (!ste->is_index && (info->doc_freq > ste->format_m1skip_interval)) {
            info->skip_offset = is_read_vint(in);
        }
    } else if (info->doc_freq >= ste->skip_interval) {
        info->skip_offset = is_read_vint(in);
    }

    if (ste->is_index) {
        ste->index_pointer += is_read_vint(in);  /* read index pointer */
    }

    return te->tb_curr;
}
|
112
281
|
|
113
|
-
|
114
|
-
|
282
|
+
TermEnum *ste_clone(TermEnum *other_te);
|
283
|
+
TermEnum *ste_allocate()
|
115
284
|
{
|
116
|
-
|
285
|
+
TermEnum *te = te_create();
|
286
|
+
te->next = &ste_next;
|
287
|
+
te->close = &ste_close;
|
288
|
+
te->clone = &ste_clone;
|
289
|
+
SegmentTermEnum *ste =
|
290
|
+
ALLOC(SegmentTermEnum);
|
291
|
+
te->data = ste;
|
292
|
+
return te;
|
293
|
+
}
|
294
|
+
|
295
|
+
/*
 * Duplicate a segment term enum.
 *
 * The SegmentTermEnum state is copied byte for byte, then its stream is
 * replaced with is_clone() of the original's stream. The current and
 * previous TermBuffers and the current TermInfo are copied as well.
 *
 * Returns the new enum.
 */
TermEnum *ste_clone(TermEnum *other_te)
{
    SegmentTermEnum *src = (SegmentTermEnum *)other_te->data;
    TermEnum *dup = ste_allocate();
    SegmentTermEnum *dst = (SegmentTermEnum *)dup->data;

    memcpy(dst, src, sizeof(SegmentTermEnum));
    dst->is = is_clone(src->is);  /* replace the stream pointer copied above */

    tb_cpy(dup->tb_curr, other_te->tb_curr);
    tb_cpy(dup->tb_prev, other_te->tb_prev);
    ti_cpy(dup->ti_curr, other_te->ti_curr);

    return dup;
}
|
307
|
+
|
308
|
+
/*
 * Close a segment term enum: close its stream, free the SegmentTermEnum
 * state and then destroy the TermEnum itself.
 */
void ste_close(TermEnum *te)
{
    SegmentTermEnum *seg = (SegmentTermEnum *)te->data;

    is_close(seg->is);
    free(seg);
    te->data = NULL;

    te_destroy(te);
}
|
316
|
+
|
317
|
+
/*
 * Create a segment term enum reading from is.
 *
 * is       - stream positioned at the start of the term info data
 * fis      - field infos used to resolve field numbers
 * is_index - non-zero when the stream is the term index file
 *
 * The first int in the stream decides the layout. A non-negative value is
 * the entry count of an original-format file with no version number. A
 * negative value is a format version, followed by the entry count and the
 * interval settings.
 *
 * Returns the new enum.
 */
TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index)
{
    TermEnum *te = ste_allocate();
    SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
    int header;

    ste->fis = fis;
    ste->is_index = is_index;
    ste->is = is;
    ste->pos = -1;
    ste->index_pointer = 0;
    ste->format_m1skip_interval = -1;

    header = is_read_int(is);

    if (header >= 0) {
        /* original-format file, without explicit format version number */
        ste->format = 0;
        ste->size = header;

        /* back-compatible settings */
        ste->index_interval = 128;
        ste->skip_interval = INT_MAX;  /* switch off skip_to optimization */
        return te;
    }

    /* check that it is a format we can understand */
    if (header < TERM_INFO_FORMAT) {
        eprintf(ERROR, "Unknown format version:%d", header);
    }

    /* we have a format version number */
    ste->format = header;
    ste->size = is_read_long(is);  /* read the size */

    if (ste->format != -1) {
        ste->index_interval = is_read_int(is);
        ste->skip_interval = is_read_int(is);
    } else {
        if (!ste->is_index) {
            ste->index_interval = is_read_int(is);
            ste->format_m1skip_interval = is_read_int(is);
        }
        /* switch off skip_to optimization for file format prior to
         * 1.4rc2 in order to avoid a bug in skip_to implementation
         * of these versions */
        ste->skip_interval = INT_MAX;
    }

    return te;
}
|
366
|
+
|
367
|
+
/*
 * Reposition a segment term enum.
 *
 * pointer - stream offset to seek to
 * pos     - entry position to record
 * t       - term loaded into the current TermBuffer
 * ti      - term info copied into the current TermInfo
 *
 * The previous TermBuffer is reset.
 */
void ste_seek(TermEnum *te, int pointer, int pos, Term *t, TermInfo *ti)
{
    SegmentTermEnum *seg = (SegmentTermEnum *)te->data;

    is_seek(seg->is, pointer);
    seg->pos = pos;

    tb_set_term(te->tb_curr, t);
    tb_reset(te->tb_prev);
    ti_cpy(te->ti_curr, ti);
}
|
376
|
+
|
377
|
+
/*
 * Skip forward to t and return its term info.
 *
 * Returns a newly allocated TermInfo when the enum lands exactly on t
 * (the caller owns it), otherwise NULL.
 */
TermInfo *ste_scan_for_term_info(TermEnum *te, Term *t)
{
    te_skip_to(te, t);

    if (tb_term_cmp(te->tb_curr, t) != 0) {
        return NULL;
    }
    return te_get_ti(te);
}
|
136
387
|
|
137
|
-
int
|
138
|
-
frt_term_compare_to_int(VALUE self, VALUE rother)
|
388
|
+
/*
 * Advance the enum until its position reaches pos and return the term
 * found there.
 *
 * Returns a newly allocated Term (the caller owns it), or NULL when the
 * enum runs out before reaching pos.
 */
Term *ste_scan_for_term(TermEnum *te, int pos)
{
    SegmentTermEnum *seg = (SegmentTermEnum *)te->data;

    for (; seg->pos < pos; ) {
        if (ste_next(te) == NULL) {
            return NULL;
        }
    }

    return te_get_term(te);
}
|
145
398
|
|
146
|
-
|
147
|
-
|
399
|
+
/****************************************************************************
|
400
|
+
*
|
401
|
+
* MultiTermEnum
|
402
|
+
*
|
403
|
+
****************************************************************************/
|
404
|
+
|
405
|
+
#define GET_MTE MultiTermEnum *mte = (MultiTermEnum *)te->data;
|
406
|
+
|
407
|
+
/*
 * Advance a multi-segment term enum to the next distinct term.
 *
 * The segment at the top of the merge queue supplies the next term. Every
 * queued segment currently on that same term is popped, its doc_freq is
 * added to the running total, and it is then either advanced and pushed
 * back or destroyed when exhausted.
 *
 * Returns the enum's current TermBuffer, or NULL when the queue is empty
 * (the current TermBuffer is reset in that case).
 */
TermBuffer *mte_next(TermEnum *te)
{
    GET_MTE;
    SegmentMergeInfo *top = (SegmentMergeInfo *)pq_top(mte->smi_queue);

    if (top == NULL) {
        tb_reset(te->tb_curr);
        return NULL;  /* a null pointer, matching ste_next(), not the boolean `false` */
    }

    tb_cpy(te->tb_prev, te->tb_curr);
    tb_cpy(te->tb_curr, top->tb);

    te->ti_curr->doc_freq = 0;

    while ((top != NULL) && (tb_cmp(te->tb_curr, top->tb) == 0)) {
        pq_pop(mte->smi_queue);
        te->ti_curr->doc_freq += top->te->ti_curr->doc_freq;  /* increment freq */
        if (smi_next(top)) {
            pq_push(mte->smi_queue, top);  /* restore queue */
        } else {
            smi_destroy(top);              /* done with a segment */
        }
        top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
    }
    return te->tb_curr;
}
|
151
434
|
|
152
|
-
|
153
|
-
frt_term_gt(VALUE self, VALUE rother)
|
435
|
+
/*
 * Close a multi-segment term enum: clear and destroy the merge queue, free
 * the MultiTermEnum state and destroy the TermEnum itself.
 */
void mte_close(TermEnum *te)
{
    MultiTermEnum *multi = (MultiTermEnum *)te->data;

    pq_clear(multi->smi_queue);
    pq_destroy(multi->smi_queue);
    free(multi);

    te_destroy(te);
}
|
157
443
|
|
158
|
-
|
159
|
-
frt_term_le(VALUE self, VALUE rother)
|
444
|
+
/*
 * Cloning is not supported for multi-segment term enums. An ERROR is
 * reported through eprintf(); if that returns, NULL is returned.
 */
TermEnum *mte_clone(TermEnum *te)
{
    TermEnum *no_clone = NULL;

    eprintf(ERROR, "MultiTermEnum does not support cloning");
    return no_clone;
}
|
163
449
|
|
164
|
-
|
165
|
-
frt_term_ge(VALUE self, VALUE rother)
|
450
|
+
/*
 * Create a term enum that merges the terms of several readers.
 *
 * readers - array of rcnt index readers
 * starts  - array of rcnt values, one passed to smi_create() per reader
 * rcnt    - number of readers
 * t       - term to start from, or NULL to start from the beginning
 *
 * One sub-enum is opened per reader and wrapped in a SegmentMergeInfo.
 * It is queued when it has a term to offer: with t == NULL that means
 * smi_next() succeeded; otherwise it means the sub-enum's current field is
 * not the EMPTY_STRING sentinel (a pointer comparison). Sub-enums with
 * nothing to offer are destroyed. When t is given and something was
 * queued, the enum is advanced once so it sits on its first term.
 *
 * Returns the new enum.
 */
TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *t)
{
    int idx;
    IndexReader *reader;
    TermEnum *sub_te;
    SegmentMergeInfo *smi;
    TermEnum *te = te_create();
    MultiTermEnum *mte = ALLOC(MultiTermEnum);

    te->next = &mte_next;
    te->clone = &mte_clone;
    te->close = &mte_close;
    te->data = mte;

    mte->smi_queue = pq_create(rcnt, &smi_lt);
    mte->smi_queue->free_elem = &smi_destroy;

    for (idx = 0; idx < rcnt; idx++) {
        reader = readers[idx];
        sub_te = (t != NULL) ? reader->terms_from(reader, t)
                             : reader->terms(reader);

        smi = smi_create(starts[idx], sub_te, reader);
        if (((t == NULL) && smi_next(smi)) ||
            (sub_te->tb_curr->field != (char *)EMPTY_STRING)) {
            pq_push(mte->smi_queue, smi);  /* initialize queue */
        } else {
            smi_destroy(smi);
        }
    }

    if ((t != NULL) && (mte->smi_queue->count > 0)) {
        mte_next(te);
    }

    return te;
}
|
169
491
|
|
170
|
-
|
171
|
-
|
492
|
+
/****************************************************************************
|
493
|
+
*
|
494
|
+
* TermInfosWriter
|
495
|
+
*
|
496
|
+
****************************************************************************/
|
497
|
+
|
498
|
+
const Term EmptyTerm = {"", ""};
|
499
|
+
|
500
|
+
TermInfosWriter *tiw_open_internal(Store *store,
|
501
|
+
char *segment,
|
502
|
+
FieldInfos *fis,
|
503
|
+
int interval,
|
504
|
+
int is_index)
|
172
505
|
{
|
173
|
-
|
174
|
-
|
175
|
-
|
506
|
+
TermInfosWriter *tiw = ALLOC(TermInfosWriter);
|
507
|
+
tiw->index_interval = interval;
|
508
|
+
tiw->skip_interval = 16;
|
509
|
+
tiw->last_index_pointer = 0;
|
510
|
+
tiw->last_term = (Term *)&EmptyTerm;
|
511
|
+
tiw->last_term_info = ti_create(0,0,0,0);
|
512
|
+
tiw->size = 0;
|
513
|
+
tiw->is_index = is_index;
|
514
|
+
tiw->fis = fis;
|
515
|
+
tiw->curr_field = NULL;
|
516
|
+
tiw->curr_field_num = -1;
|
517
|
+
|
518
|
+
char fname[SEGMENT_NAME_MAX_LENGTH];
|
519
|
+
strcpy(fname, segment);
|
520
|
+
strcat(fname, (is_index ? ".tii" : ".tis"));
|
521
|
+
OutStream *os = tiw->os = store->create_output(store, fname);
|
522
|
+
os_write_int(os, TERM_INFO_FORMAT); // write format
|
523
|
+
os_write_long(os, 0); // leave space for size
|
524
|
+
os_write_int(os, tiw->index_interval); // write index_interval
|
525
|
+
os_write_int(os, tiw->skip_interval); // write skip_interval
|
526
|
+
if (!is_index) {
|
527
|
+
tiw->other = tiw_open_internal(store, segment, fis, interval, true);
|
528
|
+
tiw->other->other = tiw;
|
529
|
+
}
|
530
|
+
return tiw;
|
176
531
|
}
|
177
532
|
|
533
|
+
/*
 * Open the term infos writer for a segment. This is the non-index writer;
 * tiw_open_internal() opens and links the companion index writer.
 */
TermInfosWriter *tiw_open(Store *store, char *segment, FieldInfos *fis, int interval)
{
    TermInfosWriter *writer =
        tiw_open_internal(store, segment, fis, interval, false);

    return writer;
}
|
178
537
|
|
179
|
-
|
180
|
-
frt_term_compare_to(VALUE self, VALUE other)
|
538
|
+
/*
 * Write one term to os, encoded relative to the previously written term.
 *
 * The output is: the value returned by hlp_string_diff() for the previous
 * and new text (the shared prefix length), the length of the remaining
 * text, those remaining characters, and the field number. The field number
 * is looked up again only when t->field is a different pointer from the
 * cached one.
 *
 * The writer keeps the pointer t as its last term; it does not copy it.
 */
void tiw_write_term(TermInfosWriter *tiw, OutStream *os, Term *t)
{
    int shared = hlp_string_diff(tiw->last_term->text, t->text);
    int rest = strlen(t->text) - shared;

    os_write_vint(os, shared);                 /* write shared prefix length */
    os_write_vint(os, rest);                   /* write delta length */
    os_write_chars(os, t->text, shared, rest); /* write delta chars */

    if (tiw->curr_field != t->field) {
        tiw->curr_field = t->field;
        tiw->curr_field_num = fis_get_number(tiw->fis, t->field);
    }
    os_write_vint(os, tiw->curr_field_num);

    tiw->last_term = t;
}
|
184
554
|
|
185
|
-
|
186
|
-
frt_term_hash(VALUE self)
|
555
|
+
/*
 * Append the entry (+t+, +ti+) to the writer.
 *
 * Checks performed first (each reports through eprintf with STATE_ERROR):
 *   - on an index writer, the previous term must not compare greater than +t+
 *   - ti->freq_pointer must not be below the previous freq pointer
 *   - ti->prox_pointer must not be below the previous prox pointer
 *
 * On the non-index writer, whenever the current size is a multiple of
 * index_interval, the previous term and term info are first added to the
 * linked index writer.
 *
 * The record written is: the term (see tiw_write_term), doc_freq, the freq
 * and prox pointers as deltas from the previous entry, skip_offset when
 * doc_freq >= skip_interval, and - on an index writer only - the delta of the
 * other writer's stream position from the last recorded one.
 */
void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
{
    TermInfo *prev_ti = tiw->last_term_info;
    OutStream *out = tiw->os;

    if (tiw->is_index && term_cmp(tiw->last_term, t) > 0) {
        eprintf(STATE_ERROR,
                "term out of order %s < %s", t->text, tiw->last_term->text);
    }
    if (prev_ti->freq_pointer > ti->freq_pointer) {
        eprintf(STATE_ERROR, "freq pointer out of order");
    }
    if (prev_ti->prox_pointer > ti->prox_pointer) {
        eprintf(STATE_ERROR, "prox pointer out of order");
    }

    if (!tiw->is_index && (tiw->size % tiw->index_interval) == 0) {
        tiw_add(tiw->other, tiw->last_term, prev_ti);     // add an index term
    }

    tiw_write_term(tiw, out, t);                           // term
    os_write_vint(out, ti->doc_freq);                      // doc freq
    os_write_vint(out, ti->freq_pointer - prev_ti->freq_pointer);
    os_write_vint(out, ti->prox_pointer - prev_ti->prox_pointer);
    if (ti->doc_freq >= tiw->skip_interval) {
        os_write_vint(out, ti->skip_offset);
    }

    if (tiw->is_index) {
        // record how far the other writer's stream advanced since last time
        int data_pos = os_pos(tiw->other->os);
        os_write_vint(out, data_pos - tiw->last_index_pointer);
        tiw->last_index_pointer = data_pos;
    }

    ti_cpy(prev_ti, ti);
    tiw->size++;
}
|
588
|
+
|
589
|
+
/*
 * Finish and free a TermInfosWriter.
 *
 * Seeks the stream to offset 4 (just after the format int written on open),
 * stores the final entry count in the slot reserved there, and closes the
 * stream. A non-index writer then closes its linked index writer as well.
 * Finally the writer's last_term_info and the writer itself are released.
 */
void tiw_close(TermInfosWriter *tiw)
{
    TermInfosWriter *index_writer = tiw->is_index ? NULL : tiw->other;

    os_seek(tiw->os, 4);                 // size lives right after the format
    os_write_long(tiw->os, tiw->size);
    os_close(tiw->os);

    if (index_writer != NULL) {
        tiw_close(index_writer);
    }

    ti_destroy(tiw->last_term_info);
    free(tiw);
}
|
192
602
|
|
193
603
|
/****************************************************************************
|
194
604
|
*
|
195
|
-
*
|
605
|
+
* TermInfosReader
|
196
606
|
*
|
197
607
|
****************************************************************************/
|
198
608
|
|
199
|
-
void
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
609
|
+
/*
 * Release a TermInfosReader and what it holds.
 *
 * If the index arrays have been loaded, every stored term and term info is
 * destroyed and the three arrays are freed. Then, in this order: the original
 * enum is closed (if set), the thread key is deleted, the enum bucket is
 * destroyed, the index enum is closed (if still set), the mutex is destroyed
 * and the reader struct is freed.
 */
void tir_close(TermInfosReader *tir)
{
    Term **terms = tir->index_terms;

    if (terms != NULL) {
        const int count = tir->index_size;
        int n = 0;

        while (n < count) {
            term_destroy(terms[n]);
            ti_destroy(tir->index_term_infos[n]);
            n++;
        }
        free(tir->index_terms);
        free(tir->index_term_infos);
        free(tir->index_pointers);
    }

    if (tir->orig_te) {
        tir->orig_te->close(tir->orig_te);
    }
    thread_key_delete(tir->thread_te);
    ary_destroy(tir->te_bucket);
    if (tir->index_te) {
        tir->index_te->close(tir->index_te);
    }
    mutex_destroy(&tir->mutex);
    free(tir);
}
|
628
|
+
|
629
|
+
/*
 * Open a TermInfosReader for +segment+ in +store+.
 *
 * Opens "<segment>.tis" and wraps it in the reader's original enum, creates
 * the thread key and the enum bucket (whose destroy function is the enum's
 * close), and copies size and skip_interval from that enum. Then opens
 * "<segment>.tii" as the index enum. The index arrays are left NULL here;
 * tir_ensure_index_is_read() fills them later.
 *
 * NOTE(review): the file name is built in a SEGMENT_NAME_MAX_LENGTH buffer
 * with no length check - confirm callers keep +segment+ short enough.
 */
TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis)
{
    TermInfosReader *tir = ALLOC(TermInfosReader);
    char fname[SEGMENT_NAME_MAX_LENGTH];
    char *ext;                      // where the extension goes inside fname
    InStream *is;
    SegmentTermEnum *ste;

    mutex_init(&tir->mutex, NULL);

    strcpy(fname, segment);
    ext = fname + strlen(segment);

    // term infos file
    strcpy(ext, ".tis");
    is = store->open_input(store, fname);
    tir->orig_te = ste_create(is, fis, false);
    thread_key_create(&tir->thread_te, NULL);
    tir->te_bucket = ary_create(1, (destroy_func_t)tir->orig_te->close);

    ste = tir->orig_te->data;
    tir->size = ste->size;
    tir->skip_interval = ste->skip_interval;

    // term infos index file; its contents are loaded on demand
    strcpy(ext, ".tii");
    is = store->open_input(store, fname);
    tir->index_te = ste_create(is, fis, true);

    tir->index_terms = NULL;
    tir->index_term_infos = NULL;
    tir->index_pointers = NULL;
    return tir;
}
|
653
|
+
|
654
|
+
/*
 * Load the term index into memory if that has not happened yet.
 *
 * Runs with tir->mutex held. When tir->index_terms is still NULL, three
 * arrays sized from the index enum's size are allocated and filled by
 * stepping through the index enum: the term, the term info and the enum's
 * index_pointer for each entry. The index enum is then closed and
 * tir->index_te is reset to NULL.
 *
 * NOTE(review): the fill loop runs until ste_next() returns NULL and is not
 * bounded by the allocated size - this relies on the size stored in the file
 * matching the number of entries; confirm.
 */
void tir_ensure_index_is_read(TermInfosReader *tir)
{
    mutex_lock(&tir->mutex);

    if (tir->index_terms != NULL) {
        // already loaded
        mutex_unlock(&tir->mutex);
        return;
    }

    TermEnum *index_te = tir->index_te;
    SegmentTermEnum *ste = index_te->data;
    int count = ste->size;
    int slot;

    tir->index_size = count;
    tir->index_terms = ALLOC_N(Term *, count);
    tir->index_term_infos = ALLOC_N(TermInfo *, count);
    tir->index_pointers = ALLOC_N(int, count);

    for (slot = 0; ste_next(index_te) != NULL; slot++) {
        tir->index_terms[slot] = te_get_term(index_te);
        tir->index_term_infos[slot] = te_get_ti(index_te);
        tir->index_pointers[slot] = ste->index_pointer;
    }

    // the index enum is no longer needed once everything is in memory
    index_te->close(index_te);
    tir->index_te = NULL;

    mutex_unlock(&tir->mutex);
}
|
681
|
+
|
682
|
+
/*
 * Return the TermEnum stored under the reader's thread key.
 *
 * If nothing is stored yet, the original enum is cloned, the clone is
 * appended to tir->te_bucket, stored under the key and returned.
 */
static inline TermEnum *tir_enum(TermInfosReader *tir)
{
    TermEnum *cached = thread_getspecific(tir->thread_te);

    if (cached != NULL) {
        return cached;
    }

    TermEnum *fresh = tir->orig_te->clone(tir->orig_te);
    ary_append(tir->te_bucket, fresh);
    thread_setspecific(tir->thread_te, fresh);
    return fresh;
}
|
692
|
+
|
693
|
+
/*
 * Seek the reader's enum (see tir_enum) to index entry +ind_offset+.
 *
 * ste_seek() is given the stored pointer for that entry, the position
 * (ind_offset * index_interval) - 1, and the entry's term and term info.
 * The index arrays are read directly, so they must already be loaded.
 */
void tir_seek_enum(TermInfosReader *tir, int ind_offset)
{
    TermEnum *te = tir_enum(tir);
    SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
    Term *index_term = tir->index_terms[ind_offset];

    ste_seek(te,
             tir->index_pointers[ind_offset],
             (ind_offset * ste->index_interval) - 1,
             index_term,
             tir->index_term_infos[ind_offset]);
}
|
702
|
+
|
703
|
+
/*
 * Binary-search tir->index_terms for +t+ using term_cmp().
 *
 * Returns the slot whose term compares equal to +t+. When there is no equal
 * term, returns the final upper bound of the search, i.e. the slot just
 * before the point where +t+ would be inserted (-1 if +t+ sorts before every
 * index term).
 */
int tir_get_index_offset(TermInfosReader *tir, Term *t)
{
    Term **index_terms = tir->index_terms;
    int lo = 0;
    int hi = tir->index_size - 1;

    while (lo <= hi) {
        const int mid = (lo + hi) >> 1;
        const int delta = term_cmp(t, index_terms[mid]);

        if (delta == 0) {
            return mid;
        }
        if (delta < 0) {
            hi = mid - 1;
        } else {
            lo = mid + 1;
        }
    }
    return hi;
}
|
723
|
+
|
724
|
+
/*
 * Look up the TermInfo for term +t+.
 *
 * Returns NULL when the reader has no terms. Otherwise the index is loaded
 * if needed and the reader's enum (see tir_enum) is used. If the enum is not
 * past the end, its current term is not after +t+, and +t+ sorts before the
 * next index term (or there is no next index term), the enum is scanned from
 * where it stands. In every other case the enum is first seeked to the index
 * entry chosen by tir_get_index_offset() and then scanned. The result is
 * whatever ste_scan_for_term_info() returns.
 */
TermInfo *tir_get_ti(TermInfosReader *tir, Term *t)
{
    TermEnum *te;
    SegmentTermEnum *ste;

    if (tir->size == 0) {
        return NULL;
    }

    tir_ensure_index_is_read(tir);

    te = tir_enum(tir);
    ste = (SegmentTermEnum *)te->data;

    // sequential access: try to continue from the enum's current position
    if (ste->pos < ste->size && tb_term_cmp(te->tb_curr, t) <= 0) {
        int next_block = (int)(ste->pos / ste->index_interval) + 1;

        if (tir->index_size == next_block ||
            term_cmp(t, tir->index_terms[next_block]) < 0) {
            return ste_scan_for_term_info(te, t);   // still inside the block
        }
    }

    // random access: seek to the right block first
    tir_seek_enum(tir, tir_get_index_offset(tir, t));
    return ste_scan_for_term_info(te, t);
}
|
747
|
+
|
748
|
+
/*
 * Return the term at position +pos+.
 *
 * Returns NULL when the reader has no terms. If +pos+ lies in
 * [enum pos, enum pos + index_interval) the reader's enum (see tir_enum) is
 * scanned forward from where it stands; otherwise the enum is seeked to index
 * entry pos / index_interval and then scanned. The result is whatever
 * ste_scan_for_term() returns.
 */
Term *tir_get_term(TermInfosReader *tir, int pos)
{
    if (tir->size == 0)
        return NULL;

    TermEnum *te = tir_enum(tir);
    SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
    if (pos >= ste->pos &&
        pos < (ste->pos + ste->index_interval)) {
        return ste_scan_for_term(te, pos);      // can avoid seek
    }

    // tir_seek_enum() indexes tir->index_pointers, tir->index_terms and
    // tir->index_term_infos, which stay NULL until the index has been read.
    // tir_get_ti() and tir_get_term_pos() load it before seeking; do the
    // same here so a seek on a freshly opened reader does not dereference
    // NULL.
    tir_ensure_index_is_read(tir);

    tir_seek_enum(tir, (int)(pos / ste->index_interval)); // must seek
    return ste_scan_for_term(te, pos);
}
|
763
|
+
|
764
|
+
/*
 * Return the position of term +t+, or -1.
 *
 * -1 is returned when the reader has no terms or when +t+ is not found.
 * Otherwise the index is loaded if needed, the reader's enum is seeked to the
 * index entry chosen by tir_get_index_offset(), and advanced with ste_next()
 * while its current term compares below +t+. If the enum then stands on a
 * term equal to +t+, the enum's pos is returned.
 */
int tir_get_term_pos(TermInfosReader *tir, Term *t)
{
    TermEnum *te;

    if (tir->size == 0) {
        return -1;
    }

    tir_ensure_index_is_read(tir);
    tir_seek_enum(tir, tir_get_index_offset(tir, t));

    te = tir_enum(tir);
    for (;;) {
        if (tb_term_cmp(te->tb_curr, t) >= 0) {
            break;                      // reached or passed the term
        }
        if (ste_next(te) == NULL) {
            break;                      // ran out of terms
        }
    }

    if (tb_term_cmp(te->tb_curr, t) != 0) {
        return -1;
    }
    return ((SegmentTermEnum *)te->data)->pos;
}
|
783
|
+
|