ferret 0.3.2 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/store.c ADDED
@@ -0,0 +1,35 @@
1
+ #include "store.h"
2
+
3
+ void with_lock(Lock *lock, void (*func)(void *arg), void *arg)
4
+ {
5
+ if (!lock->obtain(lock))
6
+ eprintf(IO_ERROR, "Could not obtain lock: <%s>", lock->name);
7
+ func(arg);
8
+ lock->release(lock);
9
+ }
10
+
11
+ void with_lock_name(Store *store, char *lock_name,
12
+ void (*func)(void *arg), void *arg)
13
+ {
14
+ Lock *lock = store->open_lock(store, lock_name);
15
+ if (!lock->obtain(lock))
16
+ eprintf(IO_ERROR, "Could not obtain lock: <%s>", lock->name);
17
+ func(arg);
18
+ lock->release(lock);
19
+ store->close_lock(lock);
20
+ }
21
+
22
+ Store *store_create()
23
+ {
24
+ Store *store = ALLOC(Store);
25
+ mutex_init(&store->mutex, NULL);
26
+ mutex_init(&store->ext_mutex, NULL);
27
+ return store;
28
+ }
29
+
30
+ void store_destroy(Store *store)
31
+ {
32
+ mutex_destroy(&store->mutex);
33
+ mutex_destroy(&store->ext_mutex);
34
+ free(store);
35
+ }
data/ext/store.h ADDED
@@ -0,0 +1,152 @@
1
+ #include "global.h"
2
+ #include "hash.h"
3
+
4
+ #ifndef FRT_STORE_H
5
+ #define FRT_STORE_H
6
+
7
+ #define BUFFER_SIZE 1024
8
+ #define LOCK_PREFIX "ferret-"
9
+
10
+ #define VINT_MAX_LEN 10
11
+ #define VINT_END BUFFER_SIZE - VINT_MAX_LEN
12
+
13
+ typedef struct Buffer {
14
+ uchar buf[BUFFER_SIZE];
15
+ int start;
16
+ int pos;
17
+ int len;
18
+ } Buffer;
19
+
20
+ typedef struct OutStream {
21
+ Buffer buf;
22
+ void *file;
23
+ int pointer; // only used by RAMOut
24
+ void (*flush_internal)(struct OutStream *os, uchar *buf, int len);
25
+ void (*seek_internal)(struct OutStream *os, int pos);
26
+ void (*close_internal)(struct OutStream *os);
27
+ } OutStream;
28
+
29
+ typedef struct CompoundInStream CompoundInStream;
30
+
31
+ typedef struct InStream {
32
+ int is_clone;
33
+ Buffer buf;
34
+ void *file;
35
+ union {
36
+ int pointer; // only used by RAMIn
37
+ char *path; // only used by FSIn
38
+ CompoundInStream *cis;
39
+ } d;
40
+ void (*read_internal)(struct InStream *is, uchar *buf, int offset, int len);
41
+ void (*seek_internal)(struct InStream *is, int pos);
42
+ void (*close_internal)(struct InStream *is);
43
+ void (*clone_internal)(struct InStream *is, struct InStream *new_index_i);
44
+ int (*length_internal)(struct InStream *is);
45
+ } InStream;
46
+
47
+ struct CompoundInStream {
48
+ InStream *sub;
49
+ int offset;
50
+ int length;
51
+ };
52
+
53
+ #define is_length(mis) mis->length_internal(mis)
54
+
55
+ typedef struct Store Store;
56
+ typedef struct Lock Lock;
57
+ struct Lock {
58
+ char *name;
59
+ Store *store;
60
+ int (*obtain)(Lock *lock);
61
+ int (*is_locked)(Lock *lock);
62
+ void (*release)(Lock *lock);
63
+ };
64
+
65
+ typedef struct CompoundStore {
66
+ Store *store;
67
+ const char *name;
68
+ HshTable *entries;
69
+ InStream *stream;
70
+ } CompoundStore;
71
+
72
+ struct Store {
73
+ int ref_cnt; /* for fs_store only */
74
+ mutex_t mutex;
75
+ mutex_t ext_mutex;
76
+ union {
77
+ char *path; /* for fs_store only */
78
+ HshTable *ht; /* for ram_store only */
79
+ CompoundStore *cmpd; /* for compound_store only */
80
+ } dir;
81
+ void (*touch)(Store *store, char *filename);
82
+ int (*exists)(Store *store, char *filename);
83
+ int (*remove)(Store *store, char *filename);
84
+ int (*rename)(Store *store, char *from, char *to);
85
+ int (*count)(Store *store);
86
+ void (*close)(Store *store);
87
+ void (*clear)(Store *store);
88
+ void (*clear_all)(Store *store);
89
+ void (*clear_locks)(Store *store);
90
+ int (*length)(Store *store, char *filename);
91
+ void (*each)(Store *store, void (*func)(char *fname, void *arg), void *arg);
92
+ OutStream *(*create_output)(Store *store, const char *filename);
93
+ InStream *(*open_input)(Store *store, const char *filename);
94
+ Lock *(*open_lock)(Store *store, char *lockname);
95
+ void (*close_lock)(Lock *lock);
96
+ };
97
+
98
+ #define store_close(mstore) mstore->close(mstore)
99
+
100
+ Store *store_create();
101
+ void store_destroy(Store *store);
102
+ Store *open_fs_store(const char *pathname);
103
+ Store *open_ram_store();
104
+ Store *open_ram_store_and_copy(Store *store, bool close_dir);
105
+ Store *open_cmpd_store(Store *sub, const char *filename);
106
+ void ram_close(Store *store);
107
+ Buffer *buf_create();
108
+ void os_flush(OutStream *os);
109
+ void os_close(OutStream *os);
110
+ int os_pos(OutStream *os);
111
+ void os_seek(OutStream *os, int new_pos);
112
+ void os_write_byte(OutStream *os, uchar b);
113
+ void os_write_bytes(OutStream *os, uchar *b, int len);
114
+ uchar is_read_byte(InStream *is);
115
+ int is_pos(InStream *is);
116
+ uchar *is_read_bytes(InStream *is, uchar *b, int offset, int len);
117
+ void is_seek(InStream *is, int pos);
118
+ InStream *is_clone(InStream *is);
119
+ void is_close(InStream *is);
120
+ int is_read_int(InStream *is);
121
+ long long is_read_long(InStream *is);
122
+ unsigned int is_read_uint(InStream *is);
123
+ unsigned long long is_read_ulong(InStream *is);
124
+ unsigned long long is_read_vint(InStream *is);
125
+ void is_read_chars(InStream *is, char* buffer, int off, int len) ;
126
+ char *is_read_string(InStream *is);
127
+ void os_write_int(OutStream *os, int l);
128
+ void os_write_long(OutStream *os, long long l);
129
+ void os_write_uint(OutStream *os, unsigned int l);
130
+ void os_write_ulong(OutStream *os, unsigned long long l);
131
+ void os_write_vint(OutStream *os, register unsigned long long i);
132
+ void os_write_chars(OutStream *os, char *buf, int start, int length);
133
+ void os_write_string(OutStream *os, char *str);
134
+ OutStream *os_create();
135
+ InStream *is_create();
136
+ void buf_destroy(Buffer *buf);
137
+
138
+ // RamStore functions
139
+ int ramo_length(OutStream *os);
140
+ void ramo_reset(OutStream *os);
141
+ int rami_length(InStream *is);
142
+ void ramo_write_to(OutStream *os, OutStream *other_o);
143
+ OutStream *ram_create_buffer();
144
+ void ram_destroy_buffer(OutStream *os);
145
+
146
+ int file_is_lock(char *filename);
147
+
148
+ void with_lock(Lock *lock, void (*func)(void *arg), void *arg);
149
+ void with_lock_name(Store *store, char *lock_name,
150
+ void (*func)(void *arg), void *arg);
151
+
152
+ #endif
data/ext/term.c CHANGED
@@ -1,222 +1,783 @@
1
- #include "ferret.h"
1
+ #include <index.h>
2
+ #include <string.h>
3
+ #include <helper.h>
4
+ #include <hash.h>
2
5
 
6
+ /****************************************************************************
7
+ *
8
+ * Term
9
+ *
10
+ ****************************************************************************/
11
+
12
+ Term *term_clone(Term *term)
13
+ {
14
+ Term *t = ALLOC(Term);
15
+
16
+ t->field = term->field;
17
+ t->text = estrdup(term->text);
18
+ return t;
19
+ }
20
+
21
+ Term *term_create(const char *field, char *text)
22
+ {
23
+ Term *t = ALLOC(Term);
24
+
25
+ t->field = (char *)field;
26
+ t->text = estrdup(text);
27
+ return t;
28
+ }
29
+
30
+ void term_destroy(void *p)
31
+ {
32
+ Term *t = (Term *)p;
33
+ free(t->text);
34
+ free(t);
35
+ }
36
+
37
+ int term_cmp(void *t1, void *t2)
38
+ {
39
+ int res = strcmp(((Term *)t1)->field, ((Term *)t2)->field);
40
+ if (res != 0) {
41
+ return res;
42
+ } else {
43
+ return strcmp(((Term *)t1)->text, ((Term *)t2)->text);
44
+ }
45
+ }
46
+
47
+ int term_eq(const void *t1, const void *t2)
48
+ {
49
+ return (strcmp(((Term *)t1)->text, ((Term *)t2)->text)) == 0 &&
50
+ (strcmp(((Term *)t1)->field, ((Term *)t2)->field) == 0);
51
+ }
52
+
53
+ unsigned int term_hash(const void *t)
54
+ {
55
+ return str_hash(((Term *)t)->text) * str_hash(((Term *)t)->field);
56
+ }
57
+
58
+ char *term_to_s(Term *term)
59
+ {
60
+ char *string = ALLOC_N(char, strlen(term->field) + strlen(term->text) + 2);
61
+ sprintf(string, "%s:%s", term->field, term->text);
62
+ return string;
63
+ }
3
64
 
4
65
  /****************************************************************************
5
66
  *
6
- * Term Methods
67
+ * TermBuffer
7
68
  *
8
69
  ****************************************************************************/
9
70
 
10
- void
11
- frt_term_free(void *p)
71
+ void tb_reset(TermBuffer *tb)
72
+ {
73
+ tb->field = (char *)EMPTY_STRING;
74
+ tb->text[0] = '\0';
75
+ }
76
+
77
+ TermBuffer *tb_create()
78
+ {
79
+ TermBuffer *tb = ALLOC(TermBuffer);
80
+ tb->field = (char *)EMPTY_STRING;
81
+ tb->text[0] = '\0';
82
+ return tb;
83
+ }
84
+
85
+ void tb_destroy(void *p)
12
86
  {
13
- Term *term = (Term *)p;
14
- free(term->text);
15
87
  free(p);
16
88
  }
17
89
 
18
- void
19
- frt_term_mark(void *p)
90
+ TermBuffer *tb_set_term(TermBuffer *tb, Term *t)
91
+ {
92
+ tb->field = t->field;
93
+ strcpy(tb->text, t->text);
94
+ return tb;
95
+ }
96
+
97
+ Term *tb_get_term(TermBuffer *tb)
20
98
  {
21
- Term *term = (Term *)p;
22
- rb_gc_mark(term->field);
99
+ return term_create(tb->field, tb->text);
23
100
  }
24
101
 
25
- static VALUE
26
- frt_term_alloc(VALUE klass)
102
+ int tb_cmp(TermBuffer *tb1, TermBuffer *tb2)
27
103
  {
28
- Term *term = ALLOC(Term);
29
- MEMZERO(term, Term, 1);
30
- term->field = Qnil;
31
- return Data_Wrap_Struct(klass, frt_term_mark, frt_term_free, term);
104
+ int res = strcmp(tb1->field, tb2->field);
105
+ if (res != 0) {
106
+ return res;
107
+ } else {
108
+ return strcmp(tb1->text, tb2->text);
109
+ }
32
110
  }
33
111
 
34
- #define GET_TERM Term *term; Data_Get_Struct(self, Term, term)
35
- VALUE
36
- frt_term_set(VALUE self, VALUE rfield, VALUE rtext)
112
+ int tb_term_cmp(TermBuffer *tb, Term *t)
37
113
  {
38
- int tlen;
39
- GET_TERM;
114
+ int res = strcmp(tb->field, t->field);
115
+ if (res != 0) {
116
+ return res;
117
+ } else {
118
+ return strcmp(tb->text, t->text);
119
+ }
120
+ }
40
121
 
41
- tlen = RSTRING(rtext)->len;
42
- term->field = rfield;
43
- REALLOC_N(term->text, char, tlen + 1);
44
- MEMCPY(term->text, RSTRING(rtext)->ptr, char, tlen);
45
- term->tlen = tlen;
46
-
47
- return Qnil;
122
+ TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2)
123
+ {
124
+ tb1->field = tb2->field;
125
+ strcpy(tb1->text, tb2->text);
126
+ return tb1;
48
127
  }
49
128
 
50
- static VALUE
51
- frt_term_init(VALUE self, VALUE rfield, VALUE rtext)
129
+ TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis)
52
130
  {
53
- frt_term_set(self, rfield, rtext);
54
- return self;
131
+ int start = is_read_vint(is);
132
+ int length = is_read_vint(is);
133
+ int total_length = start + length;
134
+ is_read_bytes(is, (uchar *)tb->text, start, length);
135
+ tb->text[total_length] = '\0';
136
+ int fnum = is_read_vint(is);
137
+ if (fnum < 0)
138
+ tb->field = (char *)EMPTY_STRING;
139
+ else
140
+ tb->field = fis->by_number[fnum]->name;
141
+ return tb;
55
142
  }
56
143
 
57
- static VALUE
58
- frt_term_get_text(VALUE self)
144
+ /****************************************************************************
145
+ *
146
+ * TermInfo
147
+ *
148
+ ****************************************************************************/
149
+
150
+ TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
59
151
  {
60
- GET_TERM;
61
- return rb_str_new(term->text, term->tlen);
152
+ TermInfo *ti = ALLOC(TermInfo);
153
+ ti->doc_freq = doc_freq;
154
+ ti->freq_pointer = freq_pointer;
155
+ ti->prox_pointer = prox_pointer;
156
+ ti->skip_offset = skip_offset;
157
+ return ti;
62
158
  }
63
159
 
64
- static VALUE
65
- frt_term_set_text(VALUE self, VALUE rtext)
160
+ TermInfo *ti_set(TermInfo *ti, int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
66
161
  {
67
- int tlen;
68
- char *text;
69
- GET_TERM;
70
- tlen = RSTRING(rtext)->len;
71
- text = RSTRING(rtext)->ptr;
162
+ ti->doc_freq = doc_freq;
163
+ ti->freq_pointer = freq_pointer;
164
+ ti->prox_pointer = prox_pointer;
165
+ ti->skip_offset = skip_offset;
166
+ return ti;
167
+ }
72
168
 
73
- REALLOC_N(term->text, char, tlen + 1);
74
-
75
- MEMCPY(term->text, text, char, tlen);
76
- term->tlen = tlen;
77
-
78
- return Qnil;
169
+ void ti_destroy(void *p)
170
+ {
171
+ free(p);
79
172
  }
80
173
 
81
- static VALUE
82
- frt_term_get_field(VALUE self)
174
+ TermInfo *ti_cpy(TermInfo *ti, TermInfo *other)
83
175
  {
84
- GET_TERM;
85
- return term->field;
176
+ memcpy(ti, other, sizeof(TermInfo));
177
+ return ti;
86
178
  }
87
179
 
88
- static VALUE
89
- frt_term_set_field(VALUE self, VALUE rfield)
180
+ TermInfo *ti_clone(TermInfo *other)
90
181
  {
91
- GET_TERM;
92
- term->field = rfield;
93
- return Qnil;
182
+ return ti_create(other->doc_freq,
183
+ other->freq_pointer, other->prox_pointer, other->skip_offset);
94
184
  }
95
185
 
96
- VALUE
97
- frt_term_to_s(VALUE self)
186
+ int ti_eq(TermInfo *ti, TermInfo *other)
98
187
  {
99
- int tlen, flen;
100
- char delim[] = ":";
101
- char *res;
102
- GET_TERM;
103
- tlen = term->tlen;
104
- flen = RSTRING(term->field)->len;
105
- res = alloca(flen + tlen + 1);
188
+ return (memcmp(ti, other, sizeof(TermInfo)) == 0);
189
+ }
190
+
191
+ /****************************************************************************
192
+ *
193
+ * TermEnum
194
+ *
195
+ ****************************************************************************/
196
+
197
+ TermEnum *te_create()
198
+ {
199
+ TermEnum *te = ALLOC(TermEnum);
200
+ te->tb_curr = tb_create();
201
+ te->tb_prev = tb_create();
202
+ te->ti_curr = ti_create(0, 0, 0, 0);
203
+ return te;
204
+ }
205
+
206
+ void te_destroy(void *p)
207
+ {
208
+ TermEnum *te = (TermEnum *)p;
209
+ tb_destroy(te->tb_curr);
210
+ tb_destroy(te->tb_prev);
211
+ ti_destroy(te->ti_curr);
212
+ free(p);
213
+ }
214
+
215
+ Term *te_get_term(TermEnum *te)
216
+ {
217
+ return tb_get_term(te->tb_curr);
218
+ }
219
+
220
+ TermInfo *te_get_ti(TermEnum *te)
221
+ {
222
+ TermInfo *ti = te->ti_curr;
223
+ return ti_create(ti->doc_freq, ti->freq_pointer, ti->prox_pointer, ti->skip_offset);
224
+ }
225
+
226
+ TermBuffer *te_skip_to(TermEnum *te, Term *t)
227
+ {
228
+ TermBuffer *tb_curr;
229
+ if (tb_term_cmp(te->tb_curr, t) == 0)
230
+ return te->tb_curr;
231
+
232
+ while (((tb_curr = te->next(te)) != NULL) &&
233
+ (tb_term_cmp(tb_curr, t) < 0)) {
234
+ }
235
+ return tb_curr;
236
+ }
237
+
238
+ /****************************************************************************
239
+ *
240
+ * SegmentTermEnum
241
+ *
242
+ ****************************************************************************/
243
+
244
+ #define GET_STE SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
245
+
246
+ TermBuffer *ste_next(TermEnum *te)
247
+ {
248
+ GET_STE;
249
+ InStream *is = ste->is;
250
+ ste->pos++;
251
+ if (ste->pos > ste->size - 1) {
252
+ tb_reset(te->tb_curr);
253
+ return NULL;
254
+ }
255
+
256
+ tb_cpy(te->tb_prev, te->tb_curr);
257
+ tb_read(te->tb_curr, is, ste->fis);
106
258
 
107
- MEMCPY(res, StringValuePtr(term->field), char, flen);
108
- MEMCPY(res + flen, delim, char, 1);
109
- MEMCPY(res + flen + 1, term->text, char, tlen);
110
- return rb_str_new(res, tlen + flen + 1 );
259
+ TermInfo *ti = te->ti_curr;
260
+ ti->doc_freq = is_read_vint(is); // read doc freq
261
+ ti->freq_pointer += is_read_vint(is); // read freq pointer
262
+ ti->prox_pointer += is_read_vint(is); // read prox pointer
263
+
264
+ if (ste->format == -1) {
265
+ // just read skip_offset in order to increment file pointer
266
+ // value is never used since skip_to is switched off
267
+ if (!ste->is_index) {
268
+ if (ti->doc_freq > ste->format_m1skip_interval)
269
+ ti->skip_offset = is_read_vint(is);
270
+ }
271
+ } else {
272
+ if (ti->doc_freq >= ste->skip_interval)
273
+ ti->skip_offset = is_read_vint(is);
274
+ }
275
+
276
+ if (ste->is_index)
277
+ ste->index_pointer += is_read_vint(is); // read index pointer
278
+
279
+ return te->tb_curr;
111
280
  }
112
281
 
113
- inline int
114
- frt_term_cmp(Term *t1, Term *t2)
282
+ TermEnum *ste_clone(TermEnum *other_te);
283
+ TermEnum *ste_allocate()
115
284
  {
116
- int comp, size, my_len, o_len;
285
+ TermEnum *te = te_create();
286
+ te->next = &ste_next;
287
+ te->close = &ste_close;
288
+ te->clone = &ste_clone;
289
+ SegmentTermEnum *ste =
290
+ ALLOC(SegmentTermEnum);
291
+ te->data = ste;
292
+ return te;
293
+ }
294
+
295
+ TermEnum *ste_clone(TermEnum *other_te)
296
+ {
297
+ SegmentTermEnum *other_ste = (SegmentTermEnum *)other_te->data;
298
+ TermEnum *te = ste_allocate();
299
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
300
+ memcpy(ste, other_ste, sizeof(SegmentTermEnum));
301
+ ste->is = is_clone(other_ste->is);
302
+ tb_cpy(te->tb_curr, other_te->tb_curr);
303
+ tb_cpy(te->tb_prev, other_te->tb_prev);
304
+ ti_cpy(te->ti_curr, other_te->ti_curr);
305
+ return te;
306
+ }
307
+
308
+ void ste_close(TermEnum *te)
309
+ {
310
+ GET_STE;
311
+ is_close(ste->is);
312
+ free(ste);
313
+ te->data = NULL;
314
+ te_destroy(te);
315
+ }
316
+
317
+ TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index)
318
+ {
319
+ TermEnum *te = ste_allocate();
320
+ GET_STE;
321
+ ste->fis = fis;
322
+ ste->is_index = is_index;
323
+ ste->is = is;
324
+ ste->pos = -1;
325
+ ste->index_pointer = 0;
326
+ ste->format_m1skip_interval = -1;
327
+
328
+ int first_int = is_read_int(is);
329
+
330
+ if (first_int >= 0) {
331
+ // original-format file, without explicit format version number
332
+ ste->format = 0;
333
+ ste->size = first_int;
334
+
335
+ // back-compatible settings
336
+ ste->index_interval = 128;
337
+ ste->skip_interval = INT_MAX; // switch off skip_to optimization
338
+
339
+ } else {
340
+ // check that it is a format we can understand
341
+ if (first_int < TERM_INFO_FORMAT)
342
+ eprintf(ERROR, "Unknown format version:%d", first_int);
343
+
344
+ // we have a format version number
345
+ ste->format = first_int;
346
+
347
+
348
+ ste->size = is_read_long(is); // read the size
117
349
 
118
- my_len = RSTRING(t1->field)->len;
119
- o_len = RSTRING(t2->field)->len;
120
- size = my_len >= o_len ? o_len : my_len;
121
- comp = memcmp(RSTRING(t1->field)->ptr, RSTRING(t2->field)->ptr, size);
122
- if (comp == 0) {
123
- if (my_len == o_len) {
124
- my_len = t1->tlen;
125
- o_len = t2->tlen;
126
- size = my_len >= o_len ? o_len : my_len;
127
- comp = memcmp(t1->text, t2->text, size);
128
- if(comp == 0 && my_len != o_len)
129
- comp = my_len > o_len ? 1 : -1;
350
+ if (ste->format == -1) {
351
+ if (!ste->is_index) {
352
+ ste->index_interval = is_read_int(is);
353
+ ste->format_m1skip_interval = is_read_int(is);
354
+ }
355
+ // switch off skip_to optimization for file format prior to
356
+ // 1.4rc2 in order to avoid a bug in skip_to implementation
357
+ // of these versions
358
+ ste->skip_interval = INT_MAX;
130
359
  } else {
131
- comp = my_len > o_len ? 1 : -1;
360
+ ste->index_interval = is_read_int(is);
361
+ ste->skip_interval = is_read_int(is);
132
362
  }
133
363
  }
134
- return comp;
364
+ return te;
365
+ }
366
+
367
+ void ste_seek(TermEnum *te, int pointer, int pos, Term *t, TermInfo *ti)
368
+ {
369
+ GET_STE;
370
+ is_seek(ste->is, pointer);
371
+ ste->pos = pos;
372
+ tb_set_term(te->tb_curr, t);
373
+ tb_reset(te->tb_prev);
374
+ ti_cpy(te->ti_curr, ti);
375
+ }
376
+
377
+ TermInfo *ste_scan_for_term_info(TermEnum *te, Term *t)
378
+ {
379
+ te_skip_to(te, t);
380
+
381
+ if (tb_term_cmp(te->tb_curr, t) == 0) {
382
+ return te_get_ti(te);
383
+ } else {
384
+ return NULL;
385
+ }
135
386
  }
136
387
 
137
- int
138
- frt_term_compare_to_int(VALUE self, VALUE rother)
388
+ Term *ste_scan_for_term(TermEnum *te, int pos)
139
389
  {
140
- Term *other;
141
- GET_TERM;
142
- Data_Get_Struct(rother, Term, other);
143
- return frt_term_cmp(term, other);
390
+ GET_STE;
391
+ while (ste->pos < pos) {
392
+ if (ste_next(te) == NULL)
393
+ return NULL;
394
+ }
395
+
396
+ return te_get_term(te);
144
397
  }
145
398
 
146
- VALUE
147
- frt_term_lt(VALUE self, VALUE rother)
399
+ /****************************************************************************
400
+ *
401
+ * MultiTermEnum
402
+ *
403
+ ****************************************************************************/
404
+
405
+ #define GET_MTE MultiTermEnum *mte = (MultiTermEnum *)te->data;
406
+
407
+ TermBuffer *mte_next(TermEnum *te)
148
408
  {
149
- return frt_term_compare_to_int(self, rother) < 0 ? Qtrue : Qfalse;
409
+ GET_MTE;
410
+ SegmentMergeInfo *top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
411
+
412
+ if (top == NULL) {
413
+ tb_reset(te->tb_curr);
414
+ return false;
415
+ }
416
+
417
+ tb_cpy(te->tb_prev, te->tb_curr);
418
+ tb_cpy(te->tb_curr, top->tb);
419
+
420
+ te->ti_curr->doc_freq = 0;
421
+
422
+ while ((top != NULL) && (tb_cmp(te->tb_curr, top->tb) == 0)) {
423
+ pq_pop(mte->smi_queue);
424
+ te->ti_curr->doc_freq += top->te->ti_curr->doc_freq; // increment freq
425
+ if (smi_next(top)) {
426
+ pq_push(mte->smi_queue, top); // restore queue
427
+ } else {
428
+ smi_destroy(top); // done with a segment
429
+ }
430
+ top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
431
+ }
432
+ return te->tb_curr;
150
433
  }
151
434
 
152
- VALUE
153
- frt_term_gt(VALUE self, VALUE rother)
435
+ void mte_close(TermEnum *te)
154
436
  {
155
- return frt_term_compare_to_int(self, rother) > 0 ? Qtrue : Qfalse;
437
+ GET_MTE;
438
+ pq_clear(mte->smi_queue);
439
+ pq_destroy(mte->smi_queue);
440
+ free(mte);
441
+ te_destroy(te);
156
442
  }
157
443
 
158
- VALUE
159
- frt_term_le(VALUE self, VALUE rother)
444
+ TermEnum *mte_clone(TermEnum *te)
160
445
  {
161
- return frt_term_compare_to_int(self, rother) <= 0 ? Qtrue : Qfalse;
446
+ eprintf(ERROR, "MultiTermEnum does not support cloning");
447
+ return NULL;
162
448
  }
163
449
 
164
- VALUE
165
- frt_term_ge(VALUE self, VALUE rother)
450
+ TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *t)
166
451
  {
167
- return frt_term_compare_to_int(self, rother) >= 0 ? Qtrue : Qfalse;
452
+ int i;
453
+ TermEnum *te = te_create();
454
+ te->next = &mte_next;
455
+ te->clone = &mte_clone;
456
+ te->close = &mte_close;
457
+
458
+ MultiTermEnum *mte = ALLOC(MultiTermEnum);
459
+ te->data = mte;
460
+
461
+ IndexReader *reader;
462
+ TermEnum *sub_te;
463
+
464
+ mte->smi_queue = pq_create(rcnt, &smi_lt);
465
+ mte->smi_queue->free_elem = &smi_destroy;
466
+
467
+ for (i = 0; i < rcnt; i++) {
468
+ reader = readers[i];
469
+
470
+ if (t != NULL) {
471
+ sub_te = reader->terms_from(reader, t);
472
+ } else {
473
+ sub_te = reader->terms(reader);
474
+ }
475
+
476
+ SegmentMergeInfo *smi = smi_create(starts[i], sub_te, reader);
477
+ if (((t == NULL) && smi_next(smi)) ||
478
+ (sub_te->tb_curr->field != (char *)EMPTY_STRING)) {
479
+ pq_push(mte->smi_queue, smi); // initialize queue
480
+ } else {
481
+ smi_destroy(smi);
482
+ }
483
+ }
484
+
485
+ if ((t != NULL) && (mte->smi_queue->count > 0)) {
486
+ mte_next(te);
487
+ }
488
+
489
+ return te;
168
490
  }
169
491
 
170
- VALUE
171
- frt_term_eq(VALUE self, VALUE rother)
492
+ /****************************************************************************
493
+ *
494
+ * TermInfosWriter
495
+ *
496
+ ****************************************************************************/
497
+
498
+ const Term EmptyTerm = {"", ""};
499
+
500
+ TermInfosWriter *tiw_open_internal(Store *store,
501
+ char *segment,
502
+ FieldInfos *fis,
503
+ int interval,
504
+ int is_index)
172
505
  {
173
- if (rother == Qnil)
174
- return Qfalse;
175
- return frt_term_compare_to_int(self, rother) == 0 ? Qtrue : Qfalse;
506
+ TermInfosWriter *tiw = ALLOC(TermInfosWriter);
507
+ tiw->index_interval = interval;
508
+ tiw->skip_interval = 16;
509
+ tiw->last_index_pointer = 0;
510
+ tiw->last_term = (Term *)&EmptyTerm;
511
+ tiw->last_term_info = ti_create(0,0,0,0);
512
+ tiw->size = 0;
513
+ tiw->is_index = is_index;
514
+ tiw->fis = fis;
515
+ tiw->curr_field = NULL;
516
+ tiw->curr_field_num = -1;
517
+
518
+ char fname[SEGMENT_NAME_MAX_LENGTH];
519
+ strcpy(fname, segment);
520
+ strcat(fname, (is_index ? ".tii" : ".tis"));
521
+ OutStream *os = tiw->os = store->create_output(store, fname);
522
+ os_write_int(os, TERM_INFO_FORMAT); // write format
523
+ os_write_long(os, 0); // leave space for size
524
+ os_write_int(os, tiw->index_interval); // write index_interval
525
+ os_write_int(os, tiw->skip_interval); // write skip_interval
526
+ if (!is_index) {
527
+ tiw->other = tiw_open_internal(store, segment, fis, interval, true);
528
+ tiw->other->other = tiw;
529
+ }
530
+ return tiw;
176
531
  }
177
532
 
533
+ TermInfosWriter *tiw_open(Store *store, char *segment, FieldInfos *fis, int interval)
534
+ {
535
+ return tiw_open_internal(store, segment, fis, interval, false);
536
+ }
178
537
 
179
- static VALUE
180
- frt_term_compare_to(VALUE self, VALUE other)
538
+ void tiw_write_term(TermInfosWriter *tiw, OutStream *os, Term *t)
181
539
  {
182
- return INT2FIX(frt_term_compare_to_int(self, other));
540
+ //printf("%s, %s\n", tiw->last_term->text, t->text);
541
+ int start = hlp_string_diff(tiw->last_term->text, t->text);
542
+ int length = strlen(t->text) - start;
543
+
544
+ os_write_vint(os, start); // write shared prefix length
545
+ os_write_vint(os, length); // write delta length
546
+ os_write_chars(os, t->text, start, length); // write delta chars
547
+ if (tiw->curr_field != t->field) {
548
+ tiw->curr_field = t->field;
549
+ tiw->curr_field_num = fis_get_number(tiw->fis, t->field);
550
+ }
551
+ os_write_vint(os, tiw->curr_field_num);
552
+ tiw->last_term = t;
183
553
  }
184
554
 
185
- static VALUE
186
- frt_term_hash(VALUE self)
555
+ void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
187
556
  {
188
- GET_TERM;
189
- return INT2FIX(frt_hash(term->text, term->tlen) +
190
- frt_hash(RSTRING(term->field)->ptr, RSTRING(term->field)->len));
557
+ if (tiw->is_index && term_cmp(tiw->last_term, t) > 0) {
558
+ eprintf(STATE_ERROR,
559
+ "term out of order %s < %s", t->text, tiw->last_term->text);
560
+ }
561
+ if (ti->freq_pointer < tiw->last_term_info->freq_pointer) {
562
+ eprintf(STATE_ERROR, "freq pointer out of order");
563
+ }
564
+ if (ti->prox_pointer < tiw->last_term_info->prox_pointer) {
565
+ eprintf(STATE_ERROR, "prox pointer out of order");
566
+ }
567
+
568
+ if (!tiw->is_index && (tiw->size % tiw->index_interval) == 0)
569
+ tiw_add(tiw->other, tiw->last_term, tiw->last_term_info); // add an index term
570
+
571
+ tiw_write_term(tiw, tiw->os, t); // write term
572
+ os_write_vint(tiw->os, ti->doc_freq); // write doc freq
573
+ os_write_vint(tiw->os, ti->freq_pointer - tiw->last_term_info->freq_pointer);
574
+ os_write_vint(tiw->os, ti->prox_pointer - tiw->last_term_info->prox_pointer);
575
+ if (ti->doc_freq >= tiw->skip_interval)
576
+ os_write_vint(tiw->os, ti->skip_offset);
577
+
578
+ if (tiw->is_index) {
579
+ OutStream *other_os = tiw->other->os;
580
+ int other_pos = os_pos(other_os);
581
+ os_write_vint(tiw->os, other_pos - tiw->last_index_pointer);
582
+ tiw->last_index_pointer = other_pos; // write pointer
583
+ }
584
+
585
+ ti_cpy(tiw->last_term_info, ti);
586
+ tiw->size++;
587
+ }
588
+
589
+ void tiw_close(TermInfosWriter *tiw)
590
+ {
591
+ OutStream *os = tiw->os;
592
+ os_seek(os, 4); // write @size after format
593
+ os_write_long(os, tiw->size);
594
+ os_close(os);
595
+
596
+ if (!tiw->is_index)
597
+ tiw_close(tiw->other);
598
+
599
+ ti_destroy(tiw->last_term_info);
600
+ free(tiw);
191
601
  }
192
602
 
193
603
  /****************************************************************************
194
604
  *
195
- * Init Function
605
+ * TermInfosReader
196
606
  *
197
607
  ****************************************************************************/
198
608
 
199
- void
200
- Init_term(void)
201
- {
202
- /* Term */
203
- cTerm = rb_define_class_under(mIndex, "Term", rb_cObject);
204
- rb_define_alloc_func(cTerm, frt_term_alloc);
205
- rb_include_module(cTerm, rb_mComparable);
206
-
207
- rb_define_method(cTerm, "initialize", frt_term_init, 2);
208
- rb_define_method(cTerm, "set!", frt_term_set, 2);
209
- rb_define_method(cTerm, "to_s", frt_term_to_s, 0);
210
- rb_define_method(cTerm, "<=>", frt_term_compare_to, 1);
211
- rb_define_method(cTerm, "<", frt_term_lt, 1);
212
- rb_define_method(cTerm, ">", frt_term_gt, 1);
213
- rb_define_method(cTerm, "<=", frt_term_le, 1);
214
- rb_define_method(cTerm, ">=", frt_term_ge, 1);
215
- rb_define_method(cTerm, "eql?", frt_term_eq, 1);
216
- rb_define_method(cTerm, "==", frt_term_eq, 1);
217
- rb_define_method(cTerm, "text", frt_term_get_text, 0);
218
- rb_define_method(cTerm, "text=", frt_term_set_text, 1);
219
- rb_define_method(cTerm, "field", frt_term_get_field, 0);
220
- rb_define_method(cTerm, "field=", frt_term_set_field, 1);
221
- rb_define_method(cTerm, "hash", frt_term_hash, 0);
609
+ void tir_close(TermInfosReader *tir)
610
+ {
611
+ int i;
612
+ if (tir->index_terms != NULL) {
613
+ for (i = 0; i < tir->index_size; i++) {
614
+ term_destroy(tir->index_terms[i]);
615
+ ti_destroy(tir->index_term_infos[i]);
616
+ }
617
+ free(tir->index_terms);
618
+ free(tir->index_term_infos);
619
+ free(tir->index_pointers);
620
+ }
621
+ if (tir->orig_te) tir->orig_te->close(tir->orig_te);
622
+ thread_key_delete(tir->thread_te);
623
+ ary_destroy(tir->te_bucket);
624
+ if (tir->index_te) tir->index_te->close(tir->index_te);
625
+ mutex_destroy(&tir->mutex);
626
+ free(tir);
627
+ }
628
+
629
+ TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis)
630
+ {
631
+ TermInfosReader *tir = ALLOC(TermInfosReader);
632
+ char fname[SEGMENT_NAME_MAX_LENGTH];
633
+ mutex_init(&tir->mutex, NULL);
634
+ strcpy(fname, segment);
635
+ strcpy(fname + strlen(segment), ".tis");
636
+ InStream *is = store->open_input(store, fname);
637
+ tir->orig_te = ste_create(is, fis, false);
638
+ thread_key_create(&tir->thread_te, NULL);
639
+ tir->te_bucket = ary_create(1, (destroy_func_t)tir->orig_te->close);
640
+
641
+ SegmentTermEnum *ste = tir->orig_te->data;
642
+ tir->size = ste->size;
643
+ tir->skip_interval = ste->skip_interval;
644
+
645
+ strcpy(fname + strlen(segment), ".tii");
646
+ is = store->open_input(store, fname);
647
+ tir->index_te = ste_create(is, fis, true);
648
+ tir->index_terms = NULL;
649
+ tir->index_term_infos = NULL;
650
+ tir->index_pointers = NULL;
651
+ return tir;
652
+ }
653
+
654
+ void tir_ensure_index_is_read(TermInfosReader *tir)
655
+ {
656
+ mutex_lock(&tir->mutex);
657
+ if (tir->index_terms == NULL) {
658
+ int index_size = ((SegmentTermEnum *)tir->index_te->data)->size;
659
+ tir->index_size = index_size;
660
+
661
+ tir->index_terms = ALLOC_N(Term *, index_size);
662
+ tir->index_term_infos = ALLOC_N(TermInfo *, index_size);
663
+ tir->index_pointers = ALLOC_N(int, index_size);
664
+
665
+ int i = 0;
666
+ TermEnum *index_te = tir->index_te;
667
+ SegmentTermEnum *ste = index_te->data;
668
+
669
+ while (ste_next(index_te) != NULL) {
670
+ tir->index_terms[i] = te_get_term(index_te);
671
+ tir->index_term_infos[i] = te_get_ti(index_te);
672
+ tir->index_pointers[i] = ste->index_pointer;
673
+ i++;
674
+ }
675
+
676
+ index_te->close(index_te);
677
+ tir->index_te = NULL;
678
+ }
679
+ mutex_unlock(&tir->mutex);
680
+ }
681
+
682
+ static inline TermEnum *tir_enum(TermInfosReader *tir)
683
+ {
684
+ TermEnum *te;
685
+ if ((te = thread_getspecific(tir->thread_te)) == NULL) {
686
+ te = tir->orig_te->clone(tir->orig_te);
687
+ ary_append(tir->te_bucket, te);
688
+ thread_setspecific(tir->thread_te, te);
689
+ }
690
+ return te;
222
691
  }
692
+
693
+ void tir_seek_enum(TermInfosReader *tir, int ind_offset)
694
+ {
695
+ TermEnum *te = tir_enum(tir);
696
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
697
+ ste_seek(te, tir->index_pointers[ind_offset],
698
+ (ind_offset * ste->index_interval) - 1,
699
+ tir->index_terms[ind_offset],
700
+ tir->index_term_infos[ind_offset]);
701
+ }
702
+
703
+ int tir_get_index_offset(TermInfosReader *tir, Term *t)
704
+ {
705
+ int lo = 0; // binary search tir->index_terms[]
706
+ int hi = tir->index_size - 1;
707
+ int mid, delta;
708
+ Term **index_terms = tir->index_terms;
709
+
710
+ while (hi >= lo) {
711
+ mid = (lo + hi) >> 1;
712
+ delta = term_cmp(t, index_terms[mid]);
713
+ if (delta < 0) {
714
+ hi = mid - 1;
715
+ } else if (delta > 0) {
716
+ lo = mid + 1;
717
+ } else {
718
+ return mid;
719
+ }
720
+ }
721
+ return hi;
722
+ }
723
+
724
+ TermInfo *tir_get_ti(TermInfosReader *tir, Term *t)
725
+ {
726
+ if (tir->size == 0)
727
+ return NULL;
728
+
729
+ tir_ensure_index_is_read(tir);
730
+
731
+ // optimize sequential access: first try scanning cached enum w/o seeking
732
+ TermEnum *te = tir_enum(tir);
733
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
734
+ if (ste->pos < ste->size && tb_term_cmp(te->tb_curr, t) <= 0) {
735
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
736
+ int enum_offset = (int)(ste->pos / ste->index_interval) + 1;
737
+ if (tir->index_size == enum_offset ||
738
+ term_cmp(t, tir->index_terms[enum_offset]) < 0) { // but before end of block
739
+ return ste_scan_for_term_info(te, t); // no need to seek
740
+ }
741
+ }
742
+
743
+ // random-access: must seek
744
+ tir_seek_enum(tir, tir_get_index_offset(tir, t));
745
+ return ste_scan_for_term_info(te, t);
746
+ }
747
+
748
+ Term *tir_get_term(TermInfosReader *tir, int pos)
749
+ {
750
+ if (tir->size == 0)
751
+ return NULL;
752
+
753
+ TermEnum *te = tir_enum(tir);
754
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
755
+ if (pos >= ste->pos &&
756
+ pos < (ste->pos + ste->index_interval)) {
757
+ return ste_scan_for_term(te, pos); // can avoid seek
758
+ }
759
+
760
+ tir_seek_enum(tir, (int)(pos / ste->index_interval)); // must seek
761
+ return ste_scan_for_term(te, pos);
762
+ }
763
+
764
+ int tir_get_term_pos(TermInfosReader *tir, Term *t)
765
+ {
766
+ if (tir->size == 0)
767
+ return -1;
768
+
769
+ tir_ensure_index_is_read(tir);
770
+
771
+ int ind_offset = tir_get_index_offset(tir, t);
772
+ tir_seek_enum(tir, ind_offset);
773
+
774
+ TermEnum *te = tir_enum(tir);
775
+ while ((tb_term_cmp(te->tb_curr, t) < 0) && (ste_next(te) != NULL))
776
+ ;
777
+
778
+ if (tb_term_cmp(te->tb_curr, t) == 0)
779
+ return ((SegmentTermEnum *)te->data)->pos;
780
+ else
781
+ return -1;
782
+ }
783
+