ferret 0.3.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/store.c ADDED
@@ -0,0 +1,35 @@
1
+ #include "store.h"
2
+
3
+ void with_lock(Lock *lock, void (*func)(void *arg), void *arg)
4
+ {
5
+ if (!lock->obtain(lock))
6
+ eprintf(IO_ERROR, "Could not obtain lock: <%s>", lock->name);
7
+ func(arg);
8
+ lock->release(lock);
9
+ }
10
+
11
+ void with_lock_name(Store *store, char *lock_name,
12
+ void (*func)(void *arg), void *arg)
13
+ {
14
+ Lock *lock = store->open_lock(store, lock_name);
15
+ if (!lock->obtain(lock))
16
+ eprintf(IO_ERROR, "Could not obtain lock: <%s>", lock->name);
17
+ func(arg);
18
+ lock->release(lock);
19
+ store->close_lock(lock);
20
+ }
21
+
22
+ Store *store_create()
23
+ {
24
+ Store *store = ALLOC(Store);
25
+ mutex_init(&store->mutex, NULL);
26
+ mutex_init(&store->ext_mutex, NULL);
27
+ return store;
28
+ }
29
+
30
+ void store_destroy(Store *store)
31
+ {
32
+ mutex_destroy(&store->mutex);
33
+ mutex_destroy(&store->ext_mutex);
34
+ free(store);
35
+ }
data/ext/store.h ADDED
@@ -0,0 +1,152 @@
1
+ #include "global.h"
2
+ #include "hash.h"
3
+
4
+ #ifndef FRT_STORE_H
5
+ #define FRT_STORE_H
6
+
7
+ #define BUFFER_SIZE 1024
8
+ #define LOCK_PREFIX "ferret-"
9
+
10
+ #define VINT_MAX_LEN 10
11
+ #define VINT_END BUFFER_SIZE - VINT_MAX_LEN
12
+
13
+ typedef struct Buffer {
14
+ uchar buf[BUFFER_SIZE];
15
+ int start;
16
+ int pos;
17
+ int len;
18
+ } Buffer;
19
+
20
+ typedef struct OutStream {
21
+ Buffer buf;
22
+ void *file;
23
+ int pointer; // only used by RAMOut
24
+ void (*flush_internal)(struct OutStream *os, uchar *buf, int len);
25
+ void (*seek_internal)(struct OutStream *os, int pos);
26
+ void (*close_internal)(struct OutStream *os);
27
+ } OutStream;
28
+
29
+ typedef struct CompoundInStream CompoundInStream;
30
+
31
+ typedef struct InStream {
32
+ int is_clone;
33
+ Buffer buf;
34
+ void *file;
35
+ union {
36
+ int pointer; // only used by RAMIn
37
+ char *path; // only used by FSIn
38
+ CompoundInStream *cis;
39
+ } d;
40
+ void (*read_internal)(struct InStream *is, uchar *buf, int offset, int len);
41
+ void (*seek_internal)(struct InStream *is, int pos);
42
+ void (*close_internal)(struct InStream *is);
43
+ void (*clone_internal)(struct InStream *is, struct InStream *new_index_i);
44
+ int (*length_internal)(struct InStream *is);
45
+ } InStream;
46
+
47
+ struct CompoundInStream {
48
+ InStream *sub;
49
+ int offset;
50
+ int length;
51
+ };
52
+
53
+ #define is_length(mis) mis->length_internal(mis)
54
+
55
+ typedef struct Store Store;
56
+ typedef struct Lock Lock;
57
+ struct Lock {
58
+ char *name;
59
+ Store *store;
60
+ int (*obtain)(Lock *lock);
61
+ int (*is_locked)(Lock *lock);
62
+ void (*release)(Lock *lock);
63
+ };
64
+
65
+ typedef struct CompoundStore {
66
+ Store *store;
67
+ const char *name;
68
+ HshTable *entries;
69
+ InStream *stream;
70
+ } CompoundStore;
71
+
72
+ struct Store {
73
+ int ref_cnt; /* for fs_store only */
74
+ mutex_t mutex;
75
+ mutex_t ext_mutex;
76
+ union {
77
+ char *path; /* for fs_store only */
78
+ HshTable *ht; /* for ram_store only */
79
+ CompoundStore *cmpd; /* for compound_store only */
80
+ } dir;
81
+ void (*touch)(Store *store, char *filename);
82
+ int (*exists)(Store *store, char *filename);
83
+ int (*remove)(Store *store, char *filename);
84
+ int (*rename)(Store *store, char *from, char *to);
85
+ int (*count)(Store *store);
86
+ void (*close)(Store *store);
87
+ void (*clear)(Store *store);
88
+ void (*clear_all)(Store *store);
89
+ void (*clear_locks)(Store *store);
90
+ int (*length)(Store *store, char *filename);
91
+ void (*each)(Store *store, void (*func)(char *fname, void *arg), void *arg);
92
+ OutStream *(*create_output)(Store *store, const char *filename);
93
+ InStream *(*open_input)(Store *store, const char *filename);
94
+ Lock *(*open_lock)(Store *store, char *lockname);
95
+ void (*close_lock)(Lock *lock);
96
+ };
97
+
98
+ #define store_close(mstore) mstore->close(mstore)
99
+
100
+ Store *store_create();
101
+ void store_destroy(Store *store);
102
+ Store *open_fs_store(const char *pathname);
103
+ Store *open_ram_store();
104
+ Store *open_ram_store_and_copy(Store *store, bool close_dir);
105
+ Store *open_cmpd_store(Store *sub, const char *filename);
106
+ void ram_close(Store *store);
107
+ Buffer *buf_create();
108
+ void os_flush(OutStream *os);
109
+ void os_close(OutStream *os);
110
+ int os_pos(OutStream *os);
111
+ void os_seek(OutStream *os, int new_pos);
112
+ void os_write_byte(OutStream *os, uchar b);
113
+ void os_write_bytes(OutStream *os, uchar *b, int len);
114
+ uchar is_read_byte(InStream *is);
115
+ int is_pos(InStream *is);
116
+ uchar *is_read_bytes(InStream *is, uchar *b, int offset, int len);
117
+ void is_seek(InStream *is, int pos);
118
+ InStream *is_clone(InStream *is);
119
+ void is_close(InStream *is);
120
+ int is_read_int(InStream *is);
121
+ long long is_read_long(InStream *is);
122
+ unsigned int is_read_uint(InStream *is);
123
+ unsigned long long is_read_ulong(InStream *is);
124
+ unsigned long long is_read_vint(InStream *is);
125
+ void is_read_chars(InStream *is, char* buffer, int off, int len) ;
126
+ char *is_read_string(InStream *is);
127
+ void os_write_int(OutStream *os, int l);
128
+ void os_write_long(OutStream *os, long long l);
129
+ void os_write_uint(OutStream *os, unsigned int l);
130
+ void os_write_ulong(OutStream *os, unsigned long long l);
131
+ void os_write_vint(OutStream *os, register unsigned long long i);
132
+ void os_write_chars(OutStream *os, char *buf, int start, int length);
133
+ void os_write_string(OutStream *os, char *str);
134
+ OutStream *os_create();
135
+ InStream *is_create();
136
+ void buf_destroy(Buffer *buf);
137
+
138
+ // RamStore functions
139
+ int ramo_length(OutStream *os);
140
+ void ramo_reset(OutStream *os);
141
+ int rami_length(InStream *is);
142
+ void ramo_write_to(OutStream *os, OutStream *other_o);
143
+ OutStream *ram_create_buffer();
144
+ void ram_destroy_buffer(OutStream *os);
145
+
146
+ int file_is_lock(char *filename);
147
+
148
+ void with_lock(Lock *lock, void (*func)(void *arg), void *arg);
149
+ void with_lock_name(Store *store, char *lock_name,
150
+ void (*func)(void *arg), void *arg);
151
+
152
+ #endif
data/ext/term.c CHANGED
@@ -1,222 +1,783 @@
1
- #include "ferret.h"
1
+ #include <index.h>
2
+ #include <string.h>
3
+ #include <helper.h>
4
+ #include <hash.h>
2
5
 
6
+ /****************************************************************************
7
+ *
8
+ * Term
9
+ *
10
+ ****************************************************************************/
11
+
12
+ Term *term_clone(Term *term)
13
+ {
14
+ Term *t = ALLOC(Term);
15
+
16
+ t->field = term->field;
17
+ t->text = estrdup(term->text);
18
+ return t;
19
+ }
20
+
21
+ Term *term_create(const char *field, char *text)
22
+ {
23
+ Term *t = ALLOC(Term);
24
+
25
+ t->field = (char *)field;
26
+ t->text = estrdup(text);
27
+ return t;
28
+ }
29
+
30
+ void term_destroy(void *p)
31
+ {
32
+ Term *t = (Term *)p;
33
+ free(t->text);
34
+ free(t);
35
+ }
36
+
37
+ int term_cmp(void *t1, void *t2)
38
+ {
39
+ int res = strcmp(((Term *)t1)->field, ((Term *)t2)->field);
40
+ if (res != 0) {
41
+ return res;
42
+ } else {
43
+ return strcmp(((Term *)t1)->text, ((Term *)t2)->text);
44
+ }
45
+ }
46
+
47
+ int term_eq(const void *t1, const void *t2)
48
+ {
49
+ return (strcmp(((Term *)t1)->text, ((Term *)t2)->text)) == 0 &&
50
+ (strcmp(((Term *)t1)->field, ((Term *)t2)->field) == 0);
51
+ }
52
+
53
+ unsigned int term_hash(const void *t)
54
+ {
55
+ return str_hash(((Term *)t)->text) * str_hash(((Term *)t)->field);
56
+ }
57
+
58
+ char *term_to_s(Term *term)
59
+ {
60
+ char *string = ALLOC_N(char, strlen(term->field) + strlen(term->text) + 2);
61
+ sprintf(string, "%s:%s", term->field, term->text);
62
+ return string;
63
+ }
3
64
 
4
65
  /****************************************************************************
5
66
  *
6
- * Term Methods
67
+ * TermBuffer
7
68
  *
8
69
  ****************************************************************************/
9
70
 
10
- void
11
- frt_term_free(void *p)
71
+ void tb_reset(TermBuffer *tb)
72
+ {
73
+ tb->field = (char *)EMPTY_STRING;
74
+ tb->text[0] = '\0';
75
+ }
76
+
77
+ TermBuffer *tb_create()
78
+ {
79
+ TermBuffer *tb = ALLOC(TermBuffer);
80
+ tb->field = (char *)EMPTY_STRING;
81
+ tb->text[0] = '\0';
82
+ return tb;
83
+ }
84
+
85
+ void tb_destroy(void *p)
12
86
  {
13
- Term *term = (Term *)p;
14
- free(term->text);
15
87
  free(p);
16
88
  }
17
89
 
18
- void
19
- frt_term_mark(void *p)
90
+ TermBuffer *tb_set_term(TermBuffer *tb, Term *t)
91
+ {
92
+ tb->field = t->field;
93
+ strcpy(tb->text, t->text);
94
+ return tb;
95
+ }
96
+
97
+ Term *tb_get_term(TermBuffer *tb)
20
98
  {
21
- Term *term = (Term *)p;
22
- rb_gc_mark(term->field);
99
+ return term_create(tb->field, tb->text);
23
100
  }
24
101
 
25
- static VALUE
26
- frt_term_alloc(VALUE klass)
102
+ int tb_cmp(TermBuffer *tb1, TermBuffer *tb2)
27
103
  {
28
- Term *term = ALLOC(Term);
29
- MEMZERO(term, Term, 1);
30
- term->field = Qnil;
31
- return Data_Wrap_Struct(klass, frt_term_mark, frt_term_free, term);
104
+ int res = strcmp(tb1->field, tb2->field);
105
+ if (res != 0) {
106
+ return res;
107
+ } else {
108
+ return strcmp(tb1->text, tb2->text);
109
+ }
32
110
  }
33
111
 
34
- #define GET_TERM Term *term; Data_Get_Struct(self, Term, term)
35
- VALUE
36
- frt_term_set(VALUE self, VALUE rfield, VALUE rtext)
112
+ int tb_term_cmp(TermBuffer *tb, Term *t)
37
113
  {
38
- int tlen;
39
- GET_TERM;
114
+ int res = strcmp(tb->field, t->field);
115
+ if (res != 0) {
116
+ return res;
117
+ } else {
118
+ return strcmp(tb->text, t->text);
119
+ }
120
+ }
40
121
 
41
- tlen = RSTRING(rtext)->len;
42
- term->field = rfield;
43
- REALLOC_N(term->text, char, tlen + 1);
44
- MEMCPY(term->text, RSTRING(rtext)->ptr, char, tlen);
45
- term->tlen = tlen;
46
-
47
- return Qnil;
122
+ TermBuffer *tb_cpy(TermBuffer *tb1, TermBuffer *tb2)
123
+ {
124
+ tb1->field = tb2->field;
125
+ strcpy(tb1->text, tb2->text);
126
+ return tb1;
48
127
  }
49
128
 
50
- static VALUE
51
- frt_term_init(VALUE self, VALUE rfield, VALUE rtext)
129
+ TermBuffer *tb_read(TermBuffer *tb, InStream *is, FieldInfos *fis)
52
130
  {
53
- frt_term_set(self, rfield, rtext);
54
- return self;
131
+ int start = is_read_vint(is);
132
+ int length = is_read_vint(is);
133
+ int total_length = start + length;
134
+ is_read_bytes(is, (uchar *)tb->text, start, length);
135
+ tb->text[total_length] = '\0';
136
+ int fnum = is_read_vint(is);
137
+ if (fnum < 0)
138
+ tb->field = (char *)EMPTY_STRING;
139
+ else
140
+ tb->field = fis->by_number[fnum]->name;
141
+ return tb;
55
142
  }
56
143
 
57
- static VALUE
58
- frt_term_get_text(VALUE self)
144
+ /****************************************************************************
145
+ *
146
+ * TermInfo
147
+ *
148
+ ****************************************************************************/
149
+
150
+ TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
59
151
  {
60
- GET_TERM;
61
- return rb_str_new(term->text, term->tlen);
152
+ TermInfo *ti = ALLOC(TermInfo);
153
+ ti->doc_freq = doc_freq;
154
+ ti->freq_pointer = freq_pointer;
155
+ ti->prox_pointer = prox_pointer;
156
+ ti->skip_offset = skip_offset;
157
+ return ti;
62
158
  }
63
159
 
64
- static VALUE
65
- frt_term_set_text(VALUE self, VALUE rtext)
160
+ TermInfo *ti_set(TermInfo *ti, int doc_freq, int freq_pointer, int prox_pointer, int skip_offset)
66
161
  {
67
- int tlen;
68
- char *text;
69
- GET_TERM;
70
- tlen = RSTRING(rtext)->len;
71
- text = RSTRING(rtext)->ptr;
162
+ ti->doc_freq = doc_freq;
163
+ ti->freq_pointer = freq_pointer;
164
+ ti->prox_pointer = prox_pointer;
165
+ ti->skip_offset = skip_offset;
166
+ return ti;
167
+ }
72
168
 
73
- REALLOC_N(term->text, char, tlen + 1);
74
-
75
- MEMCPY(term->text, text, char, tlen);
76
- term->tlen = tlen;
77
-
78
- return Qnil;
169
+ void ti_destroy(void *p)
170
+ {
171
+ free(p);
79
172
  }
80
173
 
81
- static VALUE
82
- frt_term_get_field(VALUE self)
174
+ TermInfo *ti_cpy(TermInfo *ti, TermInfo *other)
83
175
  {
84
- GET_TERM;
85
- return term->field;
176
+ memcpy(ti, other, sizeof(TermInfo));
177
+ return ti;
86
178
  }
87
179
 
88
- static VALUE
89
- frt_term_set_field(VALUE self, VALUE rfield)
180
+ TermInfo *ti_clone(TermInfo *other)
90
181
  {
91
- GET_TERM;
92
- term->field = rfield;
93
- return Qnil;
182
+ return ti_create(other->doc_freq,
183
+ other->freq_pointer, other->prox_pointer, other->skip_offset);
94
184
  }
95
185
 
96
- VALUE
97
- frt_term_to_s(VALUE self)
186
+ int ti_eq(TermInfo *ti, TermInfo *other)
98
187
  {
99
- int tlen, flen;
100
- char delim[] = ":";
101
- char *res;
102
- GET_TERM;
103
- tlen = term->tlen;
104
- flen = RSTRING(term->field)->len;
105
- res = alloca(flen + tlen + 1);
188
+ return (memcmp(ti, other, sizeof(TermInfo)) == 0);
189
+ }
190
+
191
+ /****************************************************************************
192
+ *
193
+ * TermEnum
194
+ *
195
+ ****************************************************************************/
196
+
197
+ TermEnum *te_create()
198
+ {
199
+ TermEnum *te = ALLOC(TermEnum);
200
+ te->tb_curr = tb_create();
201
+ te->tb_prev = tb_create();
202
+ te->ti_curr = ti_create(0, 0, 0, 0);
203
+ return te;
204
+ }
205
+
206
+ void te_destroy(void *p)
207
+ {
208
+ TermEnum *te = (TermEnum *)p;
209
+ tb_destroy(te->tb_curr);
210
+ tb_destroy(te->tb_prev);
211
+ ti_destroy(te->ti_curr);
212
+ free(p);
213
+ }
214
+
215
+ Term *te_get_term(TermEnum *te)
216
+ {
217
+ return tb_get_term(te->tb_curr);
218
+ }
219
+
220
+ TermInfo *te_get_ti(TermEnum *te)
221
+ {
222
+ TermInfo *ti = te->ti_curr;
223
+ return ti_create(ti->doc_freq, ti->freq_pointer, ti->prox_pointer, ti->skip_offset);
224
+ }
225
+
226
+ TermBuffer *te_skip_to(TermEnum *te, Term *t)
227
+ {
228
+ TermBuffer *tb_curr;
229
+ if (tb_term_cmp(te->tb_curr, t) == 0)
230
+ return te->tb_curr;
231
+
232
+ while (((tb_curr = te->next(te)) != NULL) &&
233
+ (tb_term_cmp(tb_curr, t) < 0)) {
234
+ }
235
+ return tb_curr;
236
+ }
237
+
238
+ /****************************************************************************
239
+ *
240
+ * SegmentTermEnum
241
+ *
242
+ ****************************************************************************/
243
+
244
+ #define GET_STE SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
245
+
246
+ TermBuffer *ste_next(TermEnum *te)
247
+ {
248
+ GET_STE;
249
+ InStream *is = ste->is;
250
+ ste->pos++;
251
+ if (ste->pos > ste->size - 1) {
252
+ tb_reset(te->tb_curr);
253
+ return NULL;
254
+ }
255
+
256
+ tb_cpy(te->tb_prev, te->tb_curr);
257
+ tb_read(te->tb_curr, is, ste->fis);
106
258
 
107
- MEMCPY(res, StringValuePtr(term->field), char, flen);
108
- MEMCPY(res + flen, delim, char, 1);
109
- MEMCPY(res + flen + 1, term->text, char, tlen);
110
- return rb_str_new(res, tlen + flen + 1 );
259
+ TermInfo *ti = te->ti_curr;
260
+ ti->doc_freq = is_read_vint(is); // read doc freq
261
+ ti->freq_pointer += is_read_vint(is); // read freq pointer
262
+ ti->prox_pointer += is_read_vint(is); // read prox pointer
263
+
264
+ if (ste->format == -1) {
265
+ // just read skip_offset in order to increment file pointer
266
+ // value is never used since skip_to is switched off
267
+ if (!ste->is_index) {
268
+ if (ti->doc_freq > ste->format_m1skip_interval)
269
+ ti->skip_offset = is_read_vint(is);
270
+ }
271
+ } else {
272
+ if (ti->doc_freq >= ste->skip_interval)
273
+ ti->skip_offset = is_read_vint(is);
274
+ }
275
+
276
+ if (ste->is_index)
277
+ ste->index_pointer += is_read_vint(is); // read index pointer
278
+
279
+ return te->tb_curr;
111
280
  }
112
281
 
113
- inline int
114
- frt_term_cmp(Term *t1, Term *t2)
282
+ TermEnum *ste_clone(TermEnum *other_te);
283
+ TermEnum *ste_allocate()
115
284
  {
116
- int comp, size, my_len, o_len;
285
+ TermEnum *te = te_create();
286
+ te->next = &ste_next;
287
+ te->close = &ste_close;
288
+ te->clone = &ste_clone;
289
+ SegmentTermEnum *ste =
290
+ ALLOC(SegmentTermEnum);
291
+ te->data = ste;
292
+ return te;
293
+ }
294
+
295
+ TermEnum *ste_clone(TermEnum *other_te)
296
+ {
297
+ SegmentTermEnum *other_ste = (SegmentTermEnum *)other_te->data;
298
+ TermEnum *te = ste_allocate();
299
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
300
+ memcpy(ste, other_ste, sizeof(SegmentTermEnum));
301
+ ste->is = is_clone(other_ste->is);
302
+ tb_cpy(te->tb_curr, other_te->tb_curr);
303
+ tb_cpy(te->tb_prev, other_te->tb_prev);
304
+ ti_cpy(te->ti_curr, other_te->ti_curr);
305
+ return te;
306
+ }
307
+
308
+ void ste_close(TermEnum *te)
309
+ {
310
+ GET_STE;
311
+ is_close(ste->is);
312
+ free(ste);
313
+ te->data = NULL;
314
+ te_destroy(te);
315
+ }
316
+
317
+ TermEnum *ste_create(InStream *is, FieldInfos *fis, int is_index)
318
+ {
319
+ TermEnum *te = ste_allocate();
320
+ GET_STE;
321
+ ste->fis = fis;
322
+ ste->is_index = is_index;
323
+ ste->is = is;
324
+ ste->pos = -1;
325
+ ste->index_pointer = 0;
326
+ ste->format_m1skip_interval = -1;
327
+
328
+ int first_int = is_read_int(is);
329
+
330
+ if (first_int >= 0) {
331
+ // original-format file, without explicit format version number
332
+ ste->format = 0;
333
+ ste->size = first_int;
334
+
335
+ // back-compatible settings
336
+ ste->index_interval = 128;
337
+ ste->skip_interval = INT_MAX; // switch off skip_to optimization
338
+
339
+ } else {
340
+ // check that it is a format we can understand
341
+ if (first_int < TERM_INFO_FORMAT)
342
+ eprintf(ERROR, "Unknown format version:%d", first_int);
343
+
344
+ // we have a format version number
345
+ ste->format = first_int;
346
+
347
+
348
+ ste->size = is_read_long(is); // read the size
117
349
 
118
- my_len = RSTRING(t1->field)->len;
119
- o_len = RSTRING(t2->field)->len;
120
- size = my_len >= o_len ? o_len : my_len;
121
- comp = memcmp(RSTRING(t1->field)->ptr, RSTRING(t2->field)->ptr, size);
122
- if (comp == 0) {
123
- if (my_len == o_len) {
124
- my_len = t1->tlen;
125
- o_len = t2->tlen;
126
- size = my_len >= o_len ? o_len : my_len;
127
- comp = memcmp(t1->text, t2->text, size);
128
- if(comp == 0 && my_len != o_len)
129
- comp = my_len > o_len ? 1 : -1;
350
+ if (ste->format == -1) {
351
+ if (!ste->is_index) {
352
+ ste->index_interval = is_read_int(is);
353
+ ste->format_m1skip_interval = is_read_int(is);
354
+ }
355
+ // switch off skip_to optimization for file format prior to
356
+ // 1.4rc2 in order to avoid a bug in skip_to implementation
357
+ // of these versions
358
+ ste->skip_interval = INT_MAX;
130
359
  } else {
131
- comp = my_len > o_len ? 1 : -1;
360
+ ste->index_interval = is_read_int(is);
361
+ ste->skip_interval = is_read_int(is);
132
362
  }
133
363
  }
134
- return comp;
364
+ return te;
365
+ }
366
+
367
+ void ste_seek(TermEnum *te, int pointer, int pos, Term *t, TermInfo *ti)
368
+ {
369
+ GET_STE;
370
+ is_seek(ste->is, pointer);
371
+ ste->pos = pos;
372
+ tb_set_term(te->tb_curr, t);
373
+ tb_reset(te->tb_prev);
374
+ ti_cpy(te->ti_curr, ti);
375
+ }
376
+
377
+ TermInfo *ste_scan_for_term_info(TermEnum *te, Term *t)
378
+ {
379
+ te_skip_to(te, t);
380
+
381
+ if (tb_term_cmp(te->tb_curr, t) == 0) {
382
+ return te_get_ti(te);
383
+ } else {
384
+ return NULL;
385
+ }
135
386
  }
136
387
 
137
- int
138
- frt_term_compare_to_int(VALUE self, VALUE rother)
388
+ Term *ste_scan_for_term(TermEnum *te, int pos)
139
389
  {
140
- Term *other;
141
- GET_TERM;
142
- Data_Get_Struct(rother, Term, other);
143
- return frt_term_cmp(term, other);
390
+ GET_STE;
391
+ while (ste->pos < pos) {
392
+ if (ste_next(te) == NULL)
393
+ return NULL;
394
+ }
395
+
396
+ return te_get_term(te);
144
397
  }
145
398
 
146
- VALUE
147
- frt_term_lt(VALUE self, VALUE rother)
399
+ /****************************************************************************
400
+ *
401
+ * MultiTermEnum
402
+ *
403
+ ****************************************************************************/
404
+
405
+ #define GET_MTE MultiTermEnum *mte = (MultiTermEnum *)te->data;
406
+
407
+ TermBuffer *mte_next(TermEnum *te)
148
408
  {
149
- return frt_term_compare_to_int(self, rother) < 0 ? Qtrue : Qfalse;
409
+ GET_MTE;
410
+ SegmentMergeInfo *top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
411
+
412
+ if (top == NULL) {
413
+ tb_reset(te->tb_curr);
414
+ return false;
415
+ }
416
+
417
+ tb_cpy(te->tb_prev, te->tb_curr);
418
+ tb_cpy(te->tb_curr, top->tb);
419
+
420
+ te->ti_curr->doc_freq = 0;
421
+
422
+ while ((top != NULL) && (tb_cmp(te->tb_curr, top->tb) == 0)) {
423
+ pq_pop(mte->smi_queue);
424
+ te->ti_curr->doc_freq += top->te->ti_curr->doc_freq; // increment freq
425
+ if (smi_next(top)) {
426
+ pq_push(mte->smi_queue, top); // restore queue
427
+ } else {
428
+ smi_destroy(top); // done with a segment
429
+ }
430
+ top = (SegmentMergeInfo *)pq_top(mte->smi_queue);
431
+ }
432
+ return te->tb_curr;
150
433
  }
151
434
 
152
- VALUE
153
- frt_term_gt(VALUE self, VALUE rother)
435
+ void mte_close(TermEnum *te)
154
436
  {
155
- return frt_term_compare_to_int(self, rother) > 0 ? Qtrue : Qfalse;
437
+ GET_MTE;
438
+ pq_clear(mte->smi_queue);
439
+ pq_destroy(mte->smi_queue);
440
+ free(mte);
441
+ te_destroy(te);
156
442
  }
157
443
 
158
- VALUE
159
- frt_term_le(VALUE self, VALUE rother)
444
+ TermEnum *mte_clone(TermEnum *te)
160
445
  {
161
- return frt_term_compare_to_int(self, rother) <= 0 ? Qtrue : Qfalse;
446
+ eprintf(ERROR, "MultiTermEnum does not support cloning");
447
+ return NULL;
162
448
  }
163
449
 
164
- VALUE
165
- frt_term_ge(VALUE self, VALUE rother)
450
+ TermEnum *mte_create(IndexReader **readers, int *starts, int rcnt, Term *t)
166
451
  {
167
- return frt_term_compare_to_int(self, rother) >= 0 ? Qtrue : Qfalse;
452
+ int i;
453
+ TermEnum *te = te_create();
454
+ te->next = &mte_next;
455
+ te->clone = &mte_clone;
456
+ te->close = &mte_close;
457
+
458
+ MultiTermEnum *mte = ALLOC(MultiTermEnum);
459
+ te->data = mte;
460
+
461
+ IndexReader *reader;
462
+ TermEnum *sub_te;
463
+
464
+ mte->smi_queue = pq_create(rcnt, &smi_lt);
465
+ mte->smi_queue->free_elem = &smi_destroy;
466
+
467
+ for (i = 0; i < rcnt; i++) {
468
+ reader = readers[i];
469
+
470
+ if (t != NULL) {
471
+ sub_te = reader->terms_from(reader, t);
472
+ } else {
473
+ sub_te = reader->terms(reader);
474
+ }
475
+
476
+ SegmentMergeInfo *smi = smi_create(starts[i], sub_te, reader);
477
+ if (((t == NULL) && smi_next(smi)) ||
478
+ (sub_te->tb_curr->field != (char *)EMPTY_STRING)) {
479
+ pq_push(mte->smi_queue, smi); // initialize queue
480
+ } else {
481
+ smi_destroy(smi);
482
+ }
483
+ }
484
+
485
+ if ((t != NULL) && (mte->smi_queue->count > 0)) {
486
+ mte_next(te);
487
+ }
488
+
489
+ return te;
168
490
  }
169
491
 
170
- VALUE
171
- frt_term_eq(VALUE self, VALUE rother)
492
+ /****************************************************************************
493
+ *
494
+ * TermInfosWriter
495
+ *
496
+ ****************************************************************************/
497
+
498
+ const Term EmptyTerm = {"", ""};
499
+
500
+ TermInfosWriter *tiw_open_internal(Store *store,
501
+ char *segment,
502
+ FieldInfos *fis,
503
+ int interval,
504
+ int is_index)
172
505
  {
173
- if (rother == Qnil)
174
- return Qfalse;
175
- return frt_term_compare_to_int(self, rother) == 0 ? Qtrue : Qfalse;
506
+ TermInfosWriter *tiw = ALLOC(TermInfosWriter);
507
+ tiw->index_interval = interval;
508
+ tiw->skip_interval = 16;
509
+ tiw->last_index_pointer = 0;
510
+ tiw->last_term = (Term *)&EmptyTerm;
511
+ tiw->last_term_info = ti_create(0,0,0,0);
512
+ tiw->size = 0;
513
+ tiw->is_index = is_index;
514
+ tiw->fis = fis;
515
+ tiw->curr_field = NULL;
516
+ tiw->curr_field_num = -1;
517
+
518
+ char fname[SEGMENT_NAME_MAX_LENGTH];
519
+ strcpy(fname, segment);
520
+ strcat(fname, (is_index ? ".tii" : ".tis"));
521
+ OutStream *os = tiw->os = store->create_output(store, fname);
522
+ os_write_int(os, TERM_INFO_FORMAT); // write format
523
+ os_write_long(os, 0); // leave space for size
524
+ os_write_int(os, tiw->index_interval); // write index_interval
525
+ os_write_int(os, tiw->skip_interval); // write skip_interval
526
+ if (!is_index) {
527
+ tiw->other = tiw_open_internal(store, segment, fis, interval, true);
528
+ tiw->other->other = tiw;
529
+ }
530
+ return tiw;
176
531
  }
177
532
 
533
+ TermInfosWriter *tiw_open(Store *store, char *segment, FieldInfos *fis, int interval)
534
+ {
535
+ return tiw_open_internal(store, segment, fis, interval, false);
536
+ }
178
537
 
179
- static VALUE
180
- frt_term_compare_to(VALUE self, VALUE other)
538
+ void tiw_write_term(TermInfosWriter *tiw, OutStream *os, Term *t)
181
539
  {
182
- return INT2FIX(frt_term_compare_to_int(self, other));
540
+ //printf("%s, %s\n", tiw->last_term->text, t->text);
541
+ int start = hlp_string_diff(tiw->last_term->text, t->text);
542
+ int length = strlen(t->text) - start;
543
+
544
+ os_write_vint(os, start); // write shared prefix length
545
+ os_write_vint(os, length); // write delta length
546
+ os_write_chars(os, t->text, start, length); // write delta chars
547
+ if (tiw->curr_field != t->field) {
548
+ tiw->curr_field = t->field;
549
+ tiw->curr_field_num = fis_get_number(tiw->fis, t->field);
550
+ }
551
+ os_write_vint(os, tiw->curr_field_num);
552
+ tiw->last_term = t;
183
553
  }
184
554
 
185
- static VALUE
186
- frt_term_hash(VALUE self)
555
+ void tiw_add(TermInfosWriter *tiw, Term *t, TermInfo *ti)
187
556
  {
188
- GET_TERM;
189
- return INT2FIX(frt_hash(term->text, term->tlen) +
190
- frt_hash(RSTRING(term->field)->ptr, RSTRING(term->field)->len));
557
+ if (tiw->is_index && term_cmp(tiw->last_term, t) > 0) {
558
+ eprintf(STATE_ERROR,
559
+ "term out of order %s < %s", t->text, tiw->last_term->text);
560
+ }
561
+ if (ti->freq_pointer < tiw->last_term_info->freq_pointer) {
562
+ eprintf(STATE_ERROR, "freq pointer out of order");
563
+ }
564
+ if (ti->prox_pointer < tiw->last_term_info->prox_pointer) {
565
+ eprintf(STATE_ERROR, "prox pointer out of order");
566
+ }
567
+
568
+ if (!tiw->is_index && (tiw->size % tiw->index_interval) == 0)
569
+ tiw_add(tiw->other, tiw->last_term, tiw->last_term_info); // add an index term
570
+
571
+ tiw_write_term(tiw, tiw->os, t); // write term
572
+ os_write_vint(tiw->os, ti->doc_freq); // write doc freq
573
+ os_write_vint(tiw->os, ti->freq_pointer - tiw->last_term_info->freq_pointer);
574
+ os_write_vint(tiw->os, ti->prox_pointer - tiw->last_term_info->prox_pointer);
575
+ if (ti->doc_freq >= tiw->skip_interval)
576
+ os_write_vint(tiw->os, ti->skip_offset);
577
+
578
+ if (tiw->is_index) {
579
+ OutStream *other_os = tiw->other->os;
580
+ int other_pos = os_pos(other_os);
581
+ os_write_vint(tiw->os, other_pos - tiw->last_index_pointer);
582
+ tiw->last_index_pointer = other_pos; // write pointer
583
+ }
584
+
585
+ ti_cpy(tiw->last_term_info, ti);
586
+ tiw->size++;
587
+ }
588
+
589
+ void tiw_close(TermInfosWriter *tiw)
590
+ {
591
+ OutStream *os = tiw->os;
592
+ os_seek(os, 4); // write @size after format
593
+ os_write_long(os, tiw->size);
594
+ os_close(os);
595
+
596
+ if (!tiw->is_index)
597
+ tiw_close(tiw->other);
598
+
599
+ ti_destroy(tiw->last_term_info);
600
+ free(tiw);
191
601
  }
192
602
 
193
603
  /****************************************************************************
194
604
  *
195
- * Init Function
605
+ * TermInfosReader
196
606
  *
197
607
  ****************************************************************************/
198
608
 
199
- void
200
- Init_term(void)
201
- {
202
- /* Term */
203
- cTerm = rb_define_class_under(mIndex, "Term", rb_cObject);
204
- rb_define_alloc_func(cTerm, frt_term_alloc);
205
- rb_include_module(cTerm, rb_mComparable);
206
-
207
- rb_define_method(cTerm, "initialize", frt_term_init, 2);
208
- rb_define_method(cTerm, "set!", frt_term_set, 2);
209
- rb_define_method(cTerm, "to_s", frt_term_to_s, 0);
210
- rb_define_method(cTerm, "<=>", frt_term_compare_to, 1);
211
- rb_define_method(cTerm, "<", frt_term_lt, 1);
212
- rb_define_method(cTerm, ">", frt_term_gt, 1);
213
- rb_define_method(cTerm, "<=", frt_term_le, 1);
214
- rb_define_method(cTerm, ">=", frt_term_ge, 1);
215
- rb_define_method(cTerm, "eql?", frt_term_eq, 1);
216
- rb_define_method(cTerm, "==", frt_term_eq, 1);
217
- rb_define_method(cTerm, "text", frt_term_get_text, 0);
218
- rb_define_method(cTerm, "text=", frt_term_set_text, 1);
219
- rb_define_method(cTerm, "field", frt_term_get_field, 0);
220
- rb_define_method(cTerm, "field=", frt_term_set_field, 1);
221
- rb_define_method(cTerm, "hash", frt_term_hash, 0);
609
+ void tir_close(TermInfosReader *tir)
610
+ {
611
+ int i;
612
+ if (tir->index_terms != NULL) {
613
+ for (i = 0; i < tir->index_size; i++) {
614
+ term_destroy(tir->index_terms[i]);
615
+ ti_destroy(tir->index_term_infos[i]);
616
+ }
617
+ free(tir->index_terms);
618
+ free(tir->index_term_infos);
619
+ free(tir->index_pointers);
620
+ }
621
+ if (tir->orig_te) tir->orig_te->close(tir->orig_te);
622
+ thread_key_delete(tir->thread_te);
623
+ ary_destroy(tir->te_bucket);
624
+ if (tir->index_te) tir->index_te->close(tir->index_te);
625
+ mutex_destroy(&tir->mutex);
626
+ free(tir);
627
+ }
628
+
629
+ TermInfosReader *tir_open(Store *store, char *segment, FieldInfos *fis)
630
+ {
631
+ TermInfosReader *tir = ALLOC(TermInfosReader);
632
+ char fname[SEGMENT_NAME_MAX_LENGTH];
633
+ mutex_init(&tir->mutex, NULL);
634
+ strcpy(fname, segment);
635
+ strcpy(fname + strlen(segment), ".tis");
636
+ InStream *is = store->open_input(store, fname);
637
+ tir->orig_te = ste_create(is, fis, false);
638
+ thread_key_create(&tir->thread_te, NULL);
639
+ tir->te_bucket = ary_create(1, (destroy_func_t)tir->orig_te->close);
640
+
641
+ SegmentTermEnum *ste = tir->orig_te->data;
642
+ tir->size = ste->size;
643
+ tir->skip_interval = ste->skip_interval;
644
+
645
+ strcpy(fname + strlen(segment), ".tii");
646
+ is = store->open_input(store, fname);
647
+ tir->index_te = ste_create(is, fis, true);
648
+ tir->index_terms = NULL;
649
+ tir->index_term_infos = NULL;
650
+ tir->index_pointers = NULL;
651
+ return tir;
652
+ }
653
+
654
+ void tir_ensure_index_is_read(TermInfosReader *tir)
655
+ {
656
+ mutex_lock(&tir->mutex);
657
+ if (tir->index_terms == NULL) {
658
+ int index_size = ((SegmentTermEnum *)tir->index_te->data)->size;
659
+ tir->index_size = index_size;
660
+
661
+ tir->index_terms = ALLOC_N(Term *, index_size);
662
+ tir->index_term_infos = ALLOC_N(TermInfo *, index_size);
663
+ tir->index_pointers = ALLOC_N(int, index_size);
664
+
665
+ int i = 0;
666
+ TermEnum *index_te = tir->index_te;
667
+ SegmentTermEnum *ste = index_te->data;
668
+
669
+ while (ste_next(index_te) != NULL) {
670
+ tir->index_terms[i] = te_get_term(index_te);
671
+ tir->index_term_infos[i] = te_get_ti(index_te);
672
+ tir->index_pointers[i] = ste->index_pointer;
673
+ i++;
674
+ }
675
+
676
+ index_te->close(index_te);
677
+ tir->index_te = NULL;
678
+ }
679
+ mutex_unlock(&tir->mutex);
680
+ }
681
+
682
+ static inline TermEnum *tir_enum(TermInfosReader *tir)
683
+ {
684
+ TermEnum *te;
685
+ if ((te = thread_getspecific(tir->thread_te)) == NULL) {
686
+ te = tir->orig_te->clone(tir->orig_te);
687
+ ary_append(tir->te_bucket, te);
688
+ thread_setspecific(tir->thread_te, te);
689
+ }
690
+ return te;
222
691
  }
692
+
693
+ void tir_seek_enum(TermInfosReader *tir, int ind_offset)
694
+ {
695
+ TermEnum *te = tir_enum(tir);
696
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
697
+ ste_seek(te, tir->index_pointers[ind_offset],
698
+ (ind_offset * ste->index_interval) - 1,
699
+ tir->index_terms[ind_offset],
700
+ tir->index_term_infos[ind_offset]);
701
+ }
702
+
703
+ int tir_get_index_offset(TermInfosReader *tir, Term *t)
704
+ {
705
+ int lo = 0; // binary search tir->index_terms[]
706
+ int hi = tir->index_size - 1;
707
+ int mid, delta;
708
+ Term **index_terms = tir->index_terms;
709
+
710
+ while (hi >= lo) {
711
+ mid = (lo + hi) >> 1;
712
+ delta = term_cmp(t, index_terms[mid]);
713
+ if (delta < 0) {
714
+ hi = mid - 1;
715
+ } else if (delta > 0) {
716
+ lo = mid + 1;
717
+ } else {
718
+ return mid;
719
+ }
720
+ }
721
+ return hi;
722
+ }
723
+
724
+ TermInfo *tir_get_ti(TermInfosReader *tir, Term *t)
725
+ {
726
+ if (tir->size == 0)
727
+ return NULL;
728
+
729
+ tir_ensure_index_is_read(tir);
730
+
731
+ // optimize sequential access: first try scanning cached enum w/o seeking
732
+ TermEnum *te = tir_enum(tir);
733
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
734
+ if (ste->pos < ste->size && tb_term_cmp(te->tb_curr, t) <= 0) {
735
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
736
+ int enum_offset = (int)(ste->pos / ste->index_interval) + 1;
737
+ if (tir->index_size == enum_offset ||
738
+ term_cmp(t, tir->index_terms[enum_offset]) < 0) { // but before end of block
739
+ return ste_scan_for_term_info(te, t); // no need to seek
740
+ }
741
+ }
742
+
743
+ // random-access: must seek
744
+ tir_seek_enum(tir, tir_get_index_offset(tir, t));
745
+ return ste_scan_for_term_info(te, t);
746
+ }
747
+
748
+ Term *tir_get_term(TermInfosReader *tir, int pos)
749
+ {
750
+ if (tir->size == 0)
751
+ return NULL;
752
+
753
+ TermEnum *te = tir_enum(tir);
754
+ SegmentTermEnum *ste = (SegmentTermEnum *)te->data;
755
+ if (pos >= ste->pos &&
756
+ pos < (ste->pos + ste->index_interval)) {
757
+ return ste_scan_for_term(te, pos); // can avoid seek
758
+ }
759
+
760
+ tir_seek_enum(tir, (int)(pos / ste->index_interval)); // must seek
761
+ return ste_scan_for_term(te, pos);
762
+ }
763
+
764
+ int tir_get_term_pos(TermInfosReader *tir, Term *t)
765
+ {
766
+ if (tir->size == 0)
767
+ return -1;
768
+
769
+ tir_ensure_index_is_read(tir);
770
+
771
+ int ind_offset = tir_get_index_offset(tir, t);
772
+ tir_seek_enum(tir, ind_offset);
773
+
774
+ TermEnum *te = tir_enum(tir);
775
+ while ((tb_term_cmp(te->tb_curr, t) < 0) && (ste_next(te) != NULL))
776
+ ;
777
+
778
+ if (tb_term_cmp(te->tb_curr, t) == 0)
779
+ return ((SegmentTermEnum *)te->data)->pos;
780
+ else
781
+ return -1;
782
+ }
783
+