ferret 0.3.2 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/vector.c ADDED
@@ -0,0 +1,594 @@
1
+ #include <index.h>
2
+ #include <string.h>
3
+ #include <helper.h>
4
+
5
+ #define TERM_ARR_START_SIZE 16
6
+ #define FIELD_ARR_START_SIZE 8
7
+
8
+ TVOffsetInfo *tvoi_create(int start, int end)
9
+ {
10
+ TVOffsetInfo *tvoi = ALLOC(TVOffsetInfo);
11
+ tvoi->start = start;
12
+ tvoi->end = end;
13
+ return tvoi;
14
+ }
15
+
16
/**
 * Release a TVOffsetInfo record.
 *
 * @param p untyped pointer to the record; it is passed straight to free()
 */
void tvoi_destroy(void *p)
{
  free(p);
}
20
+
21
+ TVField *tvf_create(int number, int store_positions, int store_offsets)
22
+ {
23
+ TVField *tvf = ALLOC(TVField);
24
+ tvf->tvf_pointer = 0;
25
+ tvf->number = number;
26
+ tvf->store_positions = store_positions;
27
+ tvf->store_offsets = store_offsets;
28
+ return tvf;
29
+ }
30
+
31
/**
 * Release a TVField record.
 *
 * @param p untyped pointer to the record; it is passed straight to free()
 */
void tvf_destroy(void *p)
{
  free(p);
}
35
+
36
+ TVTerm *tvt_create(char *text, int freq, int *positions, TVOffsetInfo **offsets)
37
+ {
38
+ TVTerm *tvt = ALLOC(TVTerm);
39
+ tvt->text = text;
40
+ tvt->freq = freq;
41
+ tvt->positions = positions;
42
+ tvt->offsets = offsets;
43
+ return tvt;
44
+ }
45
+
46
/**
 * Release a TVTerm record.
 *
 * Only the struct itself is freed. The text, positions and offsets it points
 * to are stored by reference in tvt_create() and are deliberately left
 * untouched here.
 * NOTE(review): presumably that data stays owned by whoever supplied it to
 * tvw_add_term() -- confirm against callers.
 *
 * @param p untyped pointer to the TVTerm
 */
void tvt_destroy(void *p)
{
  free(p);
}
60
+
61
+
62
+ TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis)
63
+ {
64
+ TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
65
+ tvw->curr_field = NULL;
66
+ tvw->curr_doc_pointer = -1;
67
+
68
+ // Open files for TermVector storage
69
+ char fname[SEGMENT_NAME_MAX_LENGTH];
70
+ int segment_len = strlen(segment);
71
+ strcpy(fname, segment);
72
+
73
+ strcpy(fname + segment_len, TVX_EXTENSION);
74
+ OutStream *os = tvw->tvx = store->create_output(store, fname);
75
+ os_write_int(os, FORMAT_VERSION);
76
+
77
+ strcpy(fname + segment_len, TVD_EXTENSION);
78
+ os = tvw->tvd = store->create_output(store, fname);
79
+ os_write_int(os, FORMAT_VERSION);
80
+
81
+ strcpy(fname + segment_len, TVF_EXTENSION);
82
+ os = tvw->tvf = store->create_output(store, fname);
83
+ os_write_int(os, FORMAT_VERSION);
84
+
85
+ tvw->fis = fis;
86
+
87
+ tvw->fields = NULL;
88
+ tvw->fcnt = 0;
89
+ tvw->fsize = 0;
90
+ tvw->terms = NULL;
91
+ tvw->tcnt = 0;
92
+ tvw->tsize = 0;
93
+
94
+ return tvw;
95
+ }
96
+
97
+ void tvw_write_field(TermVectorsWriter *tvw)
98
+ {
99
+ int i, j, start, length;
100
+ char *last_term_text;
101
+ TVOffsetInfo *tmp_offset;
102
+ // remember where this field is written
103
+ OutStream *tvf = tvw->tvf;
104
+ tvw->curr_field->tvf_pointer = os_pos(tvf);
105
+
106
+ // write the number of terms
107
+ os_write_vint(tvf, tvw->tcnt);
108
+
109
+ int store_positions = tvw->curr_field->store_positions;
110
+ int store_offsets = tvw->curr_field->store_offsets;
111
+ int bits = 0x0;
112
+ if (store_positions)
113
+ bits |= STORE_POSITIONS_WITH_TERMVECTOR;
114
+
115
+ if (store_offsets)
116
+ bits |= STORE_OFFSET_WITH_TERMVECTOR;
117
+
118
+ os_write_byte(tvf, bits);
119
+
120
+ last_term_text = (char *)EMPTY_STRING;
121
+ TVTerm **terms = tvw->terms;
122
+ TVTerm *term;
123
+ for (i = 0; i < tvw->tcnt; i++) {
124
+ term = terms[i];
125
+ start = hlp_string_diff(last_term_text, term->text);
126
+ length = strlen(term->text) - start;
127
+ os_write_vint(tvf, start); // write shared prefix length
128
+ os_write_vint(tvf, length); // write delta length
129
+ os_write_chars(tvf, term->text, start, length); // write delta chars
130
+ os_write_vint(tvf, term->freq);
131
+ last_term_text = term->text;
132
+
133
+ if (store_positions) {
134
+ if (term->positions == NULL)
135
+ eprintf(IO_ERROR, "Trying to write positions that are null!");
136
+
137
+ // use delta encoding for positions
138
+ int last_pos = 0;
139
+ for (j = 0; j < term->freq; j++) {
140
+ os_write_vint(tvf, term->positions[j] - last_pos);
141
+ last_pos = term->positions[j];
142
+ }
143
+ }
144
+
145
+ if (store_offsets) {
146
+ if (term->offsets == NULL)
147
+ eprintf(IO_ERROR, "Trying to write offsets that are null!");
148
+
149
+ // use delta encoding for offsets
150
+ int last_end = 0;
151
+ for (j = 0; j < term->freq; j++) {
152
+ tmp_offset = term->offsets[j];
153
+ os_write_vint(tvf, tmp_offset->start - last_end);
154
+
155
+ // save the diff between the two.
156
+ os_write_vint(tvf, tmp_offset->end - tmp_offset->start);
157
+ last_end = tmp_offset->end;
158
+ }
159
+ }
160
+ }
161
+ }
162
+
163
+ void tvw_close_field(TermVectorsWriter *tvw)
164
+ {
165
+ int i;
166
+ if (tvw->curr_field != NULL) {
167
+ // save field and terms
168
+ tvw_write_field(tvw);
169
+
170
+ if (tvw->fcnt >= tvw->fsize) {
171
+ tvw->fsize *=2;
172
+ if (tvw->fsize < FIELD_ARR_START_SIZE)
173
+ tvw->fsize = FIELD_ARR_START_SIZE;
174
+ REALLOC_N(tvw->fields, TVField *, tvw->fsize);
175
+ }
176
+ tvw->fields[tvw->fcnt] = tvw->curr_field;
177
+ tvw->fcnt++;
178
+
179
+ for (i = 0; i < tvw->tcnt; i++) {
180
+ tvt_destroy(tvw->terms[i]);
181
+ }
182
+ tvw->tcnt = 0;
183
+
184
+ tvw->curr_field = NULL;
185
+ }
186
+ }
187
+
188
+ void tvw_create_field(TermVectorsWriter *tvw,
189
+ int field_number, int store_position, int store_offset)
190
+ {
191
+ tvw_close_field(tvw);
192
+ tvw->curr_field = tvf_create(field_number, store_position, store_offset);
193
+ }
194
+
195
+ void tvw_open_field(TermVectorsWriter *tvw, char *field)
196
+ {
197
+ FieldInfo *fi = fis_get_fi(tvw->fis, field);
198
+ tvw_create_field(tvw, fi->number, fi->store_pos, fi->store_offset);
199
+ }
200
+
201
+ void tvw_write_doc(TermVectorsWriter *tvw)
202
+ {
203
+ if (tvw->curr_field != NULL)
204
+ eprintf(STATE_ERROR, "Field is still open while writing document");
205
+
206
+ // puts("Writing doc pointer: " + @curr_doc_pointer)
207
+ // write document index record
208
+ os_write_long(tvw->tvx, tvw->curr_doc_pointer);
209
+
210
+ OutStream *tvd = tvw->tvd;
211
+ // write the number of @fields
212
+ os_write_vint(tvd, tvw->fcnt);
213
+
214
+ // write field numbers
215
+ int i;
216
+ TVField **fields = tvw->fields;
217
+ for (i = 0; i < tvw->fcnt; i++) {
218
+ os_write_vint(tvd, fields[i]->number);
219
+ }
220
+
221
+ // write field pointers
222
+ int last_field_pointer = 0;
223
+ for (i = 0; i < tvw->fcnt; i++) {
224
+ os_write_vint(tvd, fields[i]->tvf_pointer - last_field_pointer);
225
+ last_field_pointer = fields[i]->tvf_pointer;
226
+ }
227
+ }
228
+
229
+ void tvw_close_doc(TermVectorsWriter *tvw)
230
+ {
231
+ int i;
232
+ if (tvw->curr_doc_pointer >= 0) {
233
+ tvw_close_field(tvw);
234
+ tvw_write_doc(tvw);
235
+
236
+ for (i = 0; i < tvw->fcnt; i++) {
237
+ tvf_destroy(tvw->fields[i]);
238
+ }
239
+ tvw->fcnt = 0;
240
+ tvw->curr_doc_pointer = -1;
241
+ }
242
+ }
243
+
244
+ void tvw_open_doc(TermVectorsWriter *tvw)
245
+ {
246
+ tvw_close_doc(tvw);
247
+ tvw->curr_doc_pointer = os_pos(tvw->tvd);
248
+ }
249
+
250
+ void tvw_add_term(TermVectorsWriter *tvw,
251
+ char *text, int freq, int *positions, TVOffsetInfo **offsets)
252
+ {
253
+ if (tvw->tcnt >= tvw->tsize) {
254
+ tvw->tsize *= 2;
255
+ if (tvw->tsize < TERM_ARR_START_SIZE)
256
+ tvw->tsize = TERM_ARR_START_SIZE;
257
+ REALLOC_N(tvw->terms, TVTerm *, tvw->tsize);
258
+ }
259
+ tvw->terms[tvw->tcnt] = tvt_create(text, freq, positions, offsets);
260
+ tvw->tcnt++;
261
+ }
262
+
263
+ void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors)
264
+ {
265
+ tvw_open_doc(tvw);
266
+
267
+ int i, j, store_positions, store_offsets;
268
+ TermVector *tv;
269
+ for (i = 0; i < vectors->size; i++) {
270
+ tv = vectors->elems[i];
271
+
272
+ store_positions = (tv->tcnt > 0 && tv->positions != NULL);
273
+ store_offsets = (tv->tcnt > 0 && tv->offsets != NULL);
274
+
275
+ tvw_create_field(tvw, fis_get_number(tvw->fis, tv->field),
276
+ store_positions, store_offsets);
277
+
278
+ if (store_positions && store_offsets) {
279
+ for (j = 0; j < tv->tcnt; j++)
280
+ tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], tv->offsets[j]);
281
+ } else if (store_positions) {
282
+ for (j = 0; j < tv->tcnt; j++)
283
+ tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], NULL);
284
+ } else if (store_offsets) {
285
+ for (j = 0; j < tv->tcnt; j++)
286
+ tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, tv->offsets[j]);
287
+ } else {
288
+ for (j = 0; j < tv->tcnt; j++)
289
+ tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, NULL);
290
+ }
291
+ tvw_close_field(tvw);
292
+ }
293
+ tvw_close_doc(tvw);
294
+ }
295
+
296
+ void tvw_close(TermVectorsWriter *tvw)
297
+ {
298
+ tvw_close_doc(tvw);
299
+ os_close(tvw->tvx);
300
+ os_close(tvw->tvd);
301
+ os_close(tvw->tvf);
302
+ free(tvw->terms);
303
+ free(tvw->fields);
304
+ free(tvw);
305
+ }
306
+
307
+ TermVector *tv_create(
308
+ const char *field,
309
+ char **terms,
310
+ int tcnt,
311
+ int *freqs,
312
+ int **positions,
313
+ TVOffsetInfo ***offsets)
314
+ {
315
+ TermVector *tv =
316
+ ALLOC(TermVector);
317
+ tv->field = (char *)field;
318
+ tv->terms = terms;
319
+ tv->tcnt = tcnt;
320
+ tv->freqs = freqs;
321
+ tv->positions = positions;
322
+ tv->offsets = offsets;
323
+ return tv;
324
+ }
325
+
326
+ void tv_destroy(void *p)
327
+ {
328
+ int i, j;
329
+ TermVector *tv = (TermVector *)p;
330
+ for (i = 0; i < tv->tcnt; i++) {
331
+ free(tv->terms[i]);
332
+ }
333
+ free(tv->terms);
334
+ if (tv->positions != NULL) {
335
+ for (i = 0; i < tv->tcnt; i++) {
336
+ free(tv->positions[i]);
337
+ }
338
+ free(tv->positions);
339
+ }
340
+ if (tv->offsets != NULL) {
341
+ for (i = 0; i < tv->tcnt; i++) {
342
+ for (j = 0; j < tv->freqs[i]; j++) {
343
+ tvoi_destroy(tv->offsets[i][j]);
344
+ }
345
+ free(tv->offsets[i]);
346
+ }
347
+ free(tv->offsets);
348
+ }
349
+ free(tv->freqs);
350
+ free(p);
351
+ }
352
+
353
+ void tv_destroy_except_data(void *p)
354
+ {
355
+ TermVector *tv = (TermVector *)p;
356
+ free(tv->terms);
357
+ if (tv->positions != NULL) {
358
+ free(tv->positions);
359
+ }
360
+ if (tv->offsets != NULL) {
361
+ free(tv->offsets);
362
+ }
363
+ free(tv->freqs);
364
+ free(p);
365
+ }
366
+
367
+ int tvr_check_valid_format(InStream *is)
368
+ {
369
+ int format = is_read_int(is);
370
+ if (format > FORMAT_VERSION)
371
+ eprintf(ERROR, "Incompatible format version: %d expected %d or less",
372
+ format, FORMAT_VERSION);
373
+ return format;
374
+ }
375
+
376
+ TermVectorsReader *tvr_clone(TermVectorsReader *orig)
377
+ {
378
+ TermVectorsReader *clone = NULL;
379
+ if (orig->tvx && orig->tvd && orig->tvf) {
380
+ clone = ALLOC(TermVectorsReader);
381
+ memcpy(clone, orig, sizeof(TermVectorsReader));
382
+ clone->tvx = is_clone(orig->tvx);
383
+ clone->tvd = is_clone(orig->tvd);
384
+ clone->tvf = is_clone(orig->tvf);
385
+ }
386
+ return clone;
387
+ }
388
+
389
+ TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis)
390
+ {
391
+ TermVectorsReader *tvr = ALLOC(TermVectorsReader);
392
+ // Open files for TermVector storage
393
+ char fname[SEGMENT_NAME_MAX_LENGTH];
394
+ int segment_len = strlen(segment);
395
+ strcpy(fname, segment);
396
+
397
+ strcpy(fname + segment_len, TVX_EXTENSION);
398
+ InStream *is = tvr->tvx = store->open_input(store, fname);
399
+ tvr_check_valid_format(is);
400
+ tvr->size = is_length(is)/8;
401
+
402
+ strcpy(fname + segment_len, TVD_EXTENSION);
403
+ is = tvr->tvd = store->open_input(store, fname);
404
+ tvr->tvd_format = tvr_check_valid_format(is);
405
+
406
+ strcpy(fname + segment_len, TVF_EXTENSION);
407
+ is = tvr->tvf = store->open_input(store, fname);
408
+ tvr->tvf_format = tvr_check_valid_format(is);
409
+
410
+ tvr->fis = fis;
411
+ return tvr;
412
+ }
413
+
414
+ void tvr_close(TermVectorsReader *tvr)
415
+ {
416
+ is_close(tvr->tvx);
417
+ is_close(tvr->tvd);
418
+ is_close(tvr->tvf);
419
+ free(tvr);
420
+ }
421
+
422
+ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
423
+ char *field, int tvf_pointer)
424
+ {
425
+ int i, j, store_positions, store_offsets, bits;
426
+ // Now read the data from specified position
427
+ // We don't need to offset by the FORMAT here since the pointer
428
+ // already includes the offset
429
+ is_seek(tvr->tvf, tvf_pointer);
430
+
431
+ int num_terms = is_read_vint(tvr->tvf);
432
+ // If no terms - return a constant empty termvector. However, this should
433
+ // never occur!
434
+ if (num_terms == 0)
435
+ return tv_create(field, NULL, 0, NULL, NULL, NULL);
436
+
437
+ if(tvr->tvf_format == FORMAT_VERSION) {
438
+ bits = is_read_byte(tvr->tvf);
439
+ store_positions = ((bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0);
440
+ store_offsets = ((bits & STORE_OFFSET_WITH_TERMVECTOR) != 0);
441
+ } else {
442
+ is_read_vint(tvr->tvf);
443
+ store_positions = false;
444
+ store_offsets = false;
445
+ }
446
+
447
+ char **terms = ALLOC_N(char *, num_terms);
448
+ int *term_freqs = ALLOC_N(int, num_terms);
449
+
450
+ // we may not need these, but declare them
451
+ int **positions = NULL;
452
+ TVOffsetInfo ***offsets = NULL;
453
+
454
+ if(store_positions)
455
+ positions = ALLOC_N(int *, num_terms);
456
+
457
+ if(store_offsets)
458
+ offsets = ALLOC_N(TVOffsetInfo **, num_terms);
459
+
460
+ int start, delta_length, total_length, freq, prev_pos;
461
+ int start_offset, end_offset, prev_offset;
462
+ int *pos;
463
+ TVOffsetInfo **offs;
464
+ char buffer[MAX_WORD_SIZE] = "";
465
+
466
+ for (i = 0; i < num_terms; i++) {
467
+ start = is_read_vint(tvr->tvf);
468
+ delta_length = is_read_vint(tvr->tvf);
469
+ total_length = start + delta_length;
470
+ is_read_chars(tvr->tvf, buffer, start, delta_length);
471
+ buffer[total_length] = '\0';
472
+ terms[i] = estrdup(buffer);
473
+ freq = is_read_vint(tvr->tvf);
474
+ term_freqs[i] = freq;
475
+
476
+ if (store_positions) {//read in the positions
477
+ pos = ALLOC_N(int, freq);
478
+ positions[i] = pos;
479
+ prev_pos = 0;
480
+ for (j = 0; j < freq; j++) {
481
+ pos[j] = prev_pos + is_read_vint(tvr->tvf);
482
+ prev_pos = pos[j];
483
+ }
484
+ }
485
+
486
+ if (store_offsets) {
487
+ offs = ALLOC_N(TVOffsetInfo *, freq);
488
+ offsets[i] = offs;
489
+ prev_offset = 0;
490
+ for (j = 0; j < freq; j++) {
491
+ start_offset = prev_offset + is_read_vint(tvr->tvf);
492
+ end_offset = start_offset + is_read_vint(tvr->tvf);
493
+ offs[j] = tvoi_create(start_offset, end_offset);
494
+ prev_offset = end_offset;
495
+ }
496
+ }
497
+ }
498
+ return tv_create(field, terms, num_terms, term_freqs, positions, offsets);
499
+ }
500
+
501
+ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
502
+ {
503
+ int i;
504
+ Array *tvs = NULL;
505
+ // Check if no term vectors are available for this segment at all
506
+ if (tvr->tvx != NULL) {
507
+ // We need to offset by
508
+ is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
509
+
510
+ int position = is_read_long(tvr->tvx);
511
+
512
+ is_seek(tvr->tvd, position);
513
+ int field_count = is_read_vint(tvr->tvd);
514
+
515
+ // No fields are vectorized for this document
516
+ if (field_count > 0) {
517
+ int number = 0;
518
+ char **fields = ALLOC_N(char *, field_count);
519
+
520
+ for (i = 0; i < field_count; i++) {
521
+ if (tvr->tvd_format == FORMAT_VERSION)
522
+ number = is_read_vint(tvr->tvd);
523
+ else
524
+ number += is_read_vint(tvr->tvd);
525
+
526
+ fields[i] = tvr->fis->by_number[number]->name;
527
+ }
528
+
529
+ // Compute position in the tvf file
530
+ int position = 0;
531
+ int *tvf_pointers = ALLOC_N(int, field_count);
532
+ for (i = 0; i < field_count; i++) {
533
+ position += is_read_vint(tvr->tvd);
534
+ tvf_pointers[i] = position;
535
+ }
536
+
537
+ tvs = ary_create(field_count, &tv_destroy);
538
+ for (i = 0; i < field_count; i++) {
539
+ ary_append(tvs, tvr_read_term_vector(tvr, fields[i], tvf_pointers[i]));
540
+ }
541
+ free(fields);
542
+ free(tvf_pointers);
543
+ }
544
+ }
545
+ return tvs;
546
+ }
547
+
548
+ TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field)
549
+ {
550
+ int i;
551
+ // Check if no term vectors are available for this segment at all
552
+ int field_number = fis_get_number(tvr->fis, field);
553
+ TermVector *tv = NULL;
554
+
555
+ if (tvr->tvx != NULL) {
556
+ // We need to account for the FORMAT_SIZE at when seeking in the @tvx
557
+ // We don't need to do this in other seeks because we already have the
558
+ // file pointer that was written in another file
559
+ is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
560
+ // puts("TVX Pointer: " + @tvx.pos())
561
+ int pos = is_read_long(tvr->tvx);
562
+
563
+ is_seek(tvr->tvd, pos);
564
+ int field_count = is_read_vint(tvr->tvd);
565
+ //puts("Num Fields: " + field_count)
566
+ // There are only a few fields per document. We opt for a full scan
567
+ // rather then requiring that they be ordered. We need to read through
568
+ // all of the fields anyway to get to the tvf pointers.
569
+ int number = 0;
570
+ int found = -1;
571
+
572
+ for (i = 0; i < field_count; i++) {
573
+ if (tvr->tvd_format == FORMAT_VERSION)
574
+ number = is_read_vint(tvr->tvd);
575
+ else
576
+ number += is_read_vint(tvr->tvd);
577
+
578
+ if (number == field_number)
579
+ found = i;
580
+ }
581
+
582
+ // This field, although valid in the segment, was not found in this
583
+ // document
584
+ if (found != -1) {
585
+ // Compute pos in the @tvf file
586
+ pos = 0;
587
+ for (i = 0; i <= found; i++)
588
+ pos += is_read_vint(tvr->tvd);
589
+
590
+ tv = tvr_read_term_vector(tvr, field, pos);
591
+ }
592
+ }
593
+ return tv;
594
+ }