ferret 0.3.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/vector.c ADDED
@@ -0,0 +1,594 @@
1
+ #include <index.h>
2
+ #include <string.h>
3
+ #include <helper.h>
4
+
5
+ #define TERM_ARR_START_SIZE 16
6
+ #define FIELD_ARR_START_SIZE 8
7
+
8
+ TVOffsetInfo *tvoi_create(int start, int end)
9
+ {
10
+ TVOffsetInfo *tvoi = ALLOC(TVOffsetInfo);
11
+ tvoi->start = start;
12
+ tvoi->end = end;
13
+ return tvoi;
14
+ }
15
+
16
/**
 * Release a TVOffsetInfo.
 *
 * @param p pointer to the record; it is handed straight to free(), so
 *          passing NULL is harmless
 */
void tvoi_destroy(void *p)
{
    free(p);
}
20
+
21
+ TVField *tvf_create(int number, int store_positions, int store_offsets)
22
+ {
23
+ TVField *tvf = ALLOC(TVField);
24
+ tvf->tvf_pointer = 0;
25
+ tvf->number = number;
26
+ tvf->store_positions = store_positions;
27
+ tvf->store_offsets = store_offsets;
28
+ return tvf;
29
+ }
30
+
31
/**
 * Release a TVField.
 *
 * @param p pointer to the record; it is handed straight to free(), so
 *          passing NULL is harmless
 */
void tvf_destroy(void *p)
{
    free(p);
}
35
+
36
+ TVTerm *tvt_create(char *text, int freq, int *positions, TVOffsetInfo **offsets)
37
+ {
38
+ TVTerm *tvt = ALLOC(TVTerm);
39
+ tvt->text = text;
40
+ tvt->freq = freq;
41
+ tvt->positions = positions;
42
+ tvt->offsets = offsets;
43
+ return tvt;
44
+ }
45
+
46
/**
 * Release a TVTerm.
 *
 * Only the TVTerm struct itself is freed. Its text, positions and offsets
 * members are left untouched: tvt_create() stores the caller's pointers
 * without copying them.
 * NOTE(review): presumably that data stays owned by whoever supplied it
 * (e.g. the TermVector handed to tvw_add_all_doc_vectors) -- confirm
 * against the other callers of tvw_add_term.
 *
 * @param p pointer to the TVTerm; passing NULL is harmless
 */
void tvt_destroy(void *p)
{
    free(p);
}
60
+
61
+
62
+ TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis)
63
+ {
64
+ TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
65
+ tvw->curr_field = NULL;
66
+ tvw->curr_doc_pointer = -1;
67
+
68
+ // Open files for TermVector storage
69
+ char fname[SEGMENT_NAME_MAX_LENGTH];
70
+ int segment_len = strlen(segment);
71
+ strcpy(fname, segment);
72
+
73
+ strcpy(fname + segment_len, TVX_EXTENSION);
74
+ OutStream *os = tvw->tvx = store->create_output(store, fname);
75
+ os_write_int(os, FORMAT_VERSION);
76
+
77
+ strcpy(fname + segment_len, TVD_EXTENSION);
78
+ os = tvw->tvd = store->create_output(store, fname);
79
+ os_write_int(os, FORMAT_VERSION);
80
+
81
+ strcpy(fname + segment_len, TVF_EXTENSION);
82
+ os = tvw->tvf = store->create_output(store, fname);
83
+ os_write_int(os, FORMAT_VERSION);
84
+
85
+ tvw->fis = fis;
86
+
87
+ tvw->fields = NULL;
88
+ tvw->fcnt = 0;
89
+ tvw->fsize = 0;
90
+ tvw->terms = NULL;
91
+ tvw->tcnt = 0;
92
+ tvw->tsize = 0;
93
+
94
+ return tvw;
95
+ }
96
+
97
+ void tvw_write_field(TermVectorsWriter *tvw)
98
+ {
99
+ int i, j, start, length;
100
+ char *last_term_text;
101
+ TVOffsetInfo *tmp_offset;
102
+ // remember where this field is written
103
+ OutStream *tvf = tvw->tvf;
104
+ tvw->curr_field->tvf_pointer = os_pos(tvf);
105
+
106
+ // write the number of terms
107
+ os_write_vint(tvf, tvw->tcnt);
108
+
109
+ int store_positions = tvw->curr_field->store_positions;
110
+ int store_offsets = tvw->curr_field->store_offsets;
111
+ int bits = 0x0;
112
+ if (store_positions)
113
+ bits |= STORE_POSITIONS_WITH_TERMVECTOR;
114
+
115
+ if (store_offsets)
116
+ bits |= STORE_OFFSET_WITH_TERMVECTOR;
117
+
118
+ os_write_byte(tvf, bits);
119
+
120
+ last_term_text = (char *)EMPTY_STRING;
121
+ TVTerm **terms = tvw->terms;
122
+ TVTerm *term;
123
+ for (i = 0; i < tvw->tcnt; i++) {
124
+ term = terms[i];
125
+ start = hlp_string_diff(last_term_text, term->text);
126
+ length = strlen(term->text) - start;
127
+ os_write_vint(tvf, start); // write shared prefix length
128
+ os_write_vint(tvf, length); // write delta length
129
+ os_write_chars(tvf, term->text, start, length); // write delta chars
130
+ os_write_vint(tvf, term->freq);
131
+ last_term_text = term->text;
132
+
133
+ if (store_positions) {
134
+ if (term->positions == NULL)
135
+ eprintf(IO_ERROR, "Trying to write positions that are null!");
136
+
137
+ // use delta encoding for positions
138
+ int last_pos = 0;
139
+ for (j = 0; j < term->freq; j++) {
140
+ os_write_vint(tvf, term->positions[j] - last_pos);
141
+ last_pos = term->positions[j];
142
+ }
143
+ }
144
+
145
+ if (store_offsets) {
146
+ if (term->offsets == NULL)
147
+ eprintf(IO_ERROR, "Trying to write offsets that are null!");
148
+
149
+ // use delta encoding for offsets
150
+ int last_end = 0;
151
+ for (j = 0; j < term->freq; j++) {
152
+ tmp_offset = term->offsets[j];
153
+ os_write_vint(tvf, tmp_offset->start - last_end);
154
+
155
+ // save the diff between the two.
156
+ os_write_vint(tvf, tmp_offset->end - tmp_offset->start);
157
+ last_end = tmp_offset->end;
158
+ }
159
+ }
160
+ }
161
+ }
162
+
163
+ void tvw_close_field(TermVectorsWriter *tvw)
164
+ {
165
+ int i;
166
+ if (tvw->curr_field != NULL) {
167
+ // save field and terms
168
+ tvw_write_field(tvw);
169
+
170
+ if (tvw->fcnt >= tvw->fsize) {
171
+ tvw->fsize *=2;
172
+ if (tvw->fsize < FIELD_ARR_START_SIZE)
173
+ tvw->fsize = FIELD_ARR_START_SIZE;
174
+ REALLOC_N(tvw->fields, TVField *, tvw->fsize);
175
+ }
176
+ tvw->fields[tvw->fcnt] = tvw->curr_field;
177
+ tvw->fcnt++;
178
+
179
+ for (i = 0; i < tvw->tcnt; i++) {
180
+ tvt_destroy(tvw->terms[i]);
181
+ }
182
+ tvw->tcnt = 0;
183
+
184
+ tvw->curr_field = NULL;
185
+ }
186
+ }
187
+
188
+ void tvw_create_field(TermVectorsWriter *tvw,
189
+ int field_number, int store_position, int store_offset)
190
+ {
191
+ tvw_close_field(tvw);
192
+ tvw->curr_field = tvf_create(field_number, store_position, store_offset);
193
+ }
194
+
195
+ void tvw_open_field(TermVectorsWriter *tvw, char *field)
196
+ {
197
+ FieldInfo *fi = fis_get_fi(tvw->fis, field);
198
+ tvw_create_field(tvw, fi->number, fi->store_pos, fi->store_offset);
199
+ }
200
+
201
+ void tvw_write_doc(TermVectorsWriter *tvw)
202
+ {
203
+ if (tvw->curr_field != NULL)
204
+ eprintf(STATE_ERROR, "Field is still open while writing document");
205
+
206
+ // puts("Writing doc pointer: " + @curr_doc_pointer)
207
+ // write document index record
208
+ os_write_long(tvw->tvx, tvw->curr_doc_pointer);
209
+
210
+ OutStream *tvd = tvw->tvd;
211
+ // write the number of @fields
212
+ os_write_vint(tvd, tvw->fcnt);
213
+
214
+ // write field numbers
215
+ int i;
216
+ TVField **fields = tvw->fields;
217
+ for (i = 0; i < tvw->fcnt; i++) {
218
+ os_write_vint(tvd, fields[i]->number);
219
+ }
220
+
221
+ // write field pointers
222
+ int last_field_pointer = 0;
223
+ for (i = 0; i < tvw->fcnt; i++) {
224
+ os_write_vint(tvd, fields[i]->tvf_pointer - last_field_pointer);
225
+ last_field_pointer = fields[i]->tvf_pointer;
226
+ }
227
+ }
228
+
229
+ void tvw_close_doc(TermVectorsWriter *tvw)
230
+ {
231
+ int i;
232
+ if (tvw->curr_doc_pointer >= 0) {
233
+ tvw_close_field(tvw);
234
+ tvw_write_doc(tvw);
235
+
236
+ for (i = 0; i < tvw->fcnt; i++) {
237
+ tvf_destroy(tvw->fields[i]);
238
+ }
239
+ tvw->fcnt = 0;
240
+ tvw->curr_doc_pointer = -1;
241
+ }
242
+ }
243
+
244
+ void tvw_open_doc(TermVectorsWriter *tvw)
245
+ {
246
+ tvw_close_doc(tvw);
247
+ tvw->curr_doc_pointer = os_pos(tvw->tvd);
248
+ }
249
+
250
+ void tvw_add_term(TermVectorsWriter *tvw,
251
+ char *text, int freq, int *positions, TVOffsetInfo **offsets)
252
+ {
253
+ if (tvw->tcnt >= tvw->tsize) {
254
+ tvw->tsize *= 2;
255
+ if (tvw->tsize < TERM_ARR_START_SIZE)
256
+ tvw->tsize = TERM_ARR_START_SIZE;
257
+ REALLOC_N(tvw->terms, TVTerm *, tvw->tsize);
258
+ }
259
+ tvw->terms[tvw->tcnt] = tvt_create(text, freq, positions, offsets);
260
+ tvw->tcnt++;
261
+ }
262
+
263
+ void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors)
264
+ {
265
+ tvw_open_doc(tvw);
266
+
267
+ int i, j, store_positions, store_offsets;
268
+ TermVector *tv;
269
+ for (i = 0; i < vectors->size; i++) {
270
+ tv = vectors->elems[i];
271
+
272
+ store_positions = (tv->tcnt > 0 && tv->positions != NULL);
273
+ store_offsets = (tv->tcnt > 0 && tv->offsets != NULL);
274
+
275
+ tvw_create_field(tvw, fis_get_number(tvw->fis, tv->field),
276
+ store_positions, store_offsets);
277
+
278
+ if (store_positions && store_offsets) {
279
+ for (j = 0; j < tv->tcnt; j++)
280
+ tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], tv->offsets[j]);
281
+ } else if (store_positions) {
282
+ for (j = 0; j < tv->tcnt; j++)
283
+ tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], NULL);
284
+ } else if (store_offsets) {
285
+ for (j = 0; j < tv->tcnt; j++)
286
+ tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, tv->offsets[j]);
287
+ } else {
288
+ for (j = 0; j < tv->tcnt; j++)
289
+ tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, NULL);
290
+ }
291
+ tvw_close_field(tvw);
292
+ }
293
+ tvw_close_doc(tvw);
294
+ }
295
+
296
+ void tvw_close(TermVectorsWriter *tvw)
297
+ {
298
+ tvw_close_doc(tvw);
299
+ os_close(tvw->tvx);
300
+ os_close(tvw->tvd);
301
+ os_close(tvw->tvf);
302
+ free(tvw->terms);
303
+ free(tvw->fields);
304
+ free(tvw);
305
+ }
306
+
307
+ TermVector *tv_create(
308
+ const char *field,
309
+ char **terms,
310
+ int tcnt,
311
+ int *freqs,
312
+ int **positions,
313
+ TVOffsetInfo ***offsets)
314
+ {
315
+ TermVector *tv =
316
+ ALLOC(TermVector);
317
+ tv->field = (char *)field;
318
+ tv->terms = terms;
319
+ tv->tcnt = tcnt;
320
+ tv->freqs = freqs;
321
+ tv->positions = positions;
322
+ tv->offsets = offsets;
323
+ return tv;
324
+ }
325
+
326
+ void tv_destroy(void *p)
327
+ {
328
+ int i, j;
329
+ TermVector *tv = (TermVector *)p;
330
+ for (i = 0; i < tv->tcnt; i++) {
331
+ free(tv->terms[i]);
332
+ }
333
+ free(tv->terms);
334
+ if (tv->positions != NULL) {
335
+ for (i = 0; i < tv->tcnt; i++) {
336
+ free(tv->positions[i]);
337
+ }
338
+ free(tv->positions);
339
+ }
340
+ if (tv->offsets != NULL) {
341
+ for (i = 0; i < tv->tcnt; i++) {
342
+ for (j = 0; j < tv->freqs[i]; j++) {
343
+ tvoi_destroy(tv->offsets[i][j]);
344
+ }
345
+ free(tv->offsets[i]);
346
+ }
347
+ free(tv->offsets);
348
+ }
349
+ free(tv->freqs);
350
+ free(p);
351
+ }
352
+
353
+ void tv_destroy_except_data(void *p)
354
+ {
355
+ TermVector *tv = (TermVector *)p;
356
+ free(tv->terms);
357
+ if (tv->positions != NULL) {
358
+ free(tv->positions);
359
+ }
360
+ if (tv->offsets != NULL) {
361
+ free(tv->offsets);
362
+ }
363
+ free(tv->freqs);
364
+ free(p);
365
+ }
366
+
367
+ int tvr_check_valid_format(InStream *is)
368
+ {
369
+ int format = is_read_int(is);
370
+ if (format > FORMAT_VERSION)
371
+ eprintf(ERROR, "Incompatible format version: %d expected %d or less",
372
+ format, FORMAT_VERSION);
373
+ return format;
374
+ }
375
+
376
+ TermVectorsReader *tvr_clone(TermVectorsReader *orig)
377
+ {
378
+ TermVectorsReader *clone = NULL;
379
+ if (orig->tvx && orig->tvd && orig->tvf) {
380
+ clone = ALLOC(TermVectorsReader);
381
+ memcpy(clone, orig, sizeof(TermVectorsReader));
382
+ clone->tvx = is_clone(orig->tvx);
383
+ clone->tvd = is_clone(orig->tvd);
384
+ clone->tvf = is_clone(orig->tvf);
385
+ }
386
+ return clone;
387
+ }
388
+
389
+ TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis)
390
+ {
391
+ TermVectorsReader *tvr = ALLOC(TermVectorsReader);
392
+ // Open files for TermVector storage
393
+ char fname[SEGMENT_NAME_MAX_LENGTH];
394
+ int segment_len = strlen(segment);
395
+ strcpy(fname, segment);
396
+
397
+ strcpy(fname + segment_len, TVX_EXTENSION);
398
+ InStream *is = tvr->tvx = store->open_input(store, fname);
399
+ tvr_check_valid_format(is);
400
+ tvr->size = is_length(is)/8;
401
+
402
+ strcpy(fname + segment_len, TVD_EXTENSION);
403
+ is = tvr->tvd = store->open_input(store, fname);
404
+ tvr->tvd_format = tvr_check_valid_format(is);
405
+
406
+ strcpy(fname + segment_len, TVF_EXTENSION);
407
+ is = tvr->tvf = store->open_input(store, fname);
408
+ tvr->tvf_format = tvr_check_valid_format(is);
409
+
410
+ tvr->fis = fis;
411
+ return tvr;
412
+ }
413
+
414
+ void tvr_close(TermVectorsReader *tvr)
415
+ {
416
+ is_close(tvr->tvx);
417
+ is_close(tvr->tvd);
418
+ is_close(tvr->tvf);
419
+ free(tvr);
420
+ }
421
+
422
+ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
423
+ char *field, int tvf_pointer)
424
+ {
425
+ int i, j, store_positions, store_offsets, bits;
426
+ // Now read the data from specified position
427
+ // We don't need to offset by the FORMAT here since the pointer
428
+ // already includes the offset
429
+ is_seek(tvr->tvf, tvf_pointer);
430
+
431
+ int num_terms = is_read_vint(tvr->tvf);
432
+ // If no terms - return a constant empty termvector. However, this should
433
+ // never occur!
434
+ if (num_terms == 0)
435
+ return tv_create(field, NULL, 0, NULL, NULL, NULL);
436
+
437
+ if(tvr->tvf_format == FORMAT_VERSION) {
438
+ bits = is_read_byte(tvr->tvf);
439
+ store_positions = ((bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0);
440
+ store_offsets = ((bits & STORE_OFFSET_WITH_TERMVECTOR) != 0);
441
+ } else {
442
+ is_read_vint(tvr->tvf);
443
+ store_positions = false;
444
+ store_offsets = false;
445
+ }
446
+
447
+ char **terms = ALLOC_N(char *, num_terms);
448
+ int *term_freqs = ALLOC_N(int, num_terms);
449
+
450
+ // we may not need these, but declare them
451
+ int **positions = NULL;
452
+ TVOffsetInfo ***offsets = NULL;
453
+
454
+ if(store_positions)
455
+ positions = ALLOC_N(int *, num_terms);
456
+
457
+ if(store_offsets)
458
+ offsets = ALLOC_N(TVOffsetInfo **, num_terms);
459
+
460
+ int start, delta_length, total_length, freq, prev_pos;
461
+ int start_offset, end_offset, prev_offset;
462
+ int *pos;
463
+ TVOffsetInfo **offs;
464
+ char buffer[MAX_WORD_SIZE] = "";
465
+
466
+ for (i = 0; i < num_terms; i++) {
467
+ start = is_read_vint(tvr->tvf);
468
+ delta_length = is_read_vint(tvr->tvf);
469
+ total_length = start + delta_length;
470
+ is_read_chars(tvr->tvf, buffer, start, delta_length);
471
+ buffer[total_length] = '\0';
472
+ terms[i] = estrdup(buffer);
473
+ freq = is_read_vint(tvr->tvf);
474
+ term_freqs[i] = freq;
475
+
476
+ if (store_positions) {//read in the positions
477
+ pos = ALLOC_N(int, freq);
478
+ positions[i] = pos;
479
+ prev_pos = 0;
480
+ for (j = 0; j < freq; j++) {
481
+ pos[j] = prev_pos + is_read_vint(tvr->tvf);
482
+ prev_pos = pos[j];
483
+ }
484
+ }
485
+
486
+ if (store_offsets) {
487
+ offs = ALLOC_N(TVOffsetInfo *, freq);
488
+ offsets[i] = offs;
489
+ prev_offset = 0;
490
+ for (j = 0; j < freq; j++) {
491
+ start_offset = prev_offset + is_read_vint(tvr->tvf);
492
+ end_offset = start_offset + is_read_vint(tvr->tvf);
493
+ offs[j] = tvoi_create(start_offset, end_offset);
494
+ prev_offset = end_offset;
495
+ }
496
+ }
497
+ }
498
+ return tv_create(field, terms, num_terms, term_freqs, positions, offsets);
499
+ }
500
+
501
+ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
502
+ {
503
+ int i;
504
+ Array *tvs = NULL;
505
+ // Check if no term vectors are available for this segment at all
506
+ if (tvr->tvx != NULL) {
507
+ // We need to offset by
508
+ is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
509
+
510
+ int position = is_read_long(tvr->tvx);
511
+
512
+ is_seek(tvr->tvd, position);
513
+ int field_count = is_read_vint(tvr->tvd);
514
+
515
+ // No fields are vectorized for this document
516
+ if (field_count > 0) {
517
+ int number = 0;
518
+ char **fields = ALLOC_N(char *, field_count);
519
+
520
+ for (i = 0; i < field_count; i++) {
521
+ if (tvr->tvd_format == FORMAT_VERSION)
522
+ number = is_read_vint(tvr->tvd);
523
+ else
524
+ number += is_read_vint(tvr->tvd);
525
+
526
+ fields[i] = tvr->fis->by_number[number]->name;
527
+ }
528
+
529
+ // Compute position in the tvf file
530
+ int position = 0;
531
+ int *tvf_pointers = ALLOC_N(int, field_count);
532
+ for (i = 0; i < field_count; i++) {
533
+ position += is_read_vint(tvr->tvd);
534
+ tvf_pointers[i] = position;
535
+ }
536
+
537
+ tvs = ary_create(field_count, &tv_destroy);
538
+ for (i = 0; i < field_count; i++) {
539
+ ary_append(tvs, tvr_read_term_vector(tvr, fields[i], tvf_pointers[i]));
540
+ }
541
+ free(fields);
542
+ free(tvf_pointers);
543
+ }
544
+ }
545
+ return tvs;
546
+ }
547
+
548
+ TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field)
549
+ {
550
+ int i;
551
+ // Check if no term vectors are available for this segment at all
552
+ int field_number = fis_get_number(tvr->fis, field);
553
+ TermVector *tv = NULL;
554
+
555
+ if (tvr->tvx != NULL) {
556
+ // We need to account for the FORMAT_SIZE at when seeking in the @tvx
557
+ // We don't need to do this in other seeks because we already have the
558
+ // file pointer that was written in another file
559
+ is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
560
+ // puts("TVX Pointer: " + @tvx.pos())
561
+ int pos = is_read_long(tvr->tvx);
562
+
563
+ is_seek(tvr->tvd, pos);
564
+ int field_count = is_read_vint(tvr->tvd);
565
+ //puts("Num Fields: " + field_count)
566
+ // There are only a few fields per document. We opt for a full scan
567
+ // rather then requiring that they be ordered. We need to read through
568
+ // all of the fields anyway to get to the tvf pointers.
569
+ int number = 0;
570
+ int found = -1;
571
+
572
+ for (i = 0; i < field_count; i++) {
573
+ if (tvr->tvd_format == FORMAT_VERSION)
574
+ number = is_read_vint(tvr->tvd);
575
+ else
576
+ number += is_read_vint(tvr->tvd);
577
+
578
+ if (number == field_number)
579
+ found = i;
580
+ }
581
+
582
+ // This field, although valid in the segment, was not found in this
583
+ // document
584
+ if (found != -1) {
585
+ // Compute pos in the @tvf file
586
+ pos = 0;
587
+ for (i = 0; i <= found; i++)
588
+ pos += is_read_vint(tvr->tvd);
589
+
590
+ tv = tvr_read_term_vector(tvr, field, pos);
591
+ }
592
+ }
593
+ return tv;
594
+ }