ferret 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/termdocs.c CHANGED
@@ -1,4 +1,4 @@
1
- #include <index.h>
1
+ #include "index.h"
2
2
  #include <string.h>
3
3
 
4
4
  static char * const TPE_VS_TDE_ERROR_MSG = "TermPosEnum does not handle processing multiple documents in one call. Use TermDocEnum instead.";
@@ -59,22 +59,24 @@ bool stde_next(TermDocEnum *tde)
59
59
  SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
60
60
  while (true) {
61
61
 
62
- if (stde->count >= stde->doc_freq)
62
+ if (stde->count >= stde->doc_freq) {
63
63
  return false;
64
+ }
64
65
 
65
- doc_code = is_read_vint(stde->freq_in);
66
- stde->doc_num += doc_code >> 1; // shift off low bit
67
- if ((doc_code & 1) != 0) { // if low bit is set
68
- stde->freq = 1; // freq is one
66
+ doc_code = (int)is_read_vint(stde->freq_in);
67
+ stde->doc_num += doc_code >> 1; /* shift off low bit */
68
+ if ((doc_code & 1) != 0) { /* if low bit is set */
69
+ stde->freq = 1; /* freq is one */
69
70
  } else {
70
- stde->freq = is_read_vint(stde->freq_in); // else read freq
71
+ stde->freq = (int)is_read_vint(stde->freq_in); /* read freq */
71
72
  }
72
73
 
73
74
  stde->count++;
74
75
 
75
76
  if (stde->deleted_docs == NULL ||
76
- bv_get(stde->deleted_docs, stde->doc_num) == 0)
77
- break; // We found an undeleted doc so return
77
+ bv_get(stde->deleted_docs, stde->doc_num) == 0) {
78
+ break; /* We found an undeleted doc so return */
79
+ }
78
80
 
79
81
  stde->skip_prox(stde);
80
82
  }
@@ -90,41 +92,49 @@ int stde_freq(TermDocEnum *tde)
90
92
  bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
91
93
  {
92
94
  SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
93
- if (stde->doc_freq >= stde->skip_interval) { // optimized case
94
95
 
95
- if (stde->skip_in == NULL)
96
- stde->skip_in = is_clone(stde->freq_in); // lazily clone
96
+ if (stde->doc_freq >= stde->skip_interval) { /* optimized case */
97
+ int last_skip_doc;
98
+ int last_freq_pointer;
99
+ int last_prox_pointer;
100
+ int num_skipped;
101
+
102
+ if (stde->skip_in == NULL) {
103
+ stde->skip_in = is_clone(stde->freq_in); /* lazily clone */
104
+ }
97
105
 
98
- if (!stde->have_skipped) { // lazily seek skip stream
106
+ if (!stde->have_skipped) { /* lazily seek skip stream */
99
107
  is_seek(stde->skip_in, stde->skip_pointer);
100
108
  stde->have_skipped = true;
101
109
  }
102
110
 
103
- // scan skip data
104
- int last_skip_doc = stde->skip_doc;
105
- int last_freq_pointer = is_pos(stde->freq_in);
106
- int last_prox_pointer = -1;
107
- int num_skipped = -1 - (stde->count % stde->skip_interval);
111
+ /* scan skip data */
112
+ last_skip_doc = stde->skip_doc;
113
+ last_freq_pointer = is_pos(stde->freq_in);
114
+ last_prox_pointer = -1;
115
+ num_skipped = -1 - (stde->count % stde->skip_interval);
108
116
 
109
117
  while (target_doc_num > stde->skip_doc) {
110
118
  last_skip_doc = stde->skip_doc;
111
119
  last_freq_pointer = stde->freq_pointer;
112
120
  last_prox_pointer = stde->prox_pointer;
113
121
 
114
- if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num)
122
+ if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num) {
115
123
  num_skipped += stde->skip_interval;
124
+ }
116
125
 
117
- if(stde->skip_count >= stde->num_skips)
126
+ if(stde->skip_count >= stde->num_skips) {
118
127
  break;
128
+ }
119
129
 
120
- stde->skip_doc += is_read_vint(stde->skip_in);
121
- stde->freq_pointer += is_read_vint(stde->skip_in);
122
- stde->prox_pointer += is_read_vint(stde->skip_in);
130
+ stde->skip_doc += (int)is_read_vint(stde->skip_in);
131
+ stde->freq_pointer += (int)is_read_vint(stde->skip_in);
132
+ stde->prox_pointer += (int)is_read_vint(stde->skip_in);
123
133
 
124
134
  stde->skip_count++;
125
135
  }
126
136
 
127
- // if we found something to skip, so skip it
137
+ /* if we found something to skip, so skip it */
128
138
  if (last_freq_pointer > is_pos(stde->freq_in)) {
129
139
  is_seek(stde->freq_in, last_freq_pointer);
130
140
  stde->seek_prox(stde, last_prox_pointer);
@@ -134,7 +144,7 @@ bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
134
144
  }
135
145
  }
136
146
 
137
- // done skipping, now just scan
147
+ /* done skipping, now just scan */
138
148
  do {
139
149
  if (! tde->next(tde)) {
140
150
  return false;
@@ -148,13 +158,14 @@ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
148
158
  SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
149
159
  int i = 0, doc_code;
150
160
  while (i < req_num && stde->count < stde->doc_freq) {
151
- // manually inlined call to next() for speed
152
- doc_code = is_read_vint(stde->freq_in);
153
- stde->doc_num += doc_code >> 1; // shift off low bit
154
- if ((doc_code & 1) != 0) // if low bit is set
155
- stde->freq = 1; // freq is one
156
- else
157
- stde->freq = is_read_vint(stde->freq_in); // else read freq
161
+ /* manually inlined call to next() for speed */
162
+ doc_code = (int)is_read_vint(stde->freq_in);
163
+ stde->doc_num += (doc_code >> 1); /* shift off low bit */
164
+ if ((doc_code & 1) != 0) { /* if low bit is set */
165
+ stde->freq = 1; /* freq is one */
166
+ } else {
167
+ stde->freq = (int)is_read_vint(stde->freq_in); /* else read freq */
168
+ }
158
169
 
159
170
  stde->count++;
160
171
 
@@ -170,6 +181,7 @@ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
170
181
 
171
182
  TermDocEnum *stde_create(IndexReader *ir)
172
183
  {
184
+ SegmentTermDocEnum *stde = ALLOC_AND_ZERO(SegmentTermDocEnum);
173
185
  SegmentReader *sr = (SegmentReader *)ir->data;
174
186
  TermDocEnum *tde = ALLOC(TermDocEnum);
175
187
  tde->seek = &stde_seek;
@@ -181,8 +193,6 @@ TermDocEnum *stde_create(IndexReader *ir)
181
193
  tde->next_position = NULL;
182
194
  tde->close = &stde_close;
183
195
 
184
- SegmentTermDocEnum *stde = ALLOC(SegmentTermDocEnum);
185
- ZEROSET(stde, SegmentTermDocEnum, 1); // set all values to 0
186
196
  tde->data = stde;
187
197
  stde->parent = sr;
188
198
  stde->freq_in = is_clone(sr->freq_in);
@@ -260,11 +270,12 @@ int stpe_next_position(TermDocEnum *tde)
260
270
  {
261
271
  SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
262
272
  stde->prox_cnt--;
263
- return stde->position += is_read_vint(stde->prox_in);
273
+ return stde->position += (int)is_read_vint(stde->prox_in);
264
274
  }
265
275
 
266
276
  TermDocEnum *stpe_create(IndexReader *ir)
267
277
  {
278
+ SegmentTermDocEnum *stde;
268
279
  SegmentReader *sr = (SegmentReader *)ir->data;
269
280
  TermDocEnum *tde = stde_create(ir);
270
281
  tde->close = &stpe_close;
@@ -273,7 +284,7 @@ TermDocEnum *stpe_create(IndexReader *ir)
273
284
  tde->read = &stpe_read;
274
285
  tde->next_position = &stpe_next_position;
275
286
 
276
- SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
287
+ stde = (SegmentTermDocEnum *)tde->data;
277
288
  stde->prox_in = is_clone(sr->prox_in);
278
289
  stde->prox_cnt = 0;
279
290
  stde->position = 0;
@@ -321,16 +332,18 @@ TermDocEnum *mtde_term_docs_from_reader(IndexReader *ir)
321
332
 
322
333
  TermDocEnum *mtde_term_docs(MultiTermDocEnum *mtde, int i)
323
334
  {
324
- if (mtde->term == NULL)
335
+ if (mtde->term == NULL) {
325
336
  return NULL;
337
+ } else {
326
338
 
327
- TermDocEnum *tde = mtde->irs_tde[i];
328
- if (tde == NULL) {
329
- tde = mtde->irs_tde[i] = mtde->term_docs_from_reader(mtde->irs[i]);
330
- }
339
+ TermDocEnum *tde = mtde->irs_tde[i];
340
+ if (tde == NULL) {
341
+ tde = mtde->irs_tde[i] = mtde->term_docs_from_reader(mtde->irs[i]);
342
+ }
331
343
 
332
- tde->seek(tde, mtde->term);
333
- return tde;
344
+ tde->seek(tde, mtde->term);
345
+ return tde;
346
+ }
334
347
  }
335
348
 
336
349
  bool mtde_next(TermDocEnum *tde)
@@ -411,6 +424,7 @@ int mtde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
411
424
 
412
425
  TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
413
426
  {
427
+ MultiTermDocEnum *mtde = ALLOC_AND_ZERO(MultiTermDocEnum);
414
428
  TermDocEnum *tde = ALLOC(TermDocEnum);
415
429
  tde->close = &mtde_close;
416
430
  tde->seek = &mtde_seek;
@@ -421,8 +435,6 @@ TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
421
435
  tde->read = &mtde_read;
422
436
  tde->next_position = NULL;
423
437
 
424
- MultiTermDocEnum *mtde = ALLOC(MultiTermDocEnum);
425
- ZEROSET(mtde, MultiTermDocEnum, 1); // set all values to 0
426
438
  tde->data = mtde;
427
439
  mtde->irs = irs;
428
440
  mtde->starts = starts;
@@ -467,8 +479,7 @@ TermDocEnum *mtpe_create(IndexReader **irs, int *starts, int ir_cnt)
467
479
  ****************************************************************************/
468
480
 
469
481
  #define GET_MTDPE MultipleTermDocPosEnum *mtdpe = (MultipleTermDocPosEnum *)self->data
470
- void tde_destroy(void *p) {
471
- TermDocEnum *self = (TermDocEnum *)p;
482
+ void tde_destroy(TermDocEnum *self) {
472
483
  self->close(self);
473
484
  }
474
485
 
@@ -570,12 +581,26 @@ int mtdpe_next_position(TermDocEnum *self)
570
581
 
571
582
  TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
572
583
  {
584
+ int i;
573
585
  TermDocEnum *self = ALLOC(TermDocEnum);
574
- MultipleTermDocPosEnum *mtdpe = ALLOC(MultipleTermDocPosEnum);
586
+ MultipleTermDocPosEnum *mtdpe = ALLOC_AND_ZERO_N(MultipleTermDocPosEnum, 1);
575
587
  PriorityQueue *pq;
576
588
  TermDocEnum *tpe;
577
- int i;
578
589
 
590
+ pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
591
+ mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
592
+ mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
593
+ for (i = 0; i < t_cnt; i++) {
594
+ tpe = ir_term_positions_for(ir, terms[i]);
595
+ if (tpe->next(tpe)) {
596
+ pq_push(pq, tpe);
597
+ } else {
598
+ tpe->close(tpe);
599
+ }
600
+ }
601
+ pq->free_elem = (free_ft)&tde_destroy;
602
+
603
+ self->data = mtdpe;
579
604
  self->close = &mtdpe_close;
580
605
  self->seek = &mtdpe_seek;
581
606
  self->next = &mtdpe_next;
@@ -585,17 +610,6 @@ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
585
610
  self->read = &mtdpe_read;
586
611
  self->next_position = &mtdpe_next_position;
587
612
 
588
- ZEROSET(mtdpe, MultipleTermDocPosEnum, 1); // set all values to 0
589
- self->data = mtdpe;
590
- pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
591
- mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
592
- mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
593
- for (i = 0; i < t_cnt; i++) {
594
- tpe = ir_term_positions_for(ir, terms[i]);
595
- if (tpe->next(tpe)) pq_push(pq, tpe);
596
- }
597
- pq->free_elem = &tde_destroy;
598
-
599
613
  return self;
600
614
  }
601
615
 
data/ext/vector.c CHANGED
@@ -1,6 +1,6 @@
1
- #include <index.h>
1
+ #include "index.h"
2
+ #include "helper.h"
2
3
  #include <string.h>
3
- #include <helper.h>
4
4
 
5
5
  static char * const NULL_POS_ERROR_MSG = "Trying to write positions that are null!";
6
6
  static char * const NULL_OFFSETS_ERROR_MSG = "Trying to write offsets that are null!";
@@ -50,33 +50,25 @@ TVTerm *tvt_create(char *text, int freq, int *positions, TVOffsetInfo **offsets)
50
50
 
51
51
  void tvt_destroy(void *p)
52
52
  {
53
- //int i;
54
- //TVTerm *tvt = (TVTerm *)p;
55
- //free(tvt->text);
56
- //free(tvt->positions);
57
- //if (tvt->offsets != NULL) {
58
- // for (i = 0; i < tvt->freq; i++) {
59
- // tvoi_destroy(tvt->offsets[i]);
60
- // }
61
- // free(tvt->offsets);
62
- //}
63
53
  free(p);
64
54
  }
65
55
 
66
56
 
67
57
  TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis)
68
58
  {
59
+ char fname[SEGMENT_NAME_MAX_LENGTH];
60
+ size_t segment_len = strlen(segment);
69
61
  TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
62
+ OutStream *os;
63
+
70
64
  tvw->curr_field = NULL;
71
65
  tvw->curr_doc_pointer = -1;
72
66
 
73
- // Open files for TermVector storage
74
- char fname[SEGMENT_NAME_MAX_LENGTH];
75
- int segment_len = strlen(segment);
67
+ /* Open files for TermVector storage */
76
68
  strcpy(fname, segment);
77
69
 
78
70
  strcpy(fname + segment_len, TVX_EXTENSION);
79
- OutStream *os = tvw->tvx = store->create_output(store, fname);
71
+ os = tvw->tvx = store->create_output(store, fname);
80
72
  os_write_int(os, FORMAT_VERSION);
81
73
 
82
74
  strcpy(fname + segment_len, TVD_EXTENSION);
@@ -104,43 +96,48 @@ void tvw_write_field(TermVectorsWriter *tvw)
104
96
  int i, j, start, length;
105
97
  char *last_term_text;
106
98
  TVOffsetInfo *tmp_offset;
107
- // remember where this field is written
99
+ TVTerm **terms = tvw->terms;
100
+ TVTerm *term;
101
+ /* remember where this field is written */
108
102
  OutStream *tvf = tvw->tvf;
103
+ int store_positions = tvw->curr_field->store_positions;
104
+ int store_offsets = tvw->curr_field->store_offsets;
105
+ uchar bits = 0x0;
106
+
109
107
  tvw->curr_field->tvf_pointer = os_pos(tvf);
110
108
 
111
- // write the number of terms
109
+ /* write the number of terms */
112
110
  os_write_vint(tvf, tvw->tcnt);
113
111
 
114
- int store_positions = tvw->curr_field->store_positions;
115
- int store_offsets = tvw->curr_field->store_offsets;
116
- int bits = 0x0;
117
- if (store_positions)
112
+ if (store_positions) {
118
113
  bits |= STORE_POSITIONS_WITH_TERMVECTOR;
114
+ }
119
115
 
120
- if (store_offsets)
116
+ if (store_offsets) {
121
117
  bits |= STORE_OFFSET_WITH_TERMVECTOR;
118
+ }
122
119
 
123
- os_write_byte(tvf, bits);
120
+ os_write_byte(tvf, (uchar)bits);
124
121
 
125
122
  last_term_text = (char *)EMPTY_STRING;
126
- TVTerm **terms = tvw->terms;
127
- TVTerm *term;
128
123
  for (i = 0; i < tvw->tcnt; i++) {
129
124
  term = terms[i];
130
125
  start = hlp_string_diff(last_term_text, term->text);
131
- length = strlen(term->text) - start;
132
- os_write_vint(tvf, start); // write shared prefix length
133
- os_write_vint(tvf, length); // write delta length
134
- os_write_chars(tvf, term->text, start, length); // write delta chars
126
+ length = (int)strlen(term->text) - start;
127
+ os_write_vint(tvf, start); /* write shared prefix length */
128
+ os_write_vint(tvf, length); /* write delta length */
129
+ os_write_chars(tvf, term->text, start, length); /* write delta chars */
135
130
  os_write_vint(tvf, term->freq);
136
131
  last_term_text = term->text;
137
132
 
138
133
  if (store_positions) {
139
- if (term->positions == NULL)
140
- RAISE(IO_ERROR, NULL_POS_ERROR_MSG);
141
-
142
- // use delta encoding for positions
143
134
  int last_pos = 0;
135
+
136
+ if (term->positions == NULL) {
137
+ RAISE(IO_ERROR, NULL_POS_ERROR_MSG);
138
+ }
139
+
140
+ /* use delta encoding for positions */
144
141
  for (j = 0; j < term->freq; j++) {
145
142
  os_write_vint(tvf, term->positions[j] - last_pos);
146
143
  last_pos = term->positions[j];
@@ -148,16 +145,18 @@ void tvw_write_field(TermVectorsWriter *tvw)
148
145
  }
149
146
 
150
147
  if (store_offsets) {
151
- if (term->offsets == NULL)
152
- RAISE(IO_ERROR, NULL_OFFSETS_ERROR_MSG);
153
-
154
- // use delta encoding for offsets
155
148
  int last_end = 0;
149
+
150
+ if (term->offsets == NULL) {
151
+ RAISE(IO_ERROR, NULL_OFFSETS_ERROR_MSG);
152
+ }
153
+
154
+ /* use delta encoding for offsets */
156
155
  for (j = 0; j < term->freq; j++) {
157
156
  tmp_offset = term->offsets[j];
158
157
  os_write_vint(tvf, tmp_offset->start - last_end);
159
158
 
160
- // save the diff between the two.
159
+ /* save the diff between the two */
161
160
  os_write_vint(tvf, tmp_offset->end - tmp_offset->start);
162
161
  last_end = tmp_offset->end;
163
162
  }
@@ -169,13 +168,14 @@ void tvw_close_field(TermVectorsWriter *tvw)
169
168
  {
170
169
  int i;
171
170
  if (tvw->curr_field != NULL) {
172
- // save field and terms
171
+ /* save field and terms */
173
172
  tvw_write_field(tvw);
174
173
 
175
174
  if (tvw->fcnt >= tvw->fsize) {
176
175
  tvw->fsize *=2;
177
- if (tvw->fsize < FIELD_ARR_START_SIZE)
176
+ if (tvw->fsize < FIELD_ARR_START_SIZE) {
178
177
  tvw->fsize = FIELD_ARR_START_SIZE;
178
+ }
179
179
  REALLOC_N(tvw->fields, TVField *, tvw->fsize);
180
180
  }
181
181
  tvw->fields[tvw->fcnt] = tvw->curr_field;
@@ -205,30 +205,34 @@ void tvw_open_field(TermVectorsWriter *tvw, char *field)
205
205
 
206
206
  void tvw_write_doc(TermVectorsWriter *tvw)
207
207
  {
208
- if (tvw->curr_field != NULL)
208
+ OutStream *tvd = tvw->tvd;
209
+ int i;
210
+ TVField **fields = tvw->fields;
211
+ int last_field_pointer = 0;
212
+
213
+ if (tvw->curr_field != NULL) {
209
214
  RAISE(STATE_ERROR, FIELD_OPEN_ERROR_MSG);
215
+ }
210
216
 
211
- // puts("Writing doc pointer: " + @curr_doc_pointer)
212
- // write document index record
217
+ //printf("Writing doc pointer: %d\n", tvw->curr_doc_pointer);
218
+ /* write document index record */
213
219
  os_write_long(tvw->tvx, tvw->curr_doc_pointer);
214
220
 
215
- OutStream *tvd = tvw->tvd;
216
- // write the number of @fields
221
+ //printf("Writing field count: %ld, %d, %d -> ", (long long)tvw, tvw->fcnt, os_pos(tvd));
222
+ /* write the number of @fields */
217
223
  os_write_vint(tvd, tvw->fcnt);
218
-
219
- // write field numbers
220
- int i;
221
- TVField **fields = tvw->fields;
224
+
225
+ /* write field numbers */
222
226
  for (i = 0; i < tvw->fcnt; i++) {
223
227
  os_write_vint(tvd, fields[i]->number);
224
228
  }
225
229
 
226
- // write field pointers
227
- int last_field_pointer = 0;
230
+ /* write field pointers */
228
231
  for (i = 0; i < tvw->fcnt; i++) {
229
232
  os_write_vint(tvd, fields[i]->tvf_pointer - last_field_pointer);
230
233
  last_field_pointer = fields[i]->tvf_pointer;
231
234
  }
235
+ //printf("%d\n", os_pos(tvw->tvd));
232
236
  }
233
237
 
234
238
  void tvw_close_doc(TermVectorsWriter *tvw)
@@ -257,8 +261,10 @@ void tvw_add_term(TermVectorsWriter *tvw,
257
261
  {
258
262
  if (tvw->tcnt >= tvw->tsize) {
259
263
  tvw->tsize *= 2;
260
- if (tvw->tsize < TERM_ARR_START_SIZE)
264
+ if (tvw->tsize < TERM_ARR_START_SIZE) {
261
265
  tvw->tsize = TERM_ARR_START_SIZE;
266
+ }
267
+
262
268
  REALLOC_N(tvw->terms, TVTerm *, tvw->tsize);
263
269
  }
264
270
  tvw->terms[tvw->tcnt] = tvt_create(text, freq, positions, offsets);
@@ -267,31 +273,36 @@ void tvw_add_term(TermVectorsWriter *tvw,
267
273
 
268
274
  void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors)
269
275
  {
270
- tvw_open_doc(tvw);
271
-
272
276
  int i, j, store_positions, store_offsets;
273
277
  TermVector *tv;
278
+
279
+ tvw_open_doc(tvw);
280
+
274
281
  for (i = 0; i < vectors->size; i++) {
275
282
  tv = vectors->elems[i];
276
283
 
277
284
  store_positions = (tv->tcnt > 0 && tv->positions != NULL);
278
285
  store_offsets = (tv->tcnt > 0 && tv->offsets != NULL);
279
286
 
280
- tvw_create_field(tvw, fis_get_number(tvw->fis, tv->field),
287
+ tvw_create_field(tvw, (int)fis_get_number(tvw->fis, tv->field),
281
288
  store_positions, store_offsets);
282
289
 
283
290
  if (store_positions && store_offsets) {
284
- for (j = 0; j < tv->tcnt; j++)
291
+ for (j = 0; j < tv->tcnt; j++) {
285
292
  tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], tv->offsets[j]);
293
+ }
286
294
  } else if (store_positions) {
287
- for (j = 0; j < tv->tcnt; j++)
295
+ for (j = 0; j < tv->tcnt; j++) {
288
296
  tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], NULL);
297
+ }
289
298
  } else if (store_offsets) {
290
- for (j = 0; j < tv->tcnt; j++)
299
+ for (j = 0; j < tv->tcnt; j++) {
291
300
  tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, tv->offsets[j]);
301
+ }
292
302
  } else {
293
- for (j = 0; j < tv->tcnt; j++)
303
+ for (j = 0; j < tv->tcnt; j++) {
294
304
  tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, NULL);
305
+ }
295
306
  }
296
307
  tvw_close_field(tvw);
297
308
  }
@@ -333,10 +344,9 @@ TermVector *tv_create(
333
344
  return tv;
334
345
  }
335
346
 
336
- void tv_destroy(void *p)
347
+ void tv_destroy(TermVector *tv)
337
348
  {
338
349
  int i, j;
339
- TermVector *tv = (TermVector *)p;
340
350
  for (i = 0; i < tv->tcnt; i++) {
341
351
  free(tv->terms[i]);
342
352
  }
@@ -357,12 +367,11 @@ void tv_destroy(void *p)
357
367
  free(tv->offsets);
358
368
  }
359
369
  free(tv->freqs);
360
- free(p);
370
+ free(tv);
361
371
  }
362
372
 
363
- void tv_destroy_except_data(void *p)
373
+ void tv_destroy_except_data(TermVector *tv)
364
374
  {
365
- TermVector *tv = (TermVector *)p;
366
375
  free(tv->terms);
367
376
  if (tv->positions != NULL) {
368
377
  free(tv->positions);
@@ -371,23 +380,23 @@ void tv_destroy_except_data(void *p)
371
380
  free(tv->offsets);
372
381
  }
373
382
  free(tv->freqs);
374
- free(p);
383
+ free(tv);
375
384
  }
376
385
 
377
386
  int tvr_check_valid_format(InStream *is)
378
387
  {
379
388
  int format = is_read_int(is);
380
389
  if (format > FORMAT_VERSION)
381
- RAISE(ERROR, FORMAT_VERSION_ERROR_MSG);
390
+ RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
382
391
  return format;
383
392
  }
384
393
 
385
394
  TermVectorsReader *tvr_clone(TermVectorsReader *orig)
386
395
  {
387
396
  TermVectorsReader *clone = NULL;
397
+ clone = ALLOC(TermVectorsReader);
398
+ memcpy(clone, orig, sizeof(TermVectorsReader));
388
399
  if (orig->tvx && orig->tvd && orig->tvf) {
389
- clone = ALLOC(TermVectorsReader);
390
- memcpy(clone, orig, sizeof(TermVectorsReader));
391
400
  clone->tvx = is_clone(orig->tvx);
392
401
  clone->tvd = is_clone(orig->tvd);
393
402
  clone->tvf = is_clone(orig->tvf);
@@ -400,23 +409,30 @@ TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis)
400
409
  TermVectorsReader *tvr = ALLOC(TermVectorsReader);
401
410
  // Open files for TermVector storage
402
411
  char fname[SEGMENT_NAME_MAX_LENGTH];
403
- int segment_len = strlen(segment);
412
+ size_t segment_len = strlen(segment);
413
+ InStream *is;
414
+
404
415
  strcpy(fname, segment);
405
416
 
406
417
  strcpy(fname + segment_len, TVX_EXTENSION);
407
- InStream *is = tvr->tvx = store->open_input(store, fname);
408
- tvr_check_valid_format(is);
409
- tvr->size = is_length(is)/8;
418
+ if (!store->exists(store, fname)) {
419
+ tvr->tvx = tvr->tvd = tvr->tvf = NULL;
420
+ tvr->size = 0;
421
+ } else {
422
+ is = tvr->tvx = store->open_input(store, fname);
423
+ tvr_check_valid_format(is);
424
+ tvr->size = is_length(is)/8;
410
425
 
411
- strcpy(fname + segment_len, TVD_EXTENSION);
412
- is = tvr->tvd = store->open_input(store, fname);
413
- tvr->tvd_format = tvr_check_valid_format(is);
426
+ strcpy(fname + segment_len, TVD_EXTENSION);
427
+ is = tvr->tvd = store->open_input(store, fname);
428
+ tvr->tvd_format = tvr_check_valid_format(is);
414
429
 
415
- strcpy(fname + segment_len, TVF_EXTENSION);
416
- is = tvr->tvf = store->open_input(store, fname);
417
- tvr->tvf_format = tvr_check_valid_format(is);
430
+ strcpy(fname + segment_len, TVF_EXTENSION);
431
+ is = tvr->tvf = store->open_input(store, fname);
432
+ tvr->tvf_format = tvr_check_valid_format(is);
418
433
 
419
- tvr->fis = fis;
434
+ tvr->fis = fis;
435
+ }
420
436
  return tvr;
421
437
  }
422
438
 
@@ -426,9 +442,15 @@ void tvr_close(TermVectorsReader *tvr)
426
442
  * exception, everything else will also be closed. */
427
443
  TRY
428
444
  XFINALLY
429
- is_close(tvr->tvx);
430
- is_close(tvr->tvd);
431
- is_close(tvr->tvf);
445
+ if (tvr->tvx) {
446
+ is_close(tvr->tvx);
447
+ }
448
+ if (tvr->tvd) {
449
+ is_close(tvr->tvd);
450
+ }
451
+ if (tvr->tvf) {
452
+ is_close(tvr->tvf);
453
+ }
432
454
  free(tvr);
433
455
  XENDTRY
434
456
  }
@@ -436,17 +458,29 @@ void tvr_close(TermVectorsReader *tvr)
436
458
  TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
437
459
  char *field, int tvf_pointer)
438
460
  {
439
- int i, j, store_positions, store_offsets, bits;
440
- // Now read the data from specified position
441
- // We don't need to offset by the FORMAT here since the pointer
442
- // already includes the offset
461
+ int i, j, store_positions, store_offsets, bits, num_terms;
462
+ char **terms;
463
+ int *term_freqs;
464
+
465
+ /* we may not need these, but declare them */
466
+ int **positions = NULL;
467
+ TVOffsetInfo ***offsets = NULL;
468
+ int start, delta_length, total_length, freq, prev_pos;
469
+ int start_offset, end_offset, prev_offset;
470
+ int *pos;
471
+ TVOffsetInfo **offs;
472
+ char buffer[MAX_WORD_SIZE] = "";
473
+
474
+ /* Now read the data from specified position. We don't need to offset
475
+ * by the FORMAT here since the pointer already includes the offset */
443
476
  is_seek(tvr->tvf, tvf_pointer);
444
-
445
- int num_terms = is_read_vint(tvr->tvf);
446
- // If no terms - return a constant empty termvector. However, this should
447
- // never occur!
448
- if (num_terms == 0)
477
+ num_terms = (int)is_read_vint(tvr->tvf);
478
+
479
+ /* If no terms - return a constant empty termvector. However, this should
480
+ * never occur! */
481
+ if (num_terms == 0) {
449
482
  return tv_create(field, NULL, 0, NULL, NULL, NULL);
483
+ }
450
484
 
451
485
  if(tvr->tvf_format == FORMAT_VERSION) {
452
486
  bits = is_read_byte(tvr->tvf);
@@ -458,41 +492,34 @@ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
458
492
  store_offsets = false;
459
493
  }
460
494
 
461
- char **terms = ALLOC_N(char *, num_terms);
462
- int *term_freqs = ALLOC_N(int, num_terms);
463
-
464
- // we may not need these, but declare them
465
- int **positions = NULL;
466
- TVOffsetInfo ***offsets = NULL;
495
+ terms = ALLOC_N(char *, num_terms);
496
+ term_freqs = ALLOC_N(int, num_terms);
467
497
 
468
- if(store_positions)
498
+ if (store_positions) {
469
499
  positions = ALLOC_N(int *, num_terms);
500
+ }
470
501
 
471
- if(store_offsets)
502
+ if (store_offsets) {
472
503
  offsets = ALLOC_N(TVOffsetInfo **, num_terms);
504
+ }
473
505
 
474
- int start, delta_length, total_length, freq, prev_pos;
475
- int start_offset, end_offset, prev_offset;
476
- int *pos;
477
- TVOffsetInfo **offs;
478
- char buffer[MAX_WORD_SIZE] = "";
479
506
 
480
507
  for (i = 0; i < num_terms; i++) {
481
- start = is_read_vint(tvr->tvf);
482
- delta_length = is_read_vint(tvr->tvf);
508
+ start = (int)is_read_vint(tvr->tvf);
509
+ delta_length = (int)is_read_vint(tvr->tvf);
483
510
  total_length = start + delta_length;
484
511
  is_read_chars(tvr->tvf, buffer, start, delta_length);
485
512
  buffer[total_length] = '\0';
486
513
  terms[i] = estrdup(buffer);
487
- freq = is_read_vint(tvr->tvf);
514
+ freq = (int)is_read_vint(tvr->tvf);
488
515
  term_freqs[i] = freq;
489
516
 
490
- if (store_positions) {//read in the positions
517
+ if (store_positions) {/* read in the positions */
491
518
  pos = ALLOC_N(int, freq);
492
519
  positions[i] = pos;
493
520
  prev_pos = 0;
494
521
  for (j = 0; j < freq; j++) {
495
- pos[j] = prev_pos + is_read_vint(tvr->tvf);
522
+ pos[j] = prev_pos + (int)is_read_vint(tvr->tvf);
496
523
  prev_pos = pos[j];
497
524
  }
498
525
  }
@@ -502,8 +529,8 @@ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
502
529
  offsets[i] = offs;
503
530
  prev_offset = 0;
504
531
  for (j = 0; j < freq; j++) {
505
- start_offset = prev_offset + is_read_vint(tvr->tvf);
506
- end_offset = start_offset + is_read_vint(tvr->tvf);
532
+ start_offset = prev_offset + (int)is_read_vint(tvr->tvf);
533
+ end_offset = start_offset + (int)is_read_vint(tvr->tvf);
507
534
  offs[j] = tvoi_create(start_offset, end_offset);
508
535
  prev_offset = end_offset;
509
536
  }
@@ -516,39 +543,41 @@ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
516
543
  {
517
544
  int i;
518
545
  Array *tvs = NULL;
519
- // Check if no term vectors are available for this segment at all
546
+ /* Check if no term vectors are available for this segment at all */
520
547
  if (tvr->tvx != NULL) {
521
- // We need to offset by
548
+ int position, field_count;
549
+ /* We need to offset by */
522
550
  is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
523
551
 
524
- int position = is_read_long(tvr->tvx);
552
+ position = (int)is_read_long(tvr->tvx);
525
553
 
526
554
  is_seek(tvr->tvd, position);
527
- int field_count = is_read_vint(tvr->tvd);
555
+ field_count = (int)is_read_vint(tvr->tvd);
528
556
 
529
- // No fields are vectorized for this document
557
+ /* No fields are vectorized for this document */
530
558
  if (field_count > 0) {
531
559
  int number = 0;
560
+ int position = 0;
561
+ int *tvf_pointers = ALLOC_N(int, field_count);
532
562
  char **fields = ALLOC_N(char *, field_count);
533
563
 
534
564
  for (i = 0; i < field_count; i++) {
535
- if (tvr->tvd_format == FORMAT_VERSION)
536
- number = is_read_vint(tvr->tvd);
537
- else
538
- number += is_read_vint(tvr->tvd);
565
+ if (tvr->tvd_format == FORMAT_VERSION) {
566
+ number = (int)is_read_vint(tvr->tvd);
567
+ } else {
568
+ number += (int)is_read_vint(tvr->tvd);
569
+ }
539
570
 
540
571
  fields[i] = tvr->fis->by_number[number]->name;
541
572
  }
542
573
 
543
- // Compute position in the tvf file
544
- int position = 0;
545
- int *tvf_pointers = ALLOC_N(int, field_count);
574
+ /* Compute position in the tvf file */
546
575
  for (i = 0; i < field_count; i++) {
547
- position += is_read_vint(tvr->tvd);
576
+ position += (int)is_read_vint(tvr->tvd);
548
577
  tvf_pointers[i] = position;
549
578
  }
550
579
 
551
- tvs = ary_create(field_count, &tv_destroy);
580
+ tvs = ary_create(field_count, (free_ft)&tv_destroy);
552
581
  for (i = 0; i < field_count; i++) {
553
582
  ary_append(tvs, tvr_read_term_vector(tvr, fields[i], tvf_pointers[i]));
554
583
  }
@@ -562,45 +591,45 @@ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
562
591
  TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field)
563
592
  {
564
593
  int i;
565
- // Check if no term vectors are available for this segment at all
566
- int field_number = fis_get_number(tvr->fis, field);
594
+ /* Check if no term vectors are available for this segment at all */
595
+ int field_number = (int)fis_get_number(tvr->fis, field);
567
596
  TermVector *tv = NULL;
568
597
 
569
598
  if (tvr->tvx != NULL) {
570
- // We need to account for the FORMAT_SIZE at when seeking in the @tvx
571
- // We don't need to do this in other seeks because we already have the
572
- // file pointer that was written in another file
599
+ int pos, field_count, number = 0, found = -1;
600
+ /* We need to account for the FORMAT_SIZE at when seeking in the @tvx
601
+ * We don't need to do this in other seeks because we already have the
602
+ * file pointer that was written in another file */
573
603
  is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
574
- // puts("TVX Pointer: " + @tvx.pos())
575
- int pos = is_read_long(tvr->tvx);
604
+ //printf("TVX Pointer: %d\n", is_pos(tvr->tvx));
605
+ pos = (int)is_read_long(tvr->tvx);
576
606
 
577
607
  is_seek(tvr->tvd, pos);
578
- int field_count = is_read_vint(tvr->tvd);
579
- //puts("Num Fields: " + field_count)
580
- // There are only a few fields per document. We opt for a full scan
581
- // rather then requiring that they be ordered. We need to read through
582
- // all of the fields anyway to get to the tvf pointers.
583
- int number = 0;
584
- int found = -1;
585
-
608
+ field_count = (int)is_read_vint(tvr->tvd);
609
+ //printf("Num Fields: %d\n", field_count);
610
+ /* There are only a few fields per document. We opt for a full scan
611
+ * rather then requiring that they be ordered. We need to read through
612
+ * all of the fields anyway to get to the tvf pointers. */
586
613
  for (i = 0; i < field_count; i++) {
587
- if (tvr->tvd_format == FORMAT_VERSION)
588
- number = is_read_vint(tvr->tvd);
589
- else
590
- number += is_read_vint(tvr->tvd);
614
+ if (tvr->tvd_format == FORMAT_VERSION) {
615
+ number = (int)is_read_vint(tvr->tvd);
616
+ } else {
617
+ number += (int)is_read_vint(tvr->tvd);
618
+ }
591
619
 
592
- if (number == field_number)
620
+ if (number == field_number) {
593
621
  found = i;
622
+ }
594
623
  }
595
624
 
596
- // This field, although valid in the segment, was not found in this
597
- // document
625
+ /* This field, although valid in the segment, was not found in this
626
+ * document */
598
627
  if (found != -1) {
599
- // Compute pos in the @tvf file
628
+ /* Compute pos in the tvf file */
600
629
  pos = 0;
601
- for (i = 0; i <= found; i++)
602
- pos += is_read_vint(tvr->tvd);
603
-
630
+ for (i = 0; i <= found; i++) {
631
+ pos += (int)is_read_vint(tvr->tvd);
632
+ }
604
633
  tv = tvr_read_term_vector(tvr, field, pos);
605
634
  }
606
635
  }