ferret 0.9.1 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (105)
  1. data/README +6 -5
  2. data/Rakefile +34 -13
  3. data/TODO +1 -0
  4. data/TUTORIAL +1 -1
  5. data/ext/analysis.c +87 -70
  6. data/ext/analysis.h +18 -6
  7. data/ext/array.c +1 -2
  8. data/ext/array.h +1 -1
  9. data/ext/bitvector.c +10 -6
  10. data/ext/bitvector.h +2 -2
  11. data/ext/compound_io.c +30 -27
  12. data/ext/document.c +15 -15
  13. data/ext/document.h +5 -5
  14. data/ext/except.c +2 -0
  15. data/ext/except.h +25 -23
  16. data/ext/extconf.rb +1 -0
  17. data/ext/ferret.c +10 -8
  18. data/ext/ferret.h +9 -8
  19. data/ext/field.c +29 -25
  20. data/ext/filter.c +52 -14
  21. data/ext/frtio.h +13 -0
  22. data/ext/fs_store.c +115 -170
  23. data/ext/global.c +9 -8
  24. data/ext/global.h +17 -13
  25. data/ext/hash.c +13 -19
  26. data/ext/hash.h +11 -11
  27. data/ext/hashset.c +5 -7
  28. data/ext/hashset.h +9 -8
  29. data/ext/helper.c +1 -1
  30. data/ext/helper.h +2 -1
  31. data/ext/inc/except.h +25 -23
  32. data/ext/inc/lang.h +11 -1
  33. data/ext/ind.c +33 -21
  34. data/ext/index.h +44 -39
  35. data/ext/index_io.c +61 -57
  36. data/ext/index_rw.c +418 -361
  37. data/ext/lang.c +10 -0
  38. data/ext/lang.h +11 -1
  39. data/ext/nix_io.c +135 -0
  40. data/ext/priorityqueue.c +16 -16
  41. data/ext/priorityqueue.h +9 -6
  42. data/ext/q_boolean.c +128 -76
  43. data/ext/q_const_score.c +20 -20
  44. data/ext/q_filtered_query.c +20 -20
  45. data/ext/q_fuzzy.c +37 -23
  46. data/ext/q_match_all.c +15 -19
  47. data/ext/q_multi_phrase.c +87 -46
  48. data/ext/q_parser.c +247 -119
  49. data/ext/q_phrase.c +86 -52
  50. data/ext/q_prefix.c +25 -14
  51. data/ext/q_range.c +59 -14
  52. data/ext/q_span.c +263 -172
  53. data/ext/q_term.c +62 -51
  54. data/ext/q_wildcard.c +24 -13
  55. data/ext/r_analysis.c +328 -80
  56. data/ext/r_doc.c +11 -6
  57. data/ext/r_index_io.c +40 -32
  58. data/ext/r_qparser.c +15 -14
  59. data/ext/r_search.c +270 -152
  60. data/ext/r_store.c +32 -17
  61. data/ext/ram_store.c +38 -22
  62. data/ext/search.c +617 -87
  63. data/ext/search.h +227 -163
  64. data/ext/similarity.c +54 -45
  65. data/ext/similarity.h +3 -3
  66. data/ext/sort.c +132 -53
  67. data/ext/store.c +21 -2
  68. data/ext/store.h +14 -14
  69. data/ext/tags +4322 -232
  70. data/ext/term.c +140 -109
  71. data/ext/termdocs.c +74 -60
  72. data/ext/vector.c +181 -152
  73. data/ext/w32_io.c +150 -0
  74. data/lib/ferret.rb +1 -1
  75. data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
  76. data/lib/ferret/document/field.rb +1 -1
  77. data/lib/ferret/index/field_infos.rb +1 -1
  78. data/lib/ferret/index/term.rb +1 -1
  79. data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
  80. data/lib/ferret/search.rb +1 -0
  81. data/lib/ferret/search/boolean_query.rb +0 -4
  82. data/lib/ferret/search/index_searcher.rb +21 -8
  83. data/lib/ferret/search/multi_phrase_query.rb +7 -0
  84. data/lib/ferret/search/multi_searcher.rb +261 -0
  85. data/lib/ferret/search/phrase_query.rb +1 -1
  86. data/lib/ferret/search/query.rb +34 -5
  87. data/lib/ferret/search/sort.rb +7 -3
  88. data/lib/ferret/search/sort_field.rb +8 -4
  89. data/lib/ferret/store/fs_store.rb +13 -6
  90. data/lib/ferret/store/index_io.rb +0 -14
  91. data/lib/ferret/store/ram_store.rb +3 -2
  92. data/lib/rferret.rb +1 -1
  93. data/test/unit/analysis/ctc_analyzer.rb +131 -0
  94. data/test/unit/analysis/ctc_tokenstream.rb +98 -9
  95. data/test/unit/index/tc_index.rb +40 -1
  96. data/test/unit/index/tc_term.rb +7 -0
  97. data/test/unit/index/th_doc.rb +8 -0
  98. data/test/unit/query_parser/tc_query_parser.rb +6 -4
  99. data/test/unit/search/rtc_sort_field.rb +6 -6
  100. data/test/unit/search/tc_index_searcher.rb +8 -0
  101. data/test/unit/search/tc_multi_searcher.rb +275 -0
  102. data/test/unit/search/tc_multi_searcher2.rb +126 -0
  103. data/test/unit/search/tc_search_and_sort.rb +66 -0
  104. metadata +31 -26
  105. data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/termdocs.c CHANGED
@@ -1,4 +1,4 @@
1
- #include <index.h>
1
+ #include "index.h"
2
2
  #include <string.h>
3
3
 
4
4
  static char * const TPE_VS_TDE_ERROR_MSG = "TermPosEnum does not handle processing multiple documents in one call. Use TermDocEnum instead.";
@@ -59,22 +59,24 @@ bool stde_next(TermDocEnum *tde)
59
59
  SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
60
60
  while (true) {
61
61
 
62
- if (stde->count >= stde->doc_freq)
62
+ if (stde->count >= stde->doc_freq) {
63
63
  return false;
64
+ }
64
65
 
65
- doc_code = is_read_vint(stde->freq_in);
66
- stde->doc_num += doc_code >> 1; // shift off low bit
67
- if ((doc_code & 1) != 0) { // if low bit is set
68
- stde->freq = 1; // freq is one
66
+ doc_code = (int)is_read_vint(stde->freq_in);
67
+ stde->doc_num += doc_code >> 1; /* shift off low bit */
68
+ if ((doc_code & 1) != 0) { /* if low bit is set */
69
+ stde->freq = 1; /* freq is one */
69
70
  } else {
70
- stde->freq = is_read_vint(stde->freq_in); // else read freq
71
+ stde->freq = (int)is_read_vint(stde->freq_in); /* read freq */
71
72
  }
72
73
 
73
74
  stde->count++;
74
75
 
75
76
  if (stde->deleted_docs == NULL ||
76
- bv_get(stde->deleted_docs, stde->doc_num) == 0)
77
- break; // We found an undeleted doc so return
77
+ bv_get(stde->deleted_docs, stde->doc_num) == 0) {
78
+ break; /* We found an undeleted doc so return */
79
+ }
78
80
 
79
81
  stde->skip_prox(stde);
80
82
  }
@@ -90,41 +92,49 @@ int stde_freq(TermDocEnum *tde)
90
92
  bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
91
93
  {
92
94
  SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
93
- if (stde->doc_freq >= stde->skip_interval) { // optimized case
94
95
 
95
- if (stde->skip_in == NULL)
96
- stde->skip_in = is_clone(stde->freq_in); // lazily clone
96
+ if (stde->doc_freq >= stde->skip_interval) { /* optimized case */
97
+ int last_skip_doc;
98
+ int last_freq_pointer;
99
+ int last_prox_pointer;
100
+ int num_skipped;
101
+
102
+ if (stde->skip_in == NULL) {
103
+ stde->skip_in = is_clone(stde->freq_in); /* lazily clone */
104
+ }
97
105
 
98
- if (!stde->have_skipped) { // lazily seek skip stream
106
+ if (!stde->have_skipped) { /* lazily seek skip stream */
99
107
  is_seek(stde->skip_in, stde->skip_pointer);
100
108
  stde->have_skipped = true;
101
109
  }
102
110
 
103
- // scan skip data
104
- int last_skip_doc = stde->skip_doc;
105
- int last_freq_pointer = is_pos(stde->freq_in);
106
- int last_prox_pointer = -1;
107
- int num_skipped = -1 - (stde->count % stde->skip_interval);
111
+ /* scan skip data */
112
+ last_skip_doc = stde->skip_doc;
113
+ last_freq_pointer = is_pos(stde->freq_in);
114
+ last_prox_pointer = -1;
115
+ num_skipped = -1 - (stde->count % stde->skip_interval);
108
116
 
109
117
  while (target_doc_num > stde->skip_doc) {
110
118
  last_skip_doc = stde->skip_doc;
111
119
  last_freq_pointer = stde->freq_pointer;
112
120
  last_prox_pointer = stde->prox_pointer;
113
121
 
114
- if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num)
122
+ if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num) {
115
123
  num_skipped += stde->skip_interval;
124
+ }
116
125
 
117
- if(stde->skip_count >= stde->num_skips)
126
+ if(stde->skip_count >= stde->num_skips) {
118
127
  break;
128
+ }
119
129
 
120
- stde->skip_doc += is_read_vint(stde->skip_in);
121
- stde->freq_pointer += is_read_vint(stde->skip_in);
122
- stde->prox_pointer += is_read_vint(stde->skip_in);
130
+ stde->skip_doc += (int)is_read_vint(stde->skip_in);
131
+ stde->freq_pointer += (int)is_read_vint(stde->skip_in);
132
+ stde->prox_pointer += (int)is_read_vint(stde->skip_in);
123
133
 
124
134
  stde->skip_count++;
125
135
  }
126
136
 
127
- // if we found something to skip, so skip it
137
+ /* if we found something to skip, so skip it */
128
138
  if (last_freq_pointer > is_pos(stde->freq_in)) {
129
139
  is_seek(stde->freq_in, last_freq_pointer);
130
140
  stde->seek_prox(stde, last_prox_pointer);
@@ -134,7 +144,7 @@ bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
134
144
  }
135
145
  }
136
146
 
137
- // done skipping, now just scan
147
+ /* done skipping, now just scan */
138
148
  do {
139
149
  if (! tde->next(tde)) {
140
150
  return false;
@@ -148,13 +158,14 @@ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
148
158
  SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
149
159
  int i = 0, doc_code;
150
160
  while (i < req_num && stde->count < stde->doc_freq) {
151
- // manually inlined call to next() for speed
152
- doc_code = is_read_vint(stde->freq_in);
153
- stde->doc_num += doc_code >> 1; // shift off low bit
154
- if ((doc_code & 1) != 0) // if low bit is set
155
- stde->freq = 1; // freq is one
156
- else
157
- stde->freq = is_read_vint(stde->freq_in); // else read freq
161
+ /* manually inlined call to next() for speed */
162
+ doc_code = (int)is_read_vint(stde->freq_in);
163
+ stde->doc_num += (doc_code >> 1); /* shift off low bit */
164
+ if ((doc_code & 1) != 0) { /* if low bit is set */
165
+ stde->freq = 1; /* freq is one */
166
+ } else {
167
+ stde->freq = (int)is_read_vint(stde->freq_in); /* else read freq */
168
+ }
158
169
 
159
170
  stde->count++;
160
171
 
@@ -170,6 +181,7 @@ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
170
181
 
171
182
  TermDocEnum *stde_create(IndexReader *ir)
172
183
  {
184
+ SegmentTermDocEnum *stde = ALLOC_AND_ZERO(SegmentTermDocEnum);
173
185
  SegmentReader *sr = (SegmentReader *)ir->data;
174
186
  TermDocEnum *tde = ALLOC(TermDocEnum);
175
187
  tde->seek = &stde_seek;
@@ -181,8 +193,6 @@ TermDocEnum *stde_create(IndexReader *ir)
181
193
  tde->next_position = NULL;
182
194
  tde->close = &stde_close;
183
195
 
184
- SegmentTermDocEnum *stde = ALLOC(SegmentTermDocEnum);
185
- ZEROSET(stde, SegmentTermDocEnum, 1); // set all values to 0
186
196
  tde->data = stde;
187
197
  stde->parent = sr;
188
198
  stde->freq_in = is_clone(sr->freq_in);
@@ -260,11 +270,12 @@ int stpe_next_position(TermDocEnum *tde)
260
270
  {
261
271
  SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
262
272
  stde->prox_cnt--;
263
- return stde->position += is_read_vint(stde->prox_in);
273
+ return stde->position += (int)is_read_vint(stde->prox_in);
264
274
  }
265
275
 
266
276
  TermDocEnum *stpe_create(IndexReader *ir)
267
277
  {
278
+ SegmentTermDocEnum *stde;
268
279
  SegmentReader *sr = (SegmentReader *)ir->data;
269
280
  TermDocEnum *tde = stde_create(ir);
270
281
  tde->close = &stpe_close;
@@ -273,7 +284,7 @@ TermDocEnum *stpe_create(IndexReader *ir)
273
284
  tde->read = &stpe_read;
274
285
  tde->next_position = &stpe_next_position;
275
286
 
276
- SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
287
+ stde = (SegmentTermDocEnum *)tde->data;
277
288
  stde->prox_in = is_clone(sr->prox_in);
278
289
  stde->prox_cnt = 0;
279
290
  stde->position = 0;
@@ -321,16 +332,18 @@ TermDocEnum *mtde_term_docs_from_reader(IndexReader *ir)
321
332
 
322
333
  TermDocEnum *mtde_term_docs(MultiTermDocEnum *mtde, int i)
323
334
  {
324
- if (mtde->term == NULL)
335
+ if (mtde->term == NULL) {
325
336
  return NULL;
337
+ } else {
326
338
 
327
- TermDocEnum *tde = mtde->irs_tde[i];
328
- if (tde == NULL) {
329
- tde = mtde->irs_tde[i] = mtde->term_docs_from_reader(mtde->irs[i]);
330
- }
339
+ TermDocEnum *tde = mtde->irs_tde[i];
340
+ if (tde == NULL) {
341
+ tde = mtde->irs_tde[i] = mtde->term_docs_from_reader(mtde->irs[i]);
342
+ }
331
343
 
332
- tde->seek(tde, mtde->term);
333
- return tde;
344
+ tde->seek(tde, mtde->term);
345
+ return tde;
346
+ }
334
347
  }
335
348
 
336
349
  bool mtde_next(TermDocEnum *tde)
@@ -411,6 +424,7 @@ int mtde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num)
411
424
 
412
425
  TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
413
426
  {
427
+ MultiTermDocEnum *mtde = ALLOC_AND_ZERO(MultiTermDocEnum);
414
428
  TermDocEnum *tde = ALLOC(TermDocEnum);
415
429
  tde->close = &mtde_close;
416
430
  tde->seek = &mtde_seek;
@@ -421,8 +435,6 @@ TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
421
435
  tde->read = &mtde_read;
422
436
  tde->next_position = NULL;
423
437
 
424
- MultiTermDocEnum *mtde = ALLOC(MultiTermDocEnum);
425
- ZEROSET(mtde, MultiTermDocEnum, 1); // set all values to 0
426
438
  tde->data = mtde;
427
439
  mtde->irs = irs;
428
440
  mtde->starts = starts;
@@ -467,8 +479,7 @@ TermDocEnum *mtpe_create(IndexReader **irs, int *starts, int ir_cnt)
467
479
  ****************************************************************************/
468
480
 
469
481
  #define GET_MTDPE MultipleTermDocPosEnum *mtdpe = (MultipleTermDocPosEnum *)self->data
470
- void tde_destroy(void *p) {
471
- TermDocEnum *self = (TermDocEnum *)p;
482
+ void tde_destroy(TermDocEnum *self) {
472
483
  self->close(self);
473
484
  }
474
485
 
@@ -570,12 +581,26 @@ int mtdpe_next_position(TermDocEnum *self)
570
581
 
571
582
  TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
572
583
  {
584
+ int i;
573
585
  TermDocEnum *self = ALLOC(TermDocEnum);
574
- MultipleTermDocPosEnum *mtdpe = ALLOC(MultipleTermDocPosEnum);
586
+ MultipleTermDocPosEnum *mtdpe = ALLOC_AND_ZERO_N(MultipleTermDocPosEnum, 1);
575
587
  PriorityQueue *pq;
576
588
  TermDocEnum *tpe;
577
- int i;
578
589
 
590
+ pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
591
+ mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
592
+ mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
593
+ for (i = 0; i < t_cnt; i++) {
594
+ tpe = ir_term_positions_for(ir, terms[i]);
595
+ if (tpe->next(tpe)) {
596
+ pq_push(pq, tpe);
597
+ } else {
598
+ tpe->close(tpe);
599
+ }
600
+ }
601
+ pq->free_elem = (free_ft)&tde_destroy;
602
+
603
+ self->data = mtdpe;
579
604
  self->close = &mtdpe_close;
580
605
  self->seek = &mtdpe_seek;
581
606
  self->next = &mtdpe_next;
@@ -585,17 +610,6 @@ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
585
610
  self->read = &mtdpe_read;
586
611
  self->next_position = &mtdpe_next_position;
587
612
 
588
- ZEROSET(mtdpe, MultipleTermDocPosEnum, 1); // set all values to 0
589
- self->data = mtdpe;
590
- pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
591
- mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
592
- mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
593
- for (i = 0; i < t_cnt; i++) {
594
- tpe = ir_term_positions_for(ir, terms[i]);
595
- if (tpe->next(tpe)) pq_push(pq, tpe);
596
- }
597
- pq->free_elem = &tde_destroy;
598
-
599
613
  return self;
600
614
  }
601
615
 
data/ext/vector.c CHANGED
@@ -1,6 +1,6 @@
1
- #include <index.h>
1
+ #include "index.h"
2
+ #include "helper.h"
2
3
  #include <string.h>
3
- #include <helper.h>
4
4
 
5
5
  static char * const NULL_POS_ERROR_MSG = "Trying to write positions that are null!";
6
6
  static char * const NULL_OFFSETS_ERROR_MSG = "Trying to write offsets that are null!";
@@ -50,33 +50,25 @@ TVTerm *tvt_create(char *text, int freq, int *positions, TVOffsetInfo **offsets)
50
50
 
51
51
  void tvt_destroy(void *p)
52
52
  {
53
- //int i;
54
- //TVTerm *tvt = (TVTerm *)p;
55
- //free(tvt->text);
56
- //free(tvt->positions);
57
- //if (tvt->offsets != NULL) {
58
- // for (i = 0; i < tvt->freq; i++) {
59
- // tvoi_destroy(tvt->offsets[i]);
60
- // }
61
- // free(tvt->offsets);
62
- //}
63
53
  free(p);
64
54
  }
65
55
 
66
56
 
67
57
  TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis)
68
58
  {
59
+ char fname[SEGMENT_NAME_MAX_LENGTH];
60
+ size_t segment_len = strlen(segment);
69
61
  TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
62
+ OutStream *os;
63
+
70
64
  tvw->curr_field = NULL;
71
65
  tvw->curr_doc_pointer = -1;
72
66
 
73
- // Open files for TermVector storage
74
- char fname[SEGMENT_NAME_MAX_LENGTH];
75
- int segment_len = strlen(segment);
67
+ /* Open files for TermVector storage */
76
68
  strcpy(fname, segment);
77
69
 
78
70
  strcpy(fname + segment_len, TVX_EXTENSION);
79
- OutStream *os = tvw->tvx = store->create_output(store, fname);
71
+ os = tvw->tvx = store->create_output(store, fname);
80
72
  os_write_int(os, FORMAT_VERSION);
81
73
 
82
74
  strcpy(fname + segment_len, TVD_EXTENSION);
@@ -104,43 +96,48 @@ void tvw_write_field(TermVectorsWriter *tvw)
104
96
  int i, j, start, length;
105
97
  char *last_term_text;
106
98
  TVOffsetInfo *tmp_offset;
107
- // remember where this field is written
99
+ TVTerm **terms = tvw->terms;
100
+ TVTerm *term;
101
+ /* remember where this field is written */
108
102
  OutStream *tvf = tvw->tvf;
103
+ int store_positions = tvw->curr_field->store_positions;
104
+ int store_offsets = tvw->curr_field->store_offsets;
105
+ uchar bits = 0x0;
106
+
109
107
  tvw->curr_field->tvf_pointer = os_pos(tvf);
110
108
 
111
- // write the number of terms
109
+ /* write the number of terms */
112
110
  os_write_vint(tvf, tvw->tcnt);
113
111
 
114
- int store_positions = tvw->curr_field->store_positions;
115
- int store_offsets = tvw->curr_field->store_offsets;
116
- int bits = 0x0;
117
- if (store_positions)
112
+ if (store_positions) {
118
113
  bits |= STORE_POSITIONS_WITH_TERMVECTOR;
114
+ }
119
115
 
120
- if (store_offsets)
116
+ if (store_offsets) {
121
117
  bits |= STORE_OFFSET_WITH_TERMVECTOR;
118
+ }
122
119
 
123
- os_write_byte(tvf, bits);
120
+ os_write_byte(tvf, (uchar)bits);
124
121
 
125
122
  last_term_text = (char *)EMPTY_STRING;
126
- TVTerm **terms = tvw->terms;
127
- TVTerm *term;
128
123
  for (i = 0; i < tvw->tcnt; i++) {
129
124
  term = terms[i];
130
125
  start = hlp_string_diff(last_term_text, term->text);
131
- length = strlen(term->text) - start;
132
- os_write_vint(tvf, start); // write shared prefix length
133
- os_write_vint(tvf, length); // write delta length
134
- os_write_chars(tvf, term->text, start, length); // write delta chars
126
+ length = (int)strlen(term->text) - start;
127
+ os_write_vint(tvf, start); /* write shared prefix length */
128
+ os_write_vint(tvf, length); /* write delta length */
129
+ os_write_chars(tvf, term->text, start, length); /* write delta chars */
135
130
  os_write_vint(tvf, term->freq);
136
131
  last_term_text = term->text;
137
132
 
138
133
  if (store_positions) {
139
- if (term->positions == NULL)
140
- RAISE(IO_ERROR, NULL_POS_ERROR_MSG);
141
-
142
- // use delta encoding for positions
143
134
  int last_pos = 0;
135
+
136
+ if (term->positions == NULL) {
137
+ RAISE(IO_ERROR, NULL_POS_ERROR_MSG);
138
+ }
139
+
140
+ /* use delta encoding for positions */
144
141
  for (j = 0; j < term->freq; j++) {
145
142
  os_write_vint(tvf, term->positions[j] - last_pos);
146
143
  last_pos = term->positions[j];
@@ -148,16 +145,18 @@ void tvw_write_field(TermVectorsWriter *tvw)
148
145
  }
149
146
 
150
147
  if (store_offsets) {
151
- if (term->offsets == NULL)
152
- RAISE(IO_ERROR, NULL_OFFSETS_ERROR_MSG);
153
-
154
- // use delta encoding for offsets
155
148
  int last_end = 0;
149
+
150
+ if (term->offsets == NULL) {
151
+ RAISE(IO_ERROR, NULL_OFFSETS_ERROR_MSG);
152
+ }
153
+
154
+ /* use delta encoding for offsets */
156
155
  for (j = 0; j < term->freq; j++) {
157
156
  tmp_offset = term->offsets[j];
158
157
  os_write_vint(tvf, tmp_offset->start - last_end);
159
158
 
160
- // save the diff between the two.
159
+ /* save the diff between the two */
161
160
  os_write_vint(tvf, tmp_offset->end - tmp_offset->start);
162
161
  last_end = tmp_offset->end;
163
162
  }
@@ -169,13 +168,14 @@ void tvw_close_field(TermVectorsWriter *tvw)
169
168
  {
170
169
  int i;
171
170
  if (tvw->curr_field != NULL) {
172
- // save field and terms
171
+ /* save field and terms */
173
172
  tvw_write_field(tvw);
174
173
 
175
174
  if (tvw->fcnt >= tvw->fsize) {
176
175
  tvw->fsize *=2;
177
- if (tvw->fsize < FIELD_ARR_START_SIZE)
176
+ if (tvw->fsize < FIELD_ARR_START_SIZE) {
178
177
  tvw->fsize = FIELD_ARR_START_SIZE;
178
+ }
179
179
  REALLOC_N(tvw->fields, TVField *, tvw->fsize);
180
180
  }
181
181
  tvw->fields[tvw->fcnt] = tvw->curr_field;
@@ -205,30 +205,34 @@ void tvw_open_field(TermVectorsWriter *tvw, char *field)
205
205
 
206
206
  void tvw_write_doc(TermVectorsWriter *tvw)
207
207
  {
208
- if (tvw->curr_field != NULL)
208
+ OutStream *tvd = tvw->tvd;
209
+ int i;
210
+ TVField **fields = tvw->fields;
211
+ int last_field_pointer = 0;
212
+
213
+ if (tvw->curr_field != NULL) {
209
214
  RAISE(STATE_ERROR, FIELD_OPEN_ERROR_MSG);
215
+ }
210
216
 
211
- // puts("Writing doc pointer: " + @curr_doc_pointer)
212
- // write document index record
217
+ //printf("Writing doc pointer: %d\n", tvw->curr_doc_pointer);
218
+ /* write document index record */
213
219
  os_write_long(tvw->tvx, tvw->curr_doc_pointer);
214
220
 
215
- OutStream *tvd = tvw->tvd;
216
- // write the number of @fields
221
+ //printf("Writing field count: %ld, %d, %d -> ", (long long)tvw, tvw->fcnt, os_pos(tvd));
222
+ /* write the number of @fields */
217
223
  os_write_vint(tvd, tvw->fcnt);
218
-
219
- // write field numbers
220
- int i;
221
- TVField **fields = tvw->fields;
224
+
225
+ /* write field numbers */
222
226
  for (i = 0; i < tvw->fcnt; i++) {
223
227
  os_write_vint(tvd, fields[i]->number);
224
228
  }
225
229
 
226
- // write field pointers
227
- int last_field_pointer = 0;
230
+ /* write field pointers */
228
231
  for (i = 0; i < tvw->fcnt; i++) {
229
232
  os_write_vint(tvd, fields[i]->tvf_pointer - last_field_pointer);
230
233
  last_field_pointer = fields[i]->tvf_pointer;
231
234
  }
235
+ //printf("%d\n", os_pos(tvw->tvd));
232
236
  }
233
237
 
234
238
  void tvw_close_doc(TermVectorsWriter *tvw)
@@ -257,8 +261,10 @@ void tvw_add_term(TermVectorsWriter *tvw,
257
261
  {
258
262
  if (tvw->tcnt >= tvw->tsize) {
259
263
  tvw->tsize *= 2;
260
- if (tvw->tsize < TERM_ARR_START_SIZE)
264
+ if (tvw->tsize < TERM_ARR_START_SIZE) {
261
265
  tvw->tsize = TERM_ARR_START_SIZE;
266
+ }
267
+
262
268
  REALLOC_N(tvw->terms, TVTerm *, tvw->tsize);
263
269
  }
264
270
  tvw->terms[tvw->tcnt] = tvt_create(text, freq, positions, offsets);
@@ -267,31 +273,36 @@ void tvw_add_term(TermVectorsWriter *tvw,
267
273
 
268
274
  void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors)
269
275
  {
270
- tvw_open_doc(tvw);
271
-
272
276
  int i, j, store_positions, store_offsets;
273
277
  TermVector *tv;
278
+
279
+ tvw_open_doc(tvw);
280
+
274
281
  for (i = 0; i < vectors->size; i++) {
275
282
  tv = vectors->elems[i];
276
283
 
277
284
  store_positions = (tv->tcnt > 0 && tv->positions != NULL);
278
285
  store_offsets = (tv->tcnt > 0 && tv->offsets != NULL);
279
286
 
280
- tvw_create_field(tvw, fis_get_number(tvw->fis, tv->field),
287
+ tvw_create_field(tvw, (int)fis_get_number(tvw->fis, tv->field),
281
288
  store_positions, store_offsets);
282
289
 
283
290
  if (store_positions && store_offsets) {
284
- for (j = 0; j < tv->tcnt; j++)
291
+ for (j = 0; j < tv->tcnt; j++) {
285
292
  tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], tv->offsets[j]);
293
+ }
286
294
  } else if (store_positions) {
287
- for (j = 0; j < tv->tcnt; j++)
295
+ for (j = 0; j < tv->tcnt; j++) {
288
296
  tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], NULL);
297
+ }
289
298
  } else if (store_offsets) {
290
- for (j = 0; j < tv->tcnt; j++)
299
+ for (j = 0; j < tv->tcnt; j++) {
291
300
  tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, tv->offsets[j]);
301
+ }
292
302
  } else {
293
- for (j = 0; j < tv->tcnt; j++)
303
+ for (j = 0; j < tv->tcnt; j++) {
294
304
  tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, NULL);
305
+ }
295
306
  }
296
307
  tvw_close_field(tvw);
297
308
  }
@@ -333,10 +344,9 @@ TermVector *tv_create(
333
344
  return tv;
334
345
  }
335
346
 
336
- void tv_destroy(void *p)
347
+ void tv_destroy(TermVector *tv)
337
348
  {
338
349
  int i, j;
339
- TermVector *tv = (TermVector *)p;
340
350
  for (i = 0; i < tv->tcnt; i++) {
341
351
  free(tv->terms[i]);
342
352
  }
@@ -357,12 +367,11 @@ void tv_destroy(void *p)
357
367
  free(tv->offsets);
358
368
  }
359
369
  free(tv->freqs);
360
- free(p);
370
+ free(tv);
361
371
  }
362
372
 
363
- void tv_destroy_except_data(void *p)
373
+ void tv_destroy_except_data(TermVector *tv)
364
374
  {
365
- TermVector *tv = (TermVector *)p;
366
375
  free(tv->terms);
367
376
  if (tv->positions != NULL) {
368
377
  free(tv->positions);
@@ -371,23 +380,23 @@ void tv_destroy_except_data(void *p)
371
380
  free(tv->offsets);
372
381
  }
373
382
  free(tv->freqs);
374
- free(p);
383
+ free(tv);
375
384
  }
376
385
 
377
386
  int tvr_check_valid_format(InStream *is)
378
387
  {
379
388
  int format = is_read_int(is);
380
389
  if (format > FORMAT_VERSION)
381
- RAISE(ERROR, FORMAT_VERSION_ERROR_MSG);
390
+ RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
382
391
  return format;
383
392
  }
384
393
 
385
394
  TermVectorsReader *tvr_clone(TermVectorsReader *orig)
386
395
  {
387
396
  TermVectorsReader *clone = NULL;
397
+ clone = ALLOC(TermVectorsReader);
398
+ memcpy(clone, orig, sizeof(TermVectorsReader));
388
399
  if (orig->tvx && orig->tvd && orig->tvf) {
389
- clone = ALLOC(TermVectorsReader);
390
- memcpy(clone, orig, sizeof(TermVectorsReader));
391
400
  clone->tvx = is_clone(orig->tvx);
392
401
  clone->tvd = is_clone(orig->tvd);
393
402
  clone->tvf = is_clone(orig->tvf);
@@ -400,23 +409,30 @@ TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis)
400
409
  TermVectorsReader *tvr = ALLOC(TermVectorsReader);
401
410
  // Open files for TermVector storage
402
411
  char fname[SEGMENT_NAME_MAX_LENGTH];
403
- int segment_len = strlen(segment);
412
+ size_t segment_len = strlen(segment);
413
+ InStream *is;
414
+
404
415
  strcpy(fname, segment);
405
416
 
406
417
  strcpy(fname + segment_len, TVX_EXTENSION);
407
- InStream *is = tvr->tvx = store->open_input(store, fname);
408
- tvr_check_valid_format(is);
409
- tvr->size = is_length(is)/8;
418
+ if (!store->exists(store, fname)) {
419
+ tvr->tvx = tvr->tvd = tvr->tvf = NULL;
420
+ tvr->size = 0;
421
+ } else {
422
+ is = tvr->tvx = store->open_input(store, fname);
423
+ tvr_check_valid_format(is);
424
+ tvr->size = is_length(is)/8;
410
425
 
411
- strcpy(fname + segment_len, TVD_EXTENSION);
412
- is = tvr->tvd = store->open_input(store, fname);
413
- tvr->tvd_format = tvr_check_valid_format(is);
426
+ strcpy(fname + segment_len, TVD_EXTENSION);
427
+ is = tvr->tvd = store->open_input(store, fname);
428
+ tvr->tvd_format = tvr_check_valid_format(is);
414
429
 
415
- strcpy(fname + segment_len, TVF_EXTENSION);
416
- is = tvr->tvf = store->open_input(store, fname);
417
- tvr->tvf_format = tvr_check_valid_format(is);
430
+ strcpy(fname + segment_len, TVF_EXTENSION);
431
+ is = tvr->tvf = store->open_input(store, fname);
432
+ tvr->tvf_format = tvr_check_valid_format(is);
418
433
 
419
- tvr->fis = fis;
434
+ tvr->fis = fis;
435
+ }
420
436
  return tvr;
421
437
  }
422
438
 
@@ -426,9 +442,15 @@ void tvr_close(TermVectorsReader *tvr)
426
442
  * exception, everything else will also be closed. */
427
443
  TRY
428
444
  XFINALLY
429
- is_close(tvr->tvx);
430
- is_close(tvr->tvd);
431
- is_close(tvr->tvf);
445
+ if (tvr->tvx) {
446
+ is_close(tvr->tvx);
447
+ }
448
+ if (tvr->tvd) {
449
+ is_close(tvr->tvd);
450
+ }
451
+ if (tvr->tvf) {
452
+ is_close(tvr->tvf);
453
+ }
432
454
  free(tvr);
433
455
  XENDTRY
434
456
  }
@@ -436,17 +458,29 @@ void tvr_close(TermVectorsReader *tvr)
436
458
  TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
437
459
  char *field, int tvf_pointer)
438
460
  {
439
- int i, j, store_positions, store_offsets, bits;
440
- // Now read the data from specified position
441
- // We don't need to offset by the FORMAT here since the pointer
442
- // already includes the offset
461
+ int i, j, store_positions, store_offsets, bits, num_terms;
462
+ char **terms;
463
+ int *term_freqs;
464
+
465
+ /* we may not need these, but declare them */
466
+ int **positions = NULL;
467
+ TVOffsetInfo ***offsets = NULL;
468
+ int start, delta_length, total_length, freq, prev_pos;
469
+ int start_offset, end_offset, prev_offset;
470
+ int *pos;
471
+ TVOffsetInfo **offs;
472
+ char buffer[MAX_WORD_SIZE] = "";
473
+
474
+ /* Now read the data from specified position. We don't need to offset
475
+ * offset by the FORMAT here since the pointer already includes the offset */
443
476
  is_seek(tvr->tvf, tvf_pointer);
444
-
445
- int num_terms = is_read_vint(tvr->tvf);
446
- // If no terms - return a constant empty termvector. However, this should
447
- // never occur!
448
- if (num_terms == 0)
477
+ num_terms = (int)is_read_vint(tvr->tvf);
478
+
479
+ /* If no terms - return a constant empty termvector. However, this should
480
+ * never occur! */
481
+ if (num_terms == 0) {
449
482
  return tv_create(field, NULL, 0, NULL, NULL, NULL);
483
+ }
450
484
 
451
485
  if(tvr->tvf_format == FORMAT_VERSION) {
452
486
  bits = is_read_byte(tvr->tvf);
@@ -458,41 +492,34 @@ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
458
492
  store_offsets = false;
459
493
  }
460
494
 
461
- char **terms = ALLOC_N(char *, num_terms);
462
- int *term_freqs = ALLOC_N(int, num_terms);
463
-
464
- // we may not need these, but declare them
465
- int **positions = NULL;
466
- TVOffsetInfo ***offsets = NULL;
495
+ terms = ALLOC_N(char *, num_terms);
496
+ term_freqs = ALLOC_N(int, num_terms);
467
497
 
468
- if(store_positions)
498
+ if (store_positions) {
469
499
  positions = ALLOC_N(int *, num_terms);
500
+ }
470
501
 
471
- if(store_offsets)
502
+ if (store_offsets) {
472
503
  offsets = ALLOC_N(TVOffsetInfo **, num_terms);
504
+ }
473
505
 
474
- int start, delta_length, total_length, freq, prev_pos;
475
- int start_offset, end_offset, prev_offset;
476
- int *pos;
477
- TVOffsetInfo **offs;
478
- char buffer[MAX_WORD_SIZE] = "";
479
506
 
480
507
  for (i = 0; i < num_terms; i++) {
481
- start = is_read_vint(tvr->tvf);
482
- delta_length = is_read_vint(tvr->tvf);
508
+ start = (int)is_read_vint(tvr->tvf);
509
+ delta_length = (int)is_read_vint(tvr->tvf);
483
510
  total_length = start + delta_length;
484
511
  is_read_chars(tvr->tvf, buffer, start, delta_length);
485
512
  buffer[total_length] = '\0';
486
513
  terms[i] = estrdup(buffer);
487
- freq = is_read_vint(tvr->tvf);
514
+ freq = (int)is_read_vint(tvr->tvf);
488
515
  term_freqs[i] = freq;
489
516
 
490
- if (store_positions) {//read in the positions
517
+ if (store_positions) {/* read in the positions */
491
518
  pos = ALLOC_N(int, freq);
492
519
  positions[i] = pos;
493
520
  prev_pos = 0;
494
521
  for (j = 0; j < freq; j++) {
495
- pos[j] = prev_pos + is_read_vint(tvr->tvf);
522
+ pos[j] = prev_pos + (int)is_read_vint(tvr->tvf);
496
523
  prev_pos = pos[j];
497
524
  }
498
525
  }
@@ -502,8 +529,8 @@ TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
502
529
  offsets[i] = offs;
503
530
  prev_offset = 0;
504
531
  for (j = 0; j < freq; j++) {
505
- start_offset = prev_offset + is_read_vint(tvr->tvf);
506
- end_offset = start_offset + is_read_vint(tvr->tvf);
532
+ start_offset = prev_offset + (int)is_read_vint(tvr->tvf);
533
+ end_offset = start_offset + (int)is_read_vint(tvr->tvf);
507
534
  offs[j] = tvoi_create(start_offset, end_offset);
508
535
  prev_offset = end_offset;
509
536
  }
@@ -516,39 +543,41 @@ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
516
543
  {
517
544
  int i;
518
545
  Array *tvs = NULL;
519
- // Check if no term vectors are available for this segment at all
546
+ /* Check if no term vectors are available for this segment at all */
520
547
  if (tvr->tvx != NULL) {
521
- // We need to offset by
548
+ int position, field_count;
549
+ /* We need to offset by FORMAT_SIZE when seeking in the tvx file */
522
550
  is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
523
551
 
524
- int position = is_read_long(tvr->tvx);
552
+ position = (int)is_read_long(tvr->tvx);
525
553
 
526
554
  is_seek(tvr->tvd, position);
527
- int field_count = is_read_vint(tvr->tvd);
555
+ field_count = (int)is_read_vint(tvr->tvd);
528
556
 
529
- // No fields are vectorized for this document
557
+ /* No fields are vectorized for this document */
530
558
  if (field_count > 0) {
531
559
  int number = 0;
560
+ int position = 0;
561
+ int *tvf_pointers = ALLOC_N(int, field_count);
532
562
  char **fields = ALLOC_N(char *, field_count);
533
563
 
534
564
  for (i = 0; i < field_count; i++) {
535
- if (tvr->tvd_format == FORMAT_VERSION)
536
- number = is_read_vint(tvr->tvd);
537
- else
538
- number += is_read_vint(tvr->tvd);
565
+ if (tvr->tvd_format == FORMAT_VERSION) {
566
+ number = (int)is_read_vint(tvr->tvd);
567
+ } else {
568
+ number += (int)is_read_vint(tvr->tvd);
569
+ }
539
570
 
540
571
  fields[i] = tvr->fis->by_number[number]->name;
541
572
  }
542
573
 
543
- // Compute position in the tvf file
544
- int position = 0;
545
- int *tvf_pointers = ALLOC_N(int, field_count);
574
+ /* Compute position in the tvf file */
546
575
  for (i = 0; i < field_count; i++) {
547
- position += is_read_vint(tvr->tvd);
576
+ position += (int)is_read_vint(tvr->tvd);
548
577
  tvf_pointers[i] = position;
549
578
  }
550
579
 
551
- tvs = ary_create(field_count, &tv_destroy);
580
+ tvs = ary_create(field_count, (free_ft)&tv_destroy);
552
581
  for (i = 0; i < field_count; i++) {
553
582
  ary_append(tvs, tvr_read_term_vector(tvr, fields[i], tvf_pointers[i]));
554
583
  }
@@ -562,45 +591,45 @@ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num)
562
591
  TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field)
563
592
  {
564
593
  int i;
565
- // Check if no term vectors are available for this segment at all
566
- int field_number = fis_get_number(tvr->fis, field);
594
+ /* Check if no term vectors are available for this segment at all */
595
+ int field_number = (int)fis_get_number(tvr->fis, field);
567
596
  TermVector *tv = NULL;
568
597
 
569
598
  if (tvr->tvx != NULL) {
570
- // We need to account for the FORMAT_SIZE at when seeking in the @tvx
571
- // We don't need to do this in other seeks because we already have the
572
- // file pointer that was written in another file
599
+ int pos, field_count, number = 0, found = -1;
600
+ /* We need to account for the FORMAT_SIZE when seeking in the tvx file.
601
+ * We don't need to do this in other seeks because we already have the
602
+ * file pointer that was written in another file */
573
603
  is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
574
- // puts("TVX Pointer: " + @tvx.pos())
575
- int pos = is_read_long(tvr->tvx);
604
+ //printf("TVX Pointer: %d\n", is_pos(tvr->tvx));
605
+ pos = (int)is_read_long(tvr->tvx);
576
606
 
577
607
  is_seek(tvr->tvd, pos);
578
- int field_count = is_read_vint(tvr->tvd);
579
- //puts("Num Fields: " + field_count)
580
- // There are only a few fields per document. We opt for a full scan
581
- // rather then requiring that they be ordered. We need to read through
582
- // all of the fields anyway to get to the tvf pointers.
583
- int number = 0;
584
- int found = -1;
585
-
608
+ field_count = (int)is_read_vint(tvr->tvd);
609
+ //printf("Num Fields: %d\n", field_count);
610
+ /* There are only a few fields per document. We opt for a full scan
611
+ * rather than requiring that they be ordered. We need to read through
612
+ * all of the fields anyway to get to the tvf pointers. */
586
613
  for (i = 0; i < field_count; i++) {
587
- if (tvr->tvd_format == FORMAT_VERSION)
588
- number = is_read_vint(tvr->tvd);
589
- else
590
- number += is_read_vint(tvr->tvd);
614
+ if (tvr->tvd_format == FORMAT_VERSION) {
615
+ number = (int)is_read_vint(tvr->tvd);
616
+ } else {
617
+ number += (int)is_read_vint(tvr->tvd);
618
+ }
591
619
 
592
- if (number == field_number)
620
+ if (number == field_number) {
593
621
  found = i;
622
+ }
594
623
  }
595
624
 
596
- // This field, although valid in the segment, was not found in this
597
- // document
625
+ /* This field, although valid in the segment, was not found in this
626
+ * document */
598
627
  if (found != -1) {
599
- // Compute pos in the @tvf file
628
+ /* Compute pos in the tvf file */
600
629
  pos = 0;
601
- for (i = 0; i <= found; i++)
602
- pos += is_read_vint(tvr->tvd);
603
-
630
+ for (i = 0; i <= found; i++) {
631
+ pos += (int)is_read_vint(tvr->tvd);
632
+ }
604
633
  tv = tvr_read_term_vector(tvr, field, pos);
605
634
  }
606
635
  }