ferret 0.9.1 → 0.9.2
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
    
        data/ext/termdocs.c
    CHANGED
    
    | @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            #include  | 
| 1 | 
            +
            #include "index.h"
         | 
| 2 2 | 
             
            #include <string.h>
         | 
| 3 3 |  | 
| 4 4 | 
             
            static char * const TPE_VS_TDE_ERROR_MSG = "TermPosEnum does not handle processing multiple documents in one call. Use TermDocEnum instead.";
         | 
| @@ -59,22 +59,24 @@ bool stde_next(TermDocEnum *tde) | |
| 59 59 | 
             
              SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
         | 
| 60 60 | 
             
              while (true) { 
         | 
| 61 61 |  | 
| 62 | 
            -
                if (stde->count >= stde->doc_freq)
         | 
| 62 | 
            +
                if (stde->count >= stde->doc_freq) {
         | 
| 63 63 | 
             
                  return false;
         | 
| 64 | 
            +
                }
         | 
| 64 65 |  | 
| 65 | 
            -
                doc_code = is_read_vint(stde->freq_in);
         | 
| 66 | 
            -
                stde->doc_num += doc_code >> 1;     | 
| 67 | 
            -
                if ((doc_code & 1) != 0) {          | 
| 68 | 
            -
                  stde->freq = 1;                   | 
| 66 | 
            +
                doc_code = (int)is_read_vint(stde->freq_in);
         | 
| 67 | 
            +
                stde->doc_num += doc_code >> 1;    /* shift off low bit */
         | 
| 68 | 
            +
                if ((doc_code & 1) != 0) {         /* if low bit is set */
         | 
| 69 | 
            +
                  stde->freq = 1;                  /* freq is one */
         | 
| 69 70 | 
             
                } else {
         | 
| 70 | 
            -
                  stde->freq = is_read_vint(stde->freq_in);  | 
| 71 | 
            +
                  stde->freq = (int)is_read_vint(stde->freq_in); /* read freq */
         | 
| 71 72 | 
             
                }
         | 
| 72 73 |  | 
| 73 74 | 
             
                stde->count++;
         | 
| 74 75 |  | 
| 75 76 | 
             
                if (stde->deleted_docs == NULL ||
         | 
| 76 | 
            -
                    bv_get(stde->deleted_docs, stde->doc_num) == 0)
         | 
| 77 | 
            -
                  break;  | 
| 77 | 
            +
                    bv_get(stde->deleted_docs, stde->doc_num) == 0) {
         | 
| 78 | 
            +
                  break; /* We found an undeleted doc so return */
         | 
| 79 | 
            +
                }
         | 
| 78 80 |  | 
| 79 81 | 
             
                stde->skip_prox(stde);
         | 
| 80 82 | 
             
              }
         | 
| @@ -90,41 +92,49 @@ int stde_freq(TermDocEnum *tde) | |
| 90 92 | 
             
            bool stde_skip_to(TermDocEnum *tde, int target_doc_num)
         | 
| 91 93 | 
             
            {
         | 
| 92 94 | 
             
              SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
         | 
| 93 | 
            -
              if (stde->doc_freq >= stde->skip_interval) { // optimized case
         | 
| 94 95 |  | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 96 | 
            +
              if (stde->doc_freq >= stde->skip_interval) { /* optimized case */
         | 
| 97 | 
            +
                int last_skip_doc;
         | 
| 98 | 
            +
                int last_freq_pointer;
         | 
| 99 | 
            +
                int last_prox_pointer;
         | 
| 100 | 
            +
                int num_skipped;
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                if (stde->skip_in == NULL) {
         | 
| 103 | 
            +
                  stde->skip_in = is_clone(stde->freq_in); /* lazily clone */
         | 
| 104 | 
            +
                }
         | 
| 97 105 |  | 
| 98 | 
            -
                if (!stde->have_skipped) {                  | 
| 106 | 
            +
                if (!stde->have_skipped) {                 /* lazily seek skip stream */
         | 
| 99 107 | 
             
                  is_seek(stde->skip_in, stde->skip_pointer);
         | 
| 100 108 | 
             
                  stde->have_skipped = true;
         | 
| 101 109 | 
             
                }
         | 
| 102 110 |  | 
| 103 | 
            -
                 | 
| 104 | 
            -
                 | 
| 105 | 
            -
                 | 
| 106 | 
            -
                 | 
| 107 | 
            -
                 | 
| 111 | 
            +
                /* scan skip data */
         | 
| 112 | 
            +
                last_skip_doc = stde->skip_doc;
         | 
| 113 | 
            +
                last_freq_pointer = is_pos(stde->freq_in);
         | 
| 114 | 
            +
                last_prox_pointer = -1;
         | 
| 115 | 
            +
                num_skipped = -1 - (stde->count % stde->skip_interval);
         | 
| 108 116 |  | 
| 109 117 | 
             
                while (target_doc_num > stde->skip_doc) {
         | 
| 110 118 | 
             
                  last_skip_doc = stde->skip_doc;
         | 
| 111 119 | 
             
                  last_freq_pointer = stde->freq_pointer;
         | 
| 112 120 | 
             
                  last_prox_pointer = stde->prox_pointer;
         | 
| 113 121 |  | 
| 114 | 
            -
                  if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num)
         | 
| 122 | 
            +
                  if (stde->skip_doc != 0 && stde->skip_doc >= stde->doc_num) {
         | 
| 115 123 | 
             
                    num_skipped += stde->skip_interval;
         | 
| 124 | 
            +
                  }
         | 
| 116 125 |  | 
| 117 | 
            -
                  if(stde->skip_count >= stde->num_skips)
         | 
| 126 | 
            +
                  if(stde->skip_count >= stde->num_skips) {
         | 
| 118 127 | 
             
                    break;
         | 
| 128 | 
            +
                  }
         | 
| 119 129 |  | 
| 120 | 
            -
                  stde->skip_doc += is_read_vint(stde->skip_in);
         | 
| 121 | 
            -
                  stde->freq_pointer += is_read_vint(stde->skip_in);
         | 
| 122 | 
            -
                  stde->prox_pointer += is_read_vint(stde->skip_in);
         | 
| 130 | 
            +
                  stde->skip_doc += (int)is_read_vint(stde->skip_in);
         | 
| 131 | 
            +
                  stde->freq_pointer += (int)is_read_vint(stde->skip_in);
         | 
| 132 | 
            +
                  stde->prox_pointer += (int)is_read_vint(stde->skip_in);
         | 
| 123 133 |  | 
| 124 134 | 
             
                  stde->skip_count++;
         | 
| 125 135 | 
             
                }
         | 
| 126 136 |  | 
| 127 | 
            -
                 | 
| 137 | 
            +
                /* if we found something to skip, so skip it */
         | 
| 128 138 | 
             
                if (last_freq_pointer > is_pos(stde->freq_in)) {
         | 
| 129 139 | 
             
                  is_seek(stde->freq_in, last_freq_pointer);
         | 
| 130 140 | 
             
                  stde->seek_prox(stde, last_prox_pointer);
         | 
| @@ -134,7 +144,7 @@ bool stde_skip_to(TermDocEnum *tde, int target_doc_num) | |
| 134 144 | 
             
                }
         | 
| 135 145 | 
             
              }
         | 
| 136 146 |  | 
| 137 | 
            -
               | 
| 147 | 
            +
              /* done skipping, now just scan */
         | 
| 138 148 | 
             
              do { 
         | 
| 139 149 | 
             
                if (! tde->next(tde)) {
         | 
| 140 150 | 
             
                  return false;
         | 
| @@ -148,13 +158,14 @@ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num) | |
| 148 158 | 
             
              SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
         | 
| 149 159 | 
             
              int i = 0, doc_code;
         | 
| 150 160 | 
             
              while (i < req_num && stde->count < stde->doc_freq) {
         | 
| 151 | 
            -
                 | 
| 152 | 
            -
                doc_code = is_read_vint(stde->freq_in);
         | 
| 153 | 
            -
                stde->doc_num += doc_code >> 1; | 
| 154 | 
            -
                if ((doc_code & 1) != 0) | 
| 155 | 
            -
                  stde->freq = 1; | 
| 156 | 
            -
                else
         | 
| 157 | 
            -
                  stde->freq = is_read_vint(stde->freq_in); | 
| 161 | 
            +
                /* manually inlined call to next() for speed */
         | 
| 162 | 
            +
                doc_code = (int)is_read_vint(stde->freq_in);
         | 
| 163 | 
            +
                stde->doc_num += (doc_code >> 1);            /* shift off low bit */
         | 
| 164 | 
            +
                if ((doc_code & 1) != 0) {                   /* if low bit is set */
         | 
| 165 | 
            +
                  stde->freq = 1;                            /* freq is one */
         | 
| 166 | 
            +
                } else {
         | 
| 167 | 
            +
                  stde->freq = (int)is_read_vint(stde->freq_in);  /* else read freq */
         | 
| 168 | 
            +
                }
         | 
| 158 169 |  | 
| 159 170 | 
             
                stde->count++;
         | 
| 160 171 |  | 
| @@ -170,6 +181,7 @@ int stde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num) | |
| 170 181 |  | 
| 171 182 | 
             
            TermDocEnum *stde_create(IndexReader *ir)
         | 
| 172 183 | 
             
            {
         | 
| 184 | 
            +
              SegmentTermDocEnum *stde = ALLOC_AND_ZERO(SegmentTermDocEnum);
         | 
| 173 185 | 
             
              SegmentReader *sr = (SegmentReader *)ir->data;
         | 
| 174 186 | 
             
              TermDocEnum *tde = ALLOC(TermDocEnum);
         | 
| 175 187 | 
             
              tde->seek = &stde_seek;
         | 
| @@ -181,8 +193,6 @@ TermDocEnum *stde_create(IndexReader *ir) | |
| 181 193 | 
             
              tde->next_position = NULL;
         | 
| 182 194 | 
             
              tde->close = &stde_close;
         | 
| 183 195 |  | 
| 184 | 
            -
              SegmentTermDocEnum *stde = ALLOC(SegmentTermDocEnum);
         | 
| 185 | 
            -
              ZEROSET(stde, SegmentTermDocEnum, 1); // set all values to 0
         | 
| 186 196 | 
             
              tde->data = stde;
         | 
| 187 197 | 
             
              stde->parent = sr;
         | 
| 188 198 | 
             
              stde->freq_in = is_clone(sr->freq_in);
         | 
| @@ -260,11 +270,12 @@ int stpe_next_position(TermDocEnum *tde) | |
| 260 270 | 
             
            {
         | 
| 261 271 | 
             
              SegmentTermDocEnum *stde = (SegmentTermDocEnum *)tde->data;
         | 
| 262 272 | 
             
              stde->prox_cnt--;
         | 
| 263 | 
            -
              return stde->position += is_read_vint(stde->prox_in);
         | 
| 273 | 
            +
              return stde->position += (int)is_read_vint(stde->prox_in);
         | 
| 264 274 | 
             
            }
         | 
| 265 275 |  | 
| 266 276 | 
             
            TermDocEnum *stpe_create(IndexReader *ir)
         | 
| 267 277 | 
             
            {
         | 
| 278 | 
            +
              SegmentTermDocEnum *stde;
         | 
| 268 279 | 
             
              SegmentReader *sr = (SegmentReader *)ir->data;
         | 
| 269 280 | 
             
              TermDocEnum *tde = stde_create(ir);
         | 
| 270 281 | 
             
              tde->close = &stpe_close;
         | 
| @@ -273,7 +284,7 @@ TermDocEnum *stpe_create(IndexReader *ir) | |
| 273 284 | 
             
              tde->read = &stpe_read;
         | 
| 274 285 | 
             
              tde->next_position = &stpe_next_position;
         | 
| 275 286 |  | 
| 276 | 
            -
               | 
| 287 | 
            +
              stde = (SegmentTermDocEnum *)tde->data;
         | 
| 277 288 | 
             
              stde->prox_in = is_clone(sr->prox_in);
         | 
| 278 289 | 
             
              stde->prox_cnt = 0;
         | 
| 279 290 | 
             
              stde->position = 0;
         | 
| @@ -321,16 +332,18 @@ TermDocEnum *mtde_term_docs_from_reader(IndexReader *ir) | |
| 321 332 |  | 
| 322 333 | 
             
            TermDocEnum *mtde_term_docs(MultiTermDocEnum *mtde, int i)
         | 
| 323 334 | 
             
            {
         | 
| 324 | 
            -
              if (mtde->term == NULL)
         | 
| 335 | 
            +
              if (mtde->term == NULL) {
         | 
| 325 336 | 
             
                return NULL;
         | 
| 337 | 
            +
              } else {
         | 
| 326 338 |  | 
| 327 | 
            -
             | 
| 328 | 
            -
             | 
| 329 | 
            -
             | 
| 330 | 
            -
             | 
| 339 | 
            +
                TermDocEnum *tde = mtde->irs_tde[i];
         | 
| 340 | 
            +
                if (tde == NULL) {
         | 
| 341 | 
            +
                  tde = mtde->irs_tde[i] = mtde->term_docs_from_reader(mtde->irs[i]);
         | 
| 342 | 
            +
                }
         | 
| 331 343 |  | 
| 332 | 
            -
             | 
| 333 | 
            -
             | 
| 344 | 
            +
                tde->seek(tde, mtde->term);
         | 
| 345 | 
            +
                return tde;
         | 
| 346 | 
            +
              }
         | 
| 334 347 | 
             
            }
         | 
| 335 348 |  | 
| 336 349 | 
             
            bool mtde_next(TermDocEnum *tde)
         | 
| @@ -411,6 +424,7 @@ int mtde_read(TermDocEnum *tde, int *docs, int *freqs, int req_num) | |
| 411 424 |  | 
| 412 425 | 
             
            TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt)
         | 
| 413 426 | 
             
            {
         | 
| 427 | 
            +
              MultiTermDocEnum *mtde = ALLOC_AND_ZERO(MultiTermDocEnum);
         | 
| 414 428 | 
             
              TermDocEnum *tde = ALLOC(TermDocEnum);
         | 
| 415 429 | 
             
              tde->close = &mtde_close;
         | 
| 416 430 | 
             
              tde->seek = &mtde_seek;
         | 
| @@ -421,8 +435,6 @@ TermDocEnum *mtde_create(IndexReader **irs, int *starts, int ir_cnt) | |
| 421 435 | 
             
              tde->read = &mtde_read;
         | 
| 422 436 | 
             
              tde->next_position = NULL;
         | 
| 423 437 |  | 
| 424 | 
            -
              MultiTermDocEnum *mtde = ALLOC(MultiTermDocEnum);
         | 
| 425 | 
            -
              ZEROSET(mtde, MultiTermDocEnum, 1); // set all values to 0
         | 
| 426 438 | 
             
              tde->data = mtde;
         | 
| 427 439 | 
             
              mtde->irs = irs;
         | 
| 428 440 | 
             
              mtde->starts = starts;
         | 
| @@ -467,8 +479,7 @@ TermDocEnum *mtpe_create(IndexReader **irs, int *starts, int ir_cnt) | |
| 467 479 | 
             
             ****************************************************************************/
         | 
| 468 480 |  | 
| 469 481 | 
             
            #define GET_MTDPE MultipleTermDocPosEnum *mtdpe = (MultipleTermDocPosEnum *)self->data
         | 
| 470 | 
            -
            void tde_destroy( | 
| 471 | 
            -
              TermDocEnum *self = (TermDocEnum *)p;
         | 
| 482 | 
            +
            void tde_destroy(TermDocEnum *self) {
         | 
| 472 483 | 
             
              self->close(self);
         | 
| 473 484 | 
             
            }
         | 
| 474 485 |  | 
| @@ -570,12 +581,26 @@ int mtdpe_next_position(TermDocEnum *self) | |
| 570 581 |  | 
| 571 582 | 
             
            TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt)
         | 
| 572 583 | 
             
            {
         | 
| 584 | 
            +
              int i;
         | 
| 573 585 | 
             
              TermDocEnum *self = ALLOC(TermDocEnum);
         | 
| 574 | 
            -
              MultipleTermDocPosEnum *mtdpe =  | 
| 586 | 
            +
              MultipleTermDocPosEnum *mtdpe = ALLOC_AND_ZERO_N(MultipleTermDocPosEnum, 1);
         | 
| 575 587 | 
             
              PriorityQueue *pq;
         | 
| 576 588 | 
             
              TermDocEnum *tpe;
         | 
| 577 | 
            -
              int i;
         | 
| 578 589 |  | 
| 590 | 
            +
              pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
         | 
| 591 | 
            +
              mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
         | 
| 592 | 
            +
              mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
         | 
| 593 | 
            +
              for (i = 0; i < t_cnt; i++) {
         | 
| 594 | 
            +
                tpe = ir_term_positions_for(ir, terms[i]);
         | 
| 595 | 
            +
                if (tpe->next(tpe)) {
         | 
| 596 | 
            +
                  pq_push(pq, tpe);
         | 
| 597 | 
            +
                } else {
         | 
| 598 | 
            +
                  tpe->close(tpe);
         | 
| 599 | 
            +
                }
         | 
| 600 | 
            +
              }
         | 
| 601 | 
            +
              pq->free_elem = (free_ft)&tde_destroy;
         | 
| 602 | 
            +
             | 
| 603 | 
            +
              self->data = mtdpe;
         | 
| 579 604 | 
             
              self->close = &mtdpe_close;
         | 
| 580 605 | 
             
              self->seek = &mtdpe_seek;
         | 
| 581 606 | 
             
              self->next = &mtdpe_next;
         | 
| @@ -585,17 +610,6 @@ TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt) | |
| 585 610 | 
             
              self->read = &mtdpe_read;
         | 
| 586 611 | 
             
              self->next_position = &mtdpe_next_position;
         | 
| 587 612 |  | 
| 588 | 
            -
              ZEROSET(mtdpe, MultipleTermDocPosEnum, 1); // set all values to 0
         | 
| 589 | 
            -
              self->data = mtdpe;
         | 
| 590 | 
            -
              pq = mtdpe->pq = pq_create(t_cnt, &tdpe_less_than);
         | 
| 591 | 
            -
              mtdpe->pos_queue_capa = MTDPE_POS_QUEUE_INIT_CAPA;
         | 
| 592 | 
            -
              mtdpe->pos_queue = ALLOC_N(int, MTDPE_POS_QUEUE_INIT_CAPA);
         | 
| 593 | 
            -
              for (i = 0; i < t_cnt; i++) {
         | 
| 594 | 
            -
                tpe = ir_term_positions_for(ir, terms[i]);
         | 
| 595 | 
            -
                if (tpe->next(tpe)) pq_push(pq, tpe);
         | 
| 596 | 
            -
              }
         | 
| 597 | 
            -
              pq->free_elem = &tde_destroy;
         | 
| 598 | 
            -
             | 
| 599 613 | 
             
              return self;
         | 
| 600 614 | 
             
            }
         | 
| 601 615 |  | 
    
        data/ext/vector.c
    CHANGED
    
    | @@ -1,6 +1,6 @@ | |
| 1 | 
            -
            #include  | 
| 1 | 
            +
            #include "index.h"
         | 
| 2 | 
            +
            #include "helper.h"
         | 
| 2 3 | 
             
            #include <string.h>
         | 
| 3 | 
            -
            #include <helper.h>
         | 
| 4 4 |  | 
| 5 5 | 
             
            static char * const NULL_POS_ERROR_MSG = "Trying to write positions that are null!";
         | 
| 6 6 | 
             
            static char * const NULL_OFFSETS_ERROR_MSG = "Trying to write offsets that are null!";
         | 
| @@ -50,33 +50,25 @@ TVTerm *tvt_create(char *text, int freq, int *positions, TVOffsetInfo **offsets) | |
| 50 50 |  | 
| 51 51 | 
             
            void tvt_destroy(void *p)
         | 
| 52 52 | 
             
            {
         | 
| 53 | 
            -
              //int i;
         | 
| 54 | 
            -
              //TVTerm *tvt = (TVTerm *)p;
         | 
| 55 | 
            -
              //free(tvt->text);
         | 
| 56 | 
            -
              //free(tvt->positions);
         | 
| 57 | 
            -
              //if (tvt->offsets != NULL) {
         | 
| 58 | 
            -
              //  for (i = 0; i < tvt->freq; i++) {
         | 
| 59 | 
            -
              //    tvoi_destroy(tvt->offsets[i]);
         | 
| 60 | 
            -
              //  }
         | 
| 61 | 
            -
              //  free(tvt->offsets);
         | 
| 62 | 
            -
              //}
         | 
| 63 53 | 
             
              free(p);
         | 
| 64 54 | 
             
            }
         | 
| 65 55 |  | 
| 66 56 |  | 
| 67 57 | 
             
            TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis)
         | 
| 68 58 | 
             
            {
         | 
| 59 | 
            +
              char fname[SEGMENT_NAME_MAX_LENGTH];
         | 
| 60 | 
            +
              size_t segment_len = strlen(segment);
         | 
| 69 61 | 
             
              TermVectorsWriter *tvw = ALLOC(TermVectorsWriter);
         | 
| 62 | 
            +
              OutStream *os;
         | 
| 63 | 
            +
             | 
| 70 64 | 
             
              tvw->curr_field = NULL;
         | 
| 71 65 | 
             
              tvw->curr_doc_pointer = -1;
         | 
| 72 66 |  | 
| 73 | 
            -
               | 
| 74 | 
            -
              char fname[SEGMENT_NAME_MAX_LENGTH];
         | 
| 75 | 
            -
              int segment_len = strlen(segment);
         | 
| 67 | 
            +
              /* Open files for TermVector storage */
         | 
| 76 68 | 
             
              strcpy(fname, segment);
         | 
| 77 69 |  | 
| 78 70 | 
             
              strcpy(fname + segment_len, TVX_EXTENSION);
         | 
| 79 | 
            -
               | 
| 71 | 
            +
              os = tvw->tvx = store->create_output(store, fname);
         | 
| 80 72 | 
             
              os_write_int(os, FORMAT_VERSION);
         | 
| 81 73 |  | 
| 82 74 | 
             
              strcpy(fname + segment_len, TVD_EXTENSION);
         | 
| @@ -104,43 +96,48 @@ void tvw_write_field(TermVectorsWriter *tvw) | |
| 104 96 | 
             
              int i, j, start, length;
         | 
| 105 97 | 
             
              char *last_term_text;
         | 
| 106 98 | 
             
              TVOffsetInfo *tmp_offset;
         | 
| 107 | 
            -
               | 
| 99 | 
            +
              TVTerm **terms = tvw->terms;
         | 
| 100 | 
            +
              TVTerm *term;
         | 
| 101 | 
            +
              /* remember where this field is written */
         | 
| 108 102 | 
             
              OutStream *tvf = tvw->tvf;
         | 
| 103 | 
            +
              int store_positions = tvw->curr_field->store_positions;
         | 
| 104 | 
            +
              int store_offsets = tvw->curr_field->store_offsets;
         | 
| 105 | 
            +
              uchar bits = 0x0;
         | 
| 106 | 
            +
             | 
| 109 107 | 
             
              tvw->curr_field->tvf_pointer = os_pos(tvf);
         | 
| 110 108 |  | 
| 111 | 
            -
               | 
| 109 | 
            +
              /* write the number of terms */
         | 
| 112 110 | 
             
              os_write_vint(tvf, tvw->tcnt);
         | 
| 113 111 |  | 
| 114 | 
            -
               | 
| 115 | 
            -
              int store_offsets = tvw->curr_field->store_offsets;
         | 
| 116 | 
            -
              int bits = 0x0;
         | 
| 117 | 
            -
              if (store_positions) 
         | 
| 112 | 
            +
              if (store_positions) {
         | 
| 118 113 | 
             
                bits |= STORE_POSITIONS_WITH_TERMVECTOR;
         | 
| 114 | 
            +
              }
         | 
| 119 115 |  | 
| 120 | 
            -
              if (store_offsets) 
         | 
| 116 | 
            +
              if (store_offsets) {
         | 
| 121 117 | 
             
                bits |= STORE_OFFSET_WITH_TERMVECTOR;
         | 
| 118 | 
            +
              }
         | 
| 122 119 |  | 
| 123 | 
            -
              os_write_byte(tvf, bits);
         | 
| 120 | 
            +
              os_write_byte(tvf, (uchar)bits);
         | 
| 124 121 |  | 
| 125 122 | 
             
              last_term_text = (char *)EMPTY_STRING;
         | 
| 126 | 
            -
              TVTerm **terms = tvw->terms;
         | 
| 127 | 
            -
              TVTerm *term;
         | 
| 128 123 | 
             
              for (i = 0; i < tvw->tcnt; i++) {
         | 
| 129 124 | 
             
                term = terms[i];
         | 
| 130 125 | 
             
                start = hlp_string_diff(last_term_text, term->text);
         | 
| 131 | 
            -
                length = strlen(term->text) - start;
         | 
| 132 | 
            -
                os_write_vint(tvf, start);        | 
| 133 | 
            -
                os_write_vint(tvf, length);       | 
| 134 | 
            -
                os_write_chars(tvf, term->text, start, length);   | 
| 126 | 
            +
                length = (int)strlen(term->text) - start;
         | 
| 127 | 
            +
                os_write_vint(tvf, start);       /* write shared prefix length */
         | 
| 128 | 
            +
                os_write_vint(tvf, length);      /* write delta length */
         | 
| 129 | 
            +
                os_write_chars(tvf, term->text, start, length);  /* write delta chars */
         | 
| 135 130 | 
             
                os_write_vint(tvf, term->freq);
         | 
| 136 131 | 
             
                last_term_text = term->text;
         | 
| 137 132 |  | 
| 138 133 | 
             
                if (store_positions) {
         | 
| 139 | 
            -
                  if (term->positions == NULL)
         | 
| 140 | 
            -
                    RAISE(IO_ERROR, NULL_POS_ERROR_MSG);
         | 
| 141 | 
            -
                  
         | 
| 142 | 
            -
                  // use delta encoding for positions
         | 
| 143 134 | 
             
                  int last_pos = 0;
         | 
| 135 | 
            +
                  
         | 
| 136 | 
            +
                  if (term->positions == NULL) {
         | 
| 137 | 
            +
                    RAISE(IO_ERROR, NULL_POS_ERROR_MSG);
         | 
| 138 | 
            +
                  }
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                  /* use delta encoding for positions */
         | 
| 144 141 | 
             
                  for (j = 0; j < term->freq; j++) {
         | 
| 145 142 | 
             
                    os_write_vint(tvf, term->positions[j] - last_pos);
         | 
| 146 143 | 
             
                    last_pos = term->positions[j];
         | 
| @@ -148,16 +145,18 @@ void tvw_write_field(TermVectorsWriter *tvw) | |
| 148 145 | 
             
                }
         | 
| 149 146 |  | 
| 150 147 | 
             
                if (store_offsets) {
         | 
| 151 | 
            -
                  if (term->offsets == NULL)
         | 
| 152 | 
            -
                    RAISE(IO_ERROR, NULL_OFFSETS_ERROR_MSG);
         | 
| 153 | 
            -
                  
         | 
| 154 | 
            -
                  // use delta encoding for offsets
         | 
| 155 148 | 
             
                  int last_end = 0;
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                  if (term->offsets == NULL) {
         | 
| 151 | 
            +
                    RAISE(IO_ERROR, NULL_OFFSETS_ERROR_MSG);
         | 
| 152 | 
            +
                  }
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                  /* use delta encoding for offsets */
         | 
| 156 155 | 
             
                  for (j = 0; j < term->freq; j++) {
         | 
| 157 156 | 
             
                    tmp_offset = term->offsets[j];
         | 
| 158 157 | 
             
                    os_write_vint(tvf, tmp_offset->start - last_end);
         | 
| 159 158 |  | 
| 160 | 
            -
                     | 
| 159 | 
            +
                    /* save the diff between the two */
         | 
| 161 160 | 
             
                    os_write_vint(tvf, tmp_offset->end - tmp_offset->start);
         | 
| 162 161 | 
             
                    last_end = tmp_offset->end;
         | 
| 163 162 | 
             
                  }
         | 
| @@ -169,13 +168,14 @@ void tvw_close_field(TermVectorsWriter *tvw) | |
| 169 168 | 
             
            {
         | 
| 170 169 | 
             
              int i;
         | 
| 171 170 | 
             
              if (tvw->curr_field != NULL) {
         | 
| 172 | 
            -
                 | 
| 171 | 
            +
                /* save field and terms */
         | 
| 173 172 | 
             
                tvw_write_field(tvw);
         | 
| 174 173 |  | 
| 175 174 | 
             
                if (tvw->fcnt >= tvw->fsize) {
         | 
| 176 175 | 
             
                  tvw->fsize *=2;
         | 
| 177 | 
            -
                  if (tvw->fsize < FIELD_ARR_START_SIZE)
         | 
| 176 | 
            +
                  if (tvw->fsize < FIELD_ARR_START_SIZE) {
         | 
| 178 177 | 
             
                    tvw->fsize = FIELD_ARR_START_SIZE;
         | 
| 178 | 
            +
                  }
         | 
| 179 179 | 
             
                  REALLOC_N(tvw->fields, TVField *, tvw->fsize);
         | 
| 180 180 | 
             
                }
         | 
| 181 181 | 
             
                tvw->fields[tvw->fcnt] = tvw->curr_field;
         | 
| @@ -205,30 +205,34 @@ void tvw_open_field(TermVectorsWriter *tvw, char *field) | |
| 205 205 |  | 
| 206 206 | 
             
            void tvw_write_doc(TermVectorsWriter *tvw)
         | 
| 207 207 | 
             
            {
         | 
| 208 | 
            -
               | 
| 208 | 
            +
              OutStream *tvd = tvw->tvd;
         | 
| 209 | 
            +
              int i;
         | 
| 210 | 
            +
              TVField **fields = tvw->fields;
         | 
| 211 | 
            +
              int last_field_pointer = 0;
         | 
| 212 | 
            +
             | 
| 213 | 
            +
              if (tvw->curr_field != NULL) {
         | 
| 209 214 | 
             
                RAISE(STATE_ERROR, FIELD_OPEN_ERROR_MSG);
         | 
| 215 | 
            +
              }
         | 
| 210 216 |  | 
| 211 | 
            -
              // | 
| 212 | 
            -
               | 
| 217 | 
            +
              //printf("Writing doc pointer: %d\n", tvw->curr_doc_pointer);
         | 
| 218 | 
            +
              /*  write document index record */
         | 
| 213 219 | 
             
              os_write_long(tvw->tvx, tvw->curr_doc_pointer);
         | 
| 214 220 |  | 
| 215 | 
            -
               | 
| 216 | 
            -
               | 
| 221 | 
            +
              //printf("Writing field count: %ld, %d, %d -> ", (long long)tvw, tvw->fcnt, os_pos(tvd));
         | 
| 222 | 
            +
              /* write the number of @fields */
         | 
| 217 223 | 
             
              os_write_vint(tvd, tvw->fcnt);
         | 
| 218 | 
            -
             | 
| 219 | 
            -
               | 
| 220 | 
            -
              int i;
         | 
| 221 | 
            -
              TVField **fields = tvw->fields;
         | 
| 224 | 
            +
              
         | 
| 225 | 
            +
              /* write field numbers */
         | 
| 222 226 | 
             
              for (i = 0; i < tvw->fcnt; i++) {
         | 
| 223 227 | 
             
                os_write_vint(tvd, fields[i]->number);
         | 
| 224 228 | 
             
              }
         | 
| 225 229 |  | 
| 226 | 
            -
               | 
| 227 | 
            -
              int last_field_pointer = 0;
         | 
| 230 | 
            +
              /* write field pointers */
         | 
| 228 231 | 
             
              for (i = 0; i < tvw->fcnt; i++) {
         | 
| 229 232 | 
             
                os_write_vint(tvd, fields[i]->tvf_pointer - last_field_pointer);
         | 
| 230 233 | 
             
                last_field_pointer = fields[i]->tvf_pointer;
         | 
| 231 234 | 
             
              }
         | 
| 235 | 
            +
              //printf("%d\n", os_pos(tvw->tvd));
         | 
| 232 236 | 
             
            }
         | 
| 233 237 |  | 
| 234 238 | 
             
            void tvw_close_doc(TermVectorsWriter *tvw)
         | 
| @@ -257,8 +261,10 @@ void tvw_add_term(TermVectorsWriter *tvw, | |
| 257 261 | 
             
            {
         | 
| 258 262 | 
             
              if (tvw->tcnt >= tvw->tsize) {
         | 
| 259 263 | 
             
                tvw->tsize *= 2;
         | 
| 260 | 
            -
                if (tvw->tsize < TERM_ARR_START_SIZE) 
         | 
| 264 | 
            +
                if (tvw->tsize < TERM_ARR_START_SIZE) {
         | 
| 261 265 | 
             
                  tvw->tsize = TERM_ARR_START_SIZE;
         | 
| 266 | 
            +
                }
         | 
| 267 | 
            +
             | 
| 262 268 | 
             
                REALLOC_N(tvw->terms, TVTerm *, tvw->tsize);
         | 
| 263 269 | 
             
              }
         | 
| 264 270 | 
             
              tvw->terms[tvw->tcnt] = tvt_create(text, freq, positions, offsets);
         | 
| @@ -267,31 +273,36 @@ void tvw_add_term(TermVectorsWriter *tvw, | |
| 267 273 |  | 
| 268 274 | 
             
            void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors)
         | 
| 269 275 | 
             
            {
         | 
| 270 | 
            -
              tvw_open_doc(tvw);
         | 
| 271 | 
            -
             | 
| 272 276 | 
             
              int i, j, store_positions, store_offsets;
         | 
| 273 277 | 
             
              TermVector *tv;
         | 
| 278 | 
            +
             | 
| 279 | 
            +
              tvw_open_doc(tvw);
         | 
| 280 | 
            +
             | 
| 274 281 | 
             
              for (i = 0; i < vectors->size; i++) {
         | 
| 275 282 | 
             
                tv = vectors->elems[i];
         | 
| 276 283 |  | 
| 277 284 | 
             
                store_positions = (tv->tcnt > 0 && tv->positions != NULL);
         | 
| 278 285 | 
             
                store_offsets = (tv->tcnt > 0 && tv->offsets != NULL);
         | 
| 279 286 |  | 
| 280 | 
            -
                tvw_create_field(tvw, fis_get_number(tvw->fis, tv->field),
         | 
| 287 | 
            +
                tvw_create_field(tvw, (int)fis_get_number(tvw->fis, tv->field),
         | 
| 281 288 | 
             
                             store_positions, store_offsets);
         | 
| 282 289 |  | 
| 283 290 | 
             
                if (store_positions && store_offsets) {
         | 
| 284 | 
            -
                  for (j = 0; j < tv->tcnt; j++)
         | 
| 291 | 
            +
                  for (j = 0; j < tv->tcnt; j++) {
         | 
| 285 292 | 
             
                    tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], tv->offsets[j]);
         | 
| 293 | 
            +
                  }
         | 
| 286 294 | 
             
                } else if (store_positions) {
         | 
| 287 | 
            -
                  for (j = 0; j < tv->tcnt; j++)
         | 
| 295 | 
            +
                  for (j = 0; j < tv->tcnt; j++) {
         | 
| 288 296 | 
             
                    tvw_add_term(tvw, tv->terms[j], tv->freqs[j], tv->positions[j], NULL);
         | 
| 297 | 
            +
                  }
         | 
| 289 298 | 
             
                } else if (store_offsets) {
         | 
| 290 | 
            -
                  for (j = 0; j < tv->tcnt; j++)
         | 
| 299 | 
            +
                  for (j = 0; j < tv->tcnt; j++) {
         | 
| 291 300 | 
             
                    tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, tv->offsets[j]);
         | 
| 301 | 
            +
                  }
         | 
| 292 302 | 
             
                } else {
         | 
| 293 | 
            -
                  for (j = 0; j < tv->tcnt; j++)
         | 
| 303 | 
            +
                  for (j = 0; j < tv->tcnt; j++) {
         | 
| 294 304 | 
             
                    tvw_add_term(tvw, tv->terms[j], tv->freqs[j], NULL, NULL);
         | 
| 305 | 
            +
                  }
         | 
| 295 306 | 
             
                }
         | 
| 296 307 | 
             
                tvw_close_field(tvw);
         | 
| 297 308 | 
             
              }
         | 
| @@ -333,10 +344,9 @@ TermVector *tv_create( | |
| 333 344 | 
             
              return tv;
         | 
| 334 345 | 
             
            }
         | 
| 335 346 |  | 
| 336 | 
            -
            void tv_destroy( | 
| 347 | 
            +
            void tv_destroy(TermVector *tv)
         | 
| 337 348 | 
             
            {
         | 
| 338 349 | 
             
              int i, j;
         | 
| 339 | 
            -
              TermVector *tv = (TermVector *)p;
         | 
| 340 350 | 
             
              for (i = 0; i < tv->tcnt; i++) {
         | 
| 341 351 | 
             
                free(tv->terms[i]);
         | 
| 342 352 | 
             
              }
         | 
| @@ -357,12 +367,11 @@ void tv_destroy(void *p) | |
| 357 367 | 
             
                free(tv->offsets);
         | 
| 358 368 | 
             
              }
         | 
| 359 369 | 
             
              free(tv->freqs);
         | 
| 360 | 
            -
              free( | 
| 370 | 
            +
              free(tv);
         | 
| 361 371 | 
             
            }
         | 
| 362 372 |  | 
| 363 | 
            -
            void tv_destroy_except_data( | 
| 373 | 
            +
            void tv_destroy_except_data(TermVector *tv)
         | 
| 364 374 | 
             
            {
         | 
| 365 | 
            -
              TermVector *tv = (TermVector *)p;
         | 
| 366 375 | 
             
              free(tv->terms);
         | 
| 367 376 | 
             
              if (tv->positions != NULL) {
         | 
| 368 377 | 
             
                free(tv->positions);
         | 
| @@ -371,23 +380,23 @@ void tv_destroy_except_data(void *p) | |
| 371 380 | 
             
                free(tv->offsets);
         | 
| 372 381 | 
             
              }
         | 
| 373 382 | 
             
              free(tv->freqs);
         | 
| 374 | 
            -
              free( | 
| 383 | 
            +
              free(tv);
         | 
| 375 384 | 
             
            }
         | 
| 376 385 |  | 
| 377 386 | 
             
            int tvr_check_valid_format(InStream *is)
         | 
| 378 387 | 
             
            {
         | 
| 379 388 | 
             
              int format = is_read_int(is);
         | 
| 380 389 | 
             
              if (format > FORMAT_VERSION)
         | 
| 381 | 
            -
                RAISE( | 
| 390 | 
            +
                RAISE(EXCEPTION, FORMAT_VERSION_ERROR_MSG);
         | 
| 382 391 | 
             
              return format;
         | 
| 383 392 | 
             
            }
         | 
| 384 393 |  | 
| 385 394 | 
             
            TermVectorsReader *tvr_clone(TermVectorsReader *orig)
         | 
| 386 395 | 
             
            {
         | 
| 387 396 | 
             
              TermVectorsReader *clone = NULL;
         | 
| 397 | 
            +
              clone = ALLOC(TermVectorsReader);
         | 
| 398 | 
            +
              memcpy(clone, orig, sizeof(TermVectorsReader));
         | 
| 388 399 | 
             
              if (orig->tvx && orig->tvd && orig->tvf) { 
         | 
| 389 | 
            -
                clone = ALLOC(TermVectorsReader);
         | 
| 390 | 
            -
                memcpy(clone, orig, sizeof(TermVectorsReader));
         | 
| 391 400 | 
             
                clone->tvx = is_clone(orig->tvx);
         | 
| 392 401 | 
             
                clone->tvd = is_clone(orig->tvd);
         | 
| 393 402 | 
             
                clone->tvf = is_clone(orig->tvf);
         | 
| @@ -400,23 +409,30 @@ TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis) | |
| 400 409 | 
             
              TermVectorsReader *tvr = ALLOC(TermVectorsReader);
         | 
| 401 410 | 
             
              // Open files for TermVector storage
         | 
| 402 411 | 
             
              char fname[SEGMENT_NAME_MAX_LENGTH];
         | 
| 403 | 
            -
               | 
| 412 | 
            +
              size_t segment_len = strlen(segment);
         | 
| 413 | 
            +
              InStream *is;
         | 
| 414 | 
            +
             | 
| 404 415 | 
             
              strcpy(fname, segment);
         | 
| 405 416 |  | 
| 406 417 | 
             
              strcpy(fname + segment_len, TVX_EXTENSION);
         | 
| 407 | 
            -
               | 
| 408 | 
            -
             | 
| 409 | 
            -
             | 
| 418 | 
            +
              if (!store->exists(store, fname)) {
         | 
| 419 | 
            +
                tvr->tvx = tvr->tvd = tvr->tvf = NULL;
         | 
| 420 | 
            +
                tvr->size = 0;
         | 
| 421 | 
            +
              } else {
         | 
| 422 | 
            +
                is = tvr->tvx = store->open_input(store, fname);
         | 
| 423 | 
            +
                tvr_check_valid_format(is);
         | 
| 424 | 
            +
                tvr->size = is_length(is)/8;
         | 
| 410 425 |  | 
| 411 | 
            -
             | 
| 412 | 
            -
             | 
| 413 | 
            -
             | 
| 426 | 
            +
                strcpy(fname + segment_len, TVD_EXTENSION);
         | 
| 427 | 
            +
                is = tvr->tvd = store->open_input(store, fname);
         | 
| 428 | 
            +
                tvr->tvd_format = tvr_check_valid_format(is);
         | 
| 414 429 |  | 
| 415 | 
            -
             | 
| 416 | 
            -
             | 
| 417 | 
            -
             | 
| 430 | 
            +
                strcpy(fname + segment_len, TVF_EXTENSION);
         | 
| 431 | 
            +
                is = tvr->tvf = store->open_input(store, fname);
         | 
| 432 | 
            +
                tvr->tvf_format = tvr_check_valid_format(is);
         | 
| 418 433 |  | 
| 419 | 
            -
             | 
| 434 | 
            +
                tvr->fis = fis;
         | 
| 435 | 
            +
              }
         | 
| 420 436 | 
             
              return tvr;
         | 
| 421 437 | 
             
            }
         | 
| 422 438 |  | 
| @@ -426,9 +442,15 @@ void tvr_close(TermVectorsReader *tvr) | |
| 426 442 | 
             
               * exception, everything else will also be closed.  */
         | 
| 427 443 | 
             
              TRY
         | 
| 428 444 | 
             
              XFINALLY
         | 
| 429 | 
            -
                 | 
| 430 | 
            -
             | 
| 431 | 
            -
                 | 
| 445 | 
            +
                if (tvr->tvx) {
         | 
| 446 | 
            +
                  is_close(tvr->tvx);
         | 
| 447 | 
            +
                }
         | 
| 448 | 
            +
                if (tvr->tvd) {
         | 
| 449 | 
            +
                  is_close(tvr->tvd);
         | 
| 450 | 
            +
                }
         | 
| 451 | 
            +
                if (tvr->tvf) {
         | 
| 452 | 
            +
                  is_close(tvr->tvf);
         | 
| 453 | 
            +
                }
         | 
| 432 454 | 
             
                free(tvr);
         | 
| 433 455 | 
             
              XENDTRY
         | 
| 434 456 | 
             
            }
         | 
| @@ -436,17 +458,29 @@ void tvr_close(TermVectorsReader *tvr) | |
| 436 458 | 
             
            TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
         | 
| 437 459 | 
             
                char *field, int tvf_pointer)
         | 
| 438 460 | 
             
            {
         | 
| 439 | 
            -
              int i, j, store_positions, store_offsets, bits;
         | 
| 440 | 
            -
               | 
| 441 | 
            -
               | 
| 442 | 
            -
               | 
| 461 | 
            +
              int i, j, store_positions, store_offsets, bits, num_terms;
         | 
| 462 | 
            +
              char **terms;
         | 
| 463 | 
            +
              int *term_freqs;
         | 
| 464 | 
            +
              
         | 
| 465 | 
            +
              /*  we may not need these, but declare them */
         | 
| 466 | 
            +
              int **positions = NULL; 
         | 
| 467 | 
            +
              TVOffsetInfo ***offsets = NULL;
         | 
| 468 | 
            +
              int start, delta_length, total_length, freq, prev_pos;
         | 
| 469 | 
            +
              int start_offset, end_offset, prev_offset;
         | 
| 470 | 
            +
              int *pos;
         | 
| 471 | 
            +
              TVOffsetInfo **offs;
         | 
| 472 | 
            +
              char buffer[MAX_WORD_SIZE] = "";
         | 
| 473 | 
            +
              
         | 
| 474 | 
            +
              /* Now read the data from specified position. We don't need to offset
         | 
| 475 | 
            +
               * offset by the FORMAT here since the pointer already includes the offset */
         | 
| 443 476 | 
             
              is_seek(tvr->tvf, tvf_pointer);
         | 
| 444 | 
            -
             | 
| 445 | 
            -
               | 
| 446 | 
            -
               | 
| 447 | 
            -
             | 
| 448 | 
            -
              if (num_terms == 0)
         | 
| 477 | 
            +
              num_terms = (int)is_read_vint(tvr->tvf);
         | 
| 478 | 
            +
              
         | 
| 479 | 
            +
              /* If no terms - return a constant empty termvector. However, this should
         | 
| 480 | 
            +
               * never occur! */
         | 
| 481 | 
            +
              if (num_terms == 0) {
         | 
| 449 482 | 
             
                return tv_create(field, NULL, 0, NULL, NULL, NULL);
         | 
| 483 | 
            +
              }
         | 
| 450 484 |  | 
| 451 485 | 
             
              if(tvr->tvf_format == FORMAT_VERSION) {
         | 
| 452 486 | 
             
                bits = is_read_byte(tvr->tvf);
         | 
| @@ -458,41 +492,34 @@ TermVector *tvr_read_term_vector(TermVectorsReader *tvr, | |
| 458 492 | 
             
                store_offsets = false;
         | 
| 459 493 | 
             
              }
         | 
| 460 494 |  | 
| 461 | 
            -
               | 
| 462 | 
            -
               | 
| 463 | 
            -
              
         | 
| 464 | 
            -
              //  we may not need these, but declare them
         | 
| 465 | 
            -
              int **positions = NULL; 
         | 
| 466 | 
            -
              TVOffsetInfo ***offsets = NULL;
         | 
| 495 | 
            +
              terms = ALLOC_N(char *, num_terms);
         | 
| 496 | 
            +
              term_freqs = ALLOC_N(int, num_terms);
         | 
| 467 497 |  | 
| 468 | 
            -
              if(store_positions)
         | 
| 498 | 
            +
              if (store_positions) {
         | 
| 469 499 | 
             
                positions = ALLOC_N(int *, num_terms);
         | 
| 500 | 
            +
              }
         | 
| 470 501 |  | 
| 471 | 
            -
              if(store_offsets)
         | 
| 502 | 
            +
              if (store_offsets) {
         | 
| 472 503 | 
             
                offsets = ALLOC_N(TVOffsetInfo **, num_terms);
         | 
| 504 | 
            +
              }
         | 
| 473 505 |  | 
| 474 | 
            -
              int start, delta_length, total_length, freq, prev_pos;
         | 
| 475 | 
            -
              int start_offset, end_offset, prev_offset;
         | 
| 476 | 
            -
              int *pos;
         | 
| 477 | 
            -
              TVOffsetInfo **offs;
         | 
| 478 | 
            -
              char buffer[MAX_WORD_SIZE] = "";
         | 
| 479 506 |  | 
| 480 507 | 
             
              for (i = 0; i < num_terms; i++) {
         | 
| 481 | 
            -
                start = is_read_vint(tvr->tvf);
         | 
| 482 | 
            -
                delta_length = is_read_vint(tvr->tvf);
         | 
| 508 | 
            +
                start = (int)is_read_vint(tvr->tvf);
         | 
| 509 | 
            +
                delta_length = (int)is_read_vint(tvr->tvf);
         | 
| 483 510 | 
             
                total_length = start + delta_length;
         | 
| 484 511 | 
             
                is_read_chars(tvr->tvf, buffer, start, delta_length);
         | 
| 485 512 | 
             
                buffer[total_length] = '\0';
         | 
| 486 513 | 
             
                terms[i] = estrdup(buffer);
         | 
| 487 | 
            -
                freq = is_read_vint(tvr->tvf);
         | 
| 514 | 
            +
                freq = (int)is_read_vint(tvr->tvf);
         | 
| 488 515 | 
             
                term_freqs[i] = freq;
         | 
| 489 516 |  | 
| 490 | 
            -
                if (store_positions) { | 
| 517 | 
            +
                if (store_positions) {/* read in the positions */
         | 
| 491 518 | 
             
                  pos = ALLOC_N(int, freq);
         | 
| 492 519 | 
             
                  positions[i] = pos;
         | 
| 493 520 | 
             
                  prev_pos = 0;
         | 
| 494 521 | 
             
                  for (j = 0; j < freq; j++) {
         | 
| 495 | 
            -
                    pos[j] = prev_pos + is_read_vint(tvr->tvf);
         | 
| 522 | 
            +
                    pos[j] = prev_pos + (int)is_read_vint(tvr->tvf);
         | 
| 496 523 | 
             
                    prev_pos = pos[j];
         | 
| 497 524 | 
             
                  }
         | 
| 498 525 | 
             
                }
         | 
| @@ -502,8 +529,8 @@ TermVector *tvr_read_term_vector(TermVectorsReader *tvr, | |
| 502 529 | 
             
                  offsets[i] = offs;
         | 
| 503 530 | 
             
                  prev_offset = 0;
         | 
| 504 531 | 
             
                  for (j = 0; j < freq; j++) {
         | 
| 505 | 
            -
                    start_offset = prev_offset + is_read_vint(tvr->tvf);
         | 
| 506 | 
            -
                    end_offset = start_offset + is_read_vint(tvr->tvf);
         | 
| 532 | 
            +
                    start_offset = prev_offset + (int)is_read_vint(tvr->tvf);
         | 
| 533 | 
            +
                    end_offset = start_offset + (int)is_read_vint(tvr->tvf);
         | 
| 507 534 | 
             
                    offs[j] = tvoi_create(start_offset, end_offset);
         | 
| 508 535 | 
             
                    prev_offset = end_offset;
         | 
| 509 536 | 
             
                  }
         | 
| @@ -516,39 +543,41 @@ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num) | |
| 516 543 | 
             
            {
         | 
| 517 544 | 
             
              int i;
         | 
| 518 545 | 
             
              Array *tvs = NULL;
         | 
| 519 | 
            -
               | 
| 546 | 
            +
              /* Check if no term vectors are available for this segment at all */
         | 
| 520 547 | 
             
              if (tvr->tvx != NULL) {
         | 
| 521 | 
            -
                 | 
| 548 | 
            +
                int position, field_count;
         | 
| 549 | 
            +
                /* We need to offset by */
         | 
| 522 550 | 
             
                is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
         | 
| 523 551 |  | 
| 524 | 
            -
                 | 
| 552 | 
            +
                position = (int)is_read_long(tvr->tvx);
         | 
| 525 553 |  | 
| 526 554 | 
             
                is_seek(tvr->tvd, position);
         | 
| 527 | 
            -
                 | 
| 555 | 
            +
                field_count = (int)is_read_vint(tvr->tvd);
         | 
| 528 556 |  | 
| 529 | 
            -
                 | 
| 557 | 
            +
                /* No fields are vectorized for this document */
         | 
| 530 558 | 
             
                if (field_count > 0) {
         | 
| 531 559 | 
             
                  int number = 0;
         | 
| 560 | 
            +
                  int position = 0;
         | 
| 561 | 
            +
                  int *tvf_pointers = ALLOC_N(int, field_count);
         | 
| 532 562 | 
             
                  char **fields = ALLOC_N(char *, field_count);
         | 
| 533 563 |  | 
| 534 564 | 
             
                  for (i = 0; i < field_count; i++) {
         | 
| 535 | 
            -
                    if (tvr->tvd_format == FORMAT_VERSION)
         | 
| 536 | 
            -
                      number = is_read_vint(tvr->tvd);
         | 
| 537 | 
            -
                    else
         | 
| 538 | 
            -
                      number += is_read_vint(tvr->tvd);
         | 
| 565 | 
            +
                    if (tvr->tvd_format == FORMAT_VERSION) {
         | 
| 566 | 
            +
                      number = (int)is_read_vint(tvr->tvd);
         | 
| 567 | 
            +
                    } else {
         | 
| 568 | 
            +
                      number += (int)is_read_vint(tvr->tvd);
         | 
| 569 | 
            +
                    }
         | 
| 539 570 |  | 
| 540 571 | 
             
                    fields[i] = tvr->fis->by_number[number]->name;
         | 
| 541 572 | 
             
                  }
         | 
| 542 573 |  | 
| 543 | 
            -
                   | 
| 544 | 
            -
                  int position = 0;
         | 
| 545 | 
            -
                  int *tvf_pointers = ALLOC_N(int, field_count);
         | 
| 574 | 
            +
                  /* Compute position in the tvf file */
         | 
| 546 575 | 
             
                  for (i = 0; i < field_count; i++) {
         | 
| 547 | 
            -
                    position += is_read_vint(tvr->tvd);
         | 
| 576 | 
            +
                    position += (int)is_read_vint(tvr->tvd);
         | 
| 548 577 | 
             
                    tvf_pointers[i] = position;
         | 
| 549 578 | 
             
                  }
         | 
| 550 579 |  | 
| 551 | 
            -
                  tvs = ary_create(field_count, &tv_destroy);
         | 
| 580 | 
            +
                  tvs = ary_create(field_count, (free_ft)&tv_destroy);
         | 
| 552 581 | 
             
                  for (i = 0; i < field_count; i++) {
         | 
| 553 582 | 
             
                    ary_append(tvs, tvr_read_term_vector(tvr, fields[i], tvf_pointers[i]));
         | 
| 554 583 | 
             
                  }
         | 
| @@ -562,45 +591,45 @@ Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num) | |
| 562 591 | 
             
            TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field)
         | 
| 563 592 | 
             
            {
         | 
| 564 593 | 
             
              int i;
         | 
| 565 | 
            -
               | 
| 566 | 
            -
              int field_number = fis_get_number(tvr->fis, field);
         | 
| 594 | 
            +
              /* Check if no term vectors are available for this segment at all */
         | 
| 595 | 
            +
              int field_number = (int)fis_get_number(tvr->fis, field);
         | 
| 567 596 | 
             
              TermVector *tv = NULL;
         | 
| 568 597 |  | 
| 569 598 | 
             
              if (tvr->tvx != NULL) {
         | 
| 570 | 
            -
                 | 
| 571 | 
            -
                 | 
| 572 | 
            -
             | 
| 599 | 
            +
                int pos, field_count, number = 0, found = -1;
         | 
| 600 | 
            +
                /* We need to account for the FORMAT_SIZE at when seeking in the @tvx
         | 
| 601 | 
            +
                 * We don't need to do this in other seeks because we already have the
         | 
| 602 | 
            +
                 * file pointer that was written in another file */
         | 
| 573 603 | 
             
                is_seek(tvr->tvx, (doc_num * 8) + FORMAT_SIZE);
         | 
| 574 | 
            -
                // | 
| 575 | 
            -
                 | 
| 604 | 
            +
                //printf("TVX Pointer: %d\n", is_pos(tvr->tvx));
         | 
| 605 | 
            +
                pos = (int)is_read_long(tvr->tvx);
         | 
| 576 606 |  | 
| 577 607 | 
             
                is_seek(tvr->tvd, pos);
         | 
| 578 | 
            -
                 | 
| 579 | 
            -
                // | 
| 580 | 
            -
                 | 
| 581 | 
            -
             | 
| 582 | 
            -
             | 
| 583 | 
            -
                int number = 0;
         | 
| 584 | 
            -
                int found = -1;
         | 
| 585 | 
            -
             | 
| 608 | 
            +
                field_count = (int)is_read_vint(tvr->tvd);
         | 
| 609 | 
            +
                //printf("Num Fields: %d\n", field_count);
         | 
| 610 | 
            +
                /* There are only a few fields per document. We opt for a full scan
         | 
| 611 | 
            +
                 * rather than requiring that they be ordered. We need to read through
         | 
| 612 | 
            +
                 * all of the fields anyway to get to the tvf pointers. */
         | 
| 586 613 | 
             
                for (i = 0; i < field_count; i++) {
         | 
| 587 | 
            -
                  if (tvr->tvd_format == FORMAT_VERSION)
         | 
| 588 | 
            -
                    number = is_read_vint(tvr->tvd);
         | 
| 589 | 
            -
                  else
         | 
| 590 | 
            -
                    number += is_read_vint(tvr->tvd);
         | 
| 614 | 
            +
                  if (tvr->tvd_format == FORMAT_VERSION) {
         | 
| 615 | 
            +
                    number = (int)is_read_vint(tvr->tvd);
         | 
| 616 | 
            +
                  } else {
         | 
| 617 | 
            +
                    number += (int)is_read_vint(tvr->tvd);
         | 
| 618 | 
            +
                  }
         | 
| 591 619 |  | 
| 592 | 
            -
                  if (number == field_number)
         | 
| 620 | 
            +
                  if (number == field_number) {
         | 
| 593 621 | 
             
                    found = i;
         | 
| 622 | 
            +
                  }
         | 
| 594 623 | 
             
                }
         | 
| 595 624 |  | 
| 596 | 
            -
                 | 
| 597 | 
            -
             | 
| 625 | 
            +
                /* This field, although valid in the segment, was not found in this
         | 
| 626 | 
            +
                 * document */
         | 
| 598 627 | 
             
                if (found != -1) {
         | 
| 599 | 
            -
                   | 
| 628 | 
            +
                  /* Compute pos in the tvf file */
         | 
| 600 629 | 
             
                  pos = 0;
         | 
| 601 | 
            -
                  for (i = 0; i <= found; i++)
         | 
| 602 | 
            -
                    pos += is_read_vint(tvr->tvd);
         | 
| 603 | 
            -
             | 
| 630 | 
            +
                  for (i = 0; i <= found; i++) {
         | 
| 631 | 
            +
                    pos += (int)is_read_vint(tvr->tvd);
         | 
| 632 | 
            +
                  }
         | 
| 604 633 | 
             
                  tv = tvr_read_term_vector(tvr, field, pos);
         | 
| 605 634 | 
             
                }
         | 
| 606 635 | 
             
              }
         |