ferret 0.9.4 → 0.9.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README +1 -1
- data/Rakefile +1 -0
- data/ext/field.c +87 -87
- data/ext/index.h +253 -255
- data/ext/index_io.c +15 -6
- data/ext/index_rw.c +6 -0
- data/ext/nix_io.c +4 -6
- data/ext/q_boolean.c +0 -6
- data/ext/q_fuzzy.c +10 -7
- data/ext/q_multi_phrase.c +2 -2
- data/ext/q_term.c +2 -2
- data/ext/q_wildcard.c +5 -4
- data/ext/search.c +3 -5
- data/ext/search.h +439 -400
- data/ext/store.h +1 -0
- data/ext/termdocs.c +3 -7
- data/ext/vector.c +1 -1
- data/lib/ferret.rb +1 -1
- data/lib/ferret/store/ram_store.rb +5 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/index/tc_index_reader.rb +6 -1
- data/test/unit/search/tc_search_and_sort.rb +1 -1
- data/test/unit/store/tc_fs_store.rb +1 -1
- metadata +4 -4
data/README
CHANGED
@@ -26,7 +26,7 @@ Run the following;
|
|
26
26
|
$ rake ext
|
27
27
|
$ ruby setup.rb config
|
28
28
|
$ ruby setup.rb setup
|
29
|
-
# ruby setup.rb install
|
29
|
+
# sudo ruby setup.rb install
|
30
30
|
|
31
31
|
These simple steps install ferret in the default location of Ruby libraries.
|
32
32
|
You can also install files into your favorite directory by supplying setup.rb
|
data/Rakefile
CHANGED
@@ -211,6 +211,7 @@ else
|
|
211
211
|
#### Load-time details: library and application (you will need one or both).
|
212
212
|
|
213
213
|
s.require_path = 'lib' # Use these for libraries.
|
214
|
+
s.autorequire = 'ferret'
|
214
215
|
|
215
216
|
#s.bindir = "bin" # Use these for applications.
|
216
217
|
#s.executables = ["rake"]
|
data/ext/field.c
CHANGED
@@ -213,93 +213,6 @@ FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc)
|
|
213
213
|
return fis;
|
214
214
|
}
|
215
215
|
|
216
|
-
/****************************************************************************
|
217
|
-
*
|
218
|
-
* FieldsWriter
|
219
|
-
*
|
220
|
-
****************************************************************************/
|
221
|
-
|
222
|
-
FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis)
|
223
|
-
{
|
224
|
-
FieldsWriter *fw = ALLOC(FieldsWriter);
|
225
|
-
char buf[SEGMENT_NAME_MAX_LENGTH];
|
226
|
-
int slen = (int)strlen(segment);
|
227
|
-
|
228
|
-
strcpy(buf, segment);
|
229
|
-
|
230
|
-
fw->fis = fis;
|
231
|
-
strcpy(buf+slen, ".fdt");
|
232
|
-
fw->fields_out = store->create_output(store, buf);
|
233
|
-
strcpy(buf+slen, ".fdx");
|
234
|
-
fw->index_out = store->create_output(store, buf);
|
235
|
-
return fw;
|
236
|
-
}
|
237
|
-
|
238
|
-
void fw_close(FieldsWriter *fw)
|
239
|
-
{
|
240
|
-
os_close(fw->fields_out);
|
241
|
-
os_close(fw->index_out);
|
242
|
-
free(fw);
|
243
|
-
}
|
244
|
-
|
245
|
-
void save_data(OutStream *fout, char *data, int dlen)
|
246
|
-
{
|
247
|
-
os_write_vint(fout, dlen);
|
248
|
-
os_write_bytes(fout, (uchar *)data, dlen);
|
249
|
-
}
|
250
|
-
|
251
|
-
void fw_add_doc(FieldsWriter *fw, Document *doc)
|
252
|
-
{
|
253
|
-
int i, bits;
|
254
|
-
DocField *df;
|
255
|
-
char *data;
|
256
|
-
int stored_count = 0;
|
257
|
-
OutStream *fout = fw->fields_out, *iout = fw->index_out;
|
258
|
-
|
259
|
-
os_write_long(iout, os_pos(fout));
|
260
|
-
|
261
|
-
for (i = 0; i < doc->dfcnt; i++) {
|
262
|
-
if (doc->df_arr[i]->is_stored)
|
263
|
-
stored_count++;
|
264
|
-
}
|
265
|
-
os_write_vint(fout, stored_count);
|
266
|
-
|
267
|
-
for (i = 0; i < doc->dfcnt; i++) {
|
268
|
-
df = doc->df_arr[i];
|
269
|
-
if (df->is_stored) {
|
270
|
-
os_write_vint(fout, ((FieldInfo *)ht_get(fw->fis->by_name, df->name))->number);
|
271
|
-
|
272
|
-
bits = 0;
|
273
|
-
if (df->is_tokenized) {
|
274
|
-
bits |= FIELD_IS_TOKENIZED;
|
275
|
-
}
|
276
|
-
if (df->is_binary) {
|
277
|
-
bits |= FIELD_IS_BINARY;
|
278
|
-
}
|
279
|
-
if (df->is_compressed) {
|
280
|
-
bits |= FIELD_IS_COMPRESSED;
|
281
|
-
}
|
282
|
-
os_write_byte(fout, bits);
|
283
|
-
|
284
|
-
data = NULL;
|
285
|
-
if (df->is_compressed) {
|
286
|
-
/* Not compressing just yet but we'll save it anyway */
|
287
|
-
if (df->is_binary) {
|
288
|
-
save_data(fout, df->data, df->blen);
|
289
|
-
} else {
|
290
|
-
os_write_string(fout, df->data);
|
291
|
-
}
|
292
|
-
} else {
|
293
|
-
if (df->is_binary) {
|
294
|
-
save_data(fout, df->data, df->blen);
|
295
|
-
} else {
|
296
|
-
os_write_string(fout, df->data);
|
297
|
-
}
|
298
|
-
}
|
299
|
-
}
|
300
|
-
}
|
301
|
-
}
|
302
|
-
|
303
216
|
/****************************************************************************
|
304
217
|
*
|
305
218
|
* FieldsReader
|
@@ -406,3 +319,90 @@ Document *fr_get_doc(FieldsReader *fr, int doc_num)
|
|
406
319
|
return doc;
|
407
320
|
}
|
408
321
|
|
322
|
+
/****************************************************************************
|
323
|
+
*
|
324
|
+
* FieldsWriter
|
325
|
+
*
|
326
|
+
****************************************************************************/
|
327
|
+
|
328
|
+
FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis)
|
329
|
+
{
|
330
|
+
FieldsWriter *fw = ALLOC(FieldsWriter);
|
331
|
+
char buf[SEGMENT_NAME_MAX_LENGTH];
|
332
|
+
int slen = (int)strlen(segment);
|
333
|
+
|
334
|
+
strcpy(buf, segment);
|
335
|
+
|
336
|
+
fw->fis = fis;
|
337
|
+
strcpy(buf+slen, ".fdt");
|
338
|
+
fw->fields_out = store->create_output(store, buf);
|
339
|
+
strcpy(buf+slen, ".fdx");
|
340
|
+
fw->index_out = store->create_output(store, buf);
|
341
|
+
return fw;
|
342
|
+
}
|
343
|
+
|
344
|
+
void fw_close(FieldsWriter *fw)
|
345
|
+
{
|
346
|
+
os_close(fw->fields_out);
|
347
|
+
os_close(fw->index_out);
|
348
|
+
free(fw);
|
349
|
+
}
|
350
|
+
|
351
|
+
void save_data(OutStream *fout, char *data, int dlen)
|
352
|
+
{
|
353
|
+
os_write_vint(fout, dlen);
|
354
|
+
os_write_bytes(fout, (uchar *)data, dlen);
|
355
|
+
}
|
356
|
+
|
357
|
+
void fw_add_doc(FieldsWriter *fw, Document *doc)
|
358
|
+
{
|
359
|
+
int i, bits;
|
360
|
+
DocField *df;
|
361
|
+
char *data;
|
362
|
+
int stored_count = 0;
|
363
|
+
OutStream *fout = fw->fields_out, *iout = fw->index_out;
|
364
|
+
|
365
|
+
os_write_long(iout, os_pos(fout));
|
366
|
+
|
367
|
+
for (i = 0; i < doc->dfcnt; i++) {
|
368
|
+
if (doc->df_arr[i]->is_stored)
|
369
|
+
stored_count++;
|
370
|
+
}
|
371
|
+
os_write_vint(fout, stored_count);
|
372
|
+
|
373
|
+
for (i = 0; i < doc->dfcnt; i++) {
|
374
|
+
df = doc->df_arr[i];
|
375
|
+
if (df->is_stored) {
|
376
|
+
os_write_vint(fout, ((FieldInfo *)ht_get(fw->fis->by_name, df->name))->number);
|
377
|
+
|
378
|
+
bits = 0;
|
379
|
+
if (df->is_tokenized) {
|
380
|
+
bits |= FIELD_IS_TOKENIZED;
|
381
|
+
}
|
382
|
+
if (df->is_binary) {
|
383
|
+
bits |= FIELD_IS_BINARY;
|
384
|
+
}
|
385
|
+
if (df->is_compressed) {
|
386
|
+
bits |= FIELD_IS_COMPRESSED;
|
387
|
+
}
|
388
|
+
os_write_byte(fout, bits);
|
389
|
+
|
390
|
+
data = NULL;
|
391
|
+
if (df->is_compressed) {
|
392
|
+
/* Not compressing just yet but we'll save it anyway */
|
393
|
+
if (df->is_binary) {
|
394
|
+
save_data(fout, df->data, df->blen);
|
395
|
+
} else {
|
396
|
+
os_write_string(fout, df->data);
|
397
|
+
}
|
398
|
+
} else {
|
399
|
+
if (df->is_binary) {
|
400
|
+
save_data(fout, df->data, df->blen);
|
401
|
+
} else {
|
402
|
+
os_write_string(fout, df->data);
|
403
|
+
}
|
404
|
+
}
|
405
|
+
}
|
406
|
+
}
|
407
|
+
}
|
408
|
+
|
data/ext/index.h
CHANGED
@@ -162,7 +162,6 @@ int ti_eq(TermInfo *ti1, TermInfo *ti2);
|
|
162
162
|
*
|
163
163
|
****************************************************************************/
|
164
164
|
|
165
|
-
typedef struct TermEnumFilter TermEnumFilter;
|
166
165
|
typedef struct TermEnum TermEnum;
|
167
166
|
struct TermEnum {
|
168
167
|
void *data;
|
@@ -301,10 +300,11 @@ void tvf_destroy(void *p);
|
|
301
300
|
****************************************************************************/
|
302
301
|
|
303
302
|
typedef struct TVTerm {
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
303
|
+
int field_num;
|
304
|
+
char *text;
|
305
|
+
int freq;
|
306
|
+
int *positions;
|
307
|
+
TVOffsetInfo **offsets;
|
308
308
|
} TVTerm;
|
309
309
|
|
310
310
|
TVTerm *tvt_create(char *text,
|
@@ -320,12 +320,12 @@ void tvt_destroy(void *p);
|
|
320
320
|
****************************************************************************/
|
321
321
|
|
322
322
|
typedef struct TermVector {
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
323
|
+
char *field;
|
324
|
+
char **terms;
|
325
|
+
int tcnt;
|
326
|
+
int *freqs;
|
327
|
+
int **positions;
|
328
|
+
TVOffsetInfo ***offsets;
|
329
329
|
} TermVector;
|
330
330
|
|
331
331
|
TermVector *tv_create(const char *field,
|
@@ -344,27 +344,27 @@ void tv_destroy(TermVector *tv);
|
|
344
344
|
|
345
345
|
#define STORE_POSITIONS_WITH_TERMVECTOR 0x1
|
346
346
|
#define STORE_OFFSET_WITH_TERMVECTOR 0x2
|
347
|
-
|
347
|
+
|
348
348
|
#define FORMAT_VERSION 2
|
349
349
|
#define FORMAT_SIZE 4
|
350
|
-
|
350
|
+
|
351
351
|
#define TVX_EXTENSION ".tvx"
|
352
352
|
#define TVD_EXTENSION ".tvd"
|
353
353
|
#define TVF_EXTENSION ".tvf"
|
354
354
|
|
355
355
|
typedef struct TermVectorsWriter {
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
356
|
+
TVField *curr_field;
|
357
|
+
int curr_doc_pointer;
|
358
|
+
OutStream *tvx;
|
359
|
+
OutStream *tvd;
|
360
|
+
OutStream *tvf;
|
361
|
+
FieldInfos *fis;
|
362
|
+
TVField **fields;
|
363
|
+
int fcnt;
|
364
|
+
int fsize;
|
365
|
+
TVTerm **terms;
|
366
|
+
int tcnt;
|
367
|
+
int tsize;
|
368
368
|
} TermVectorsWriter;
|
369
369
|
|
370
370
|
TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis);
|
@@ -384,23 +384,40 @@ void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors);
|
|
384
384
|
****************************************************************************/
|
385
385
|
|
386
386
|
typedef struct TermVectorsReader {
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
387
|
+
int size;
|
388
|
+
InStream *tvx;
|
389
|
+
InStream *tvd;
|
390
|
+
InStream *tvf;
|
391
|
+
FieldInfos *fis;
|
392
|
+
int tvd_format;
|
393
|
+
int tvf_format;
|
394
394
|
} TermVectorsReader;
|
395
395
|
|
396
396
|
TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis);
|
397
397
|
TermVectorsReader *tvr_clone(TermVectorsReader *orig);
|
398
398
|
void tvr_close(TermVectorsReader *tvr);
|
399
399
|
TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
|
400
|
-
|
400
|
+
char *field, int tvf_pointer);
|
401
401
|
Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
|
402
402
|
TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
|
403
403
|
|
404
|
+
/****************************************************************************
|
405
|
+
*
|
406
|
+
* FieldsReader
|
407
|
+
*
|
408
|
+
****************************************************************************/
|
409
|
+
|
410
|
+
typedef struct FieldsReader {
|
411
|
+
int len;
|
412
|
+
FieldInfos *fis;
|
413
|
+
InStream *fields_in;
|
414
|
+
InStream *index_in;
|
415
|
+
} FieldsReader;
|
416
|
+
|
417
|
+
FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
|
418
|
+
void fr_close(FieldsReader *fr);
|
419
|
+
Document *fr_get_doc(FieldsReader *fr, int doc_num);
|
420
|
+
|
404
421
|
/****************************************************************************
|
405
422
|
*
|
406
423
|
* FieldsWriter
|
@@ -412,9 +429,9 @@ TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
|
|
412
429
|
#define FIELD_IS_COMPRESSED 0X4
|
413
430
|
|
414
431
|
typedef struct FieldsWriter {
|
415
|
-
|
416
|
-
|
417
|
-
|
432
|
+
FieldInfos *fis;
|
433
|
+
OutStream *fields_out;
|
434
|
+
OutStream *index_out;
|
418
435
|
} FieldsWriter;
|
419
436
|
|
420
437
|
FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis);
|
@@ -429,15 +446,15 @@ void fw_add_doc(FieldsWriter *fw, Document *doc);
|
|
429
446
|
|
430
447
|
typedef struct TermDocEnum TermDocEnum;
|
431
448
|
struct TermDocEnum {
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
449
|
+
void *data;
|
450
|
+
void (*seek)(TermDocEnum *tde, Term *term);
|
451
|
+
int (*doc_num)(TermDocEnum *tde);
|
452
|
+
int (*freq)(TermDocEnum *tde);
|
453
|
+
bool (*next)(TermDocEnum *tde);
|
454
|
+
int (*read)(TermDocEnum *tde, int *docs, int *freqs, int req_num);
|
455
|
+
bool (*skip_to)(TermDocEnum *tde, int target);
|
456
|
+
int (*next_position)(TermDocEnum *tde);
|
457
|
+
void (*close)(TermDocEnum *tde);
|
441
458
|
};
|
442
459
|
|
443
460
|
/* * SegmentTermDocEnum * */
|
@@ -445,27 +462,27 @@ struct TermDocEnum {
|
|
445
462
|
typedef struct SegmentTermDocEnum SegmentTermDocEnum;
|
446
463
|
|
447
464
|
struct SegmentTermDocEnum {
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
465
|
+
SegmentReader *parent;
|
466
|
+
InStream *freq_in;
|
467
|
+
int count; /* number of docs for this term skipped */
|
468
|
+
int doc_freq; /* number of doc this term appears in */
|
469
|
+
BitVector *deleted_docs;
|
470
|
+
int doc_num;
|
471
|
+
int freq;
|
472
|
+
int skip_interval;
|
473
|
+
int num_skips;
|
474
|
+
int skip_count;
|
475
|
+
InStream *skip_in;
|
476
|
+
int skip_doc;
|
477
|
+
int freq_pointer;
|
478
|
+
int prox_pointer;
|
479
|
+
int skip_pointer;
|
480
|
+
unsigned int have_skipped : 1;
|
481
|
+
void (*skip_prox)(SegmentTermDocEnum *stde);
|
482
|
+
InStream *prox_in;
|
483
|
+
int prox_cnt;
|
484
|
+
int position;
|
485
|
+
void (*seek_prox)(SegmentTermDocEnum *stde, int prox_pointer);
|
469
486
|
};
|
470
487
|
|
471
488
|
TermDocEnum *stde_create(IndexReader *ir);
|
@@ -477,15 +494,15 @@ TermDocEnum *stpe_create(IndexReader *ir);
|
|
477
494
|
/* * MultiTermDocEnum * */
|
478
495
|
typedef struct MultiTermDocEnum MultiTermDocEnum;
|
479
496
|
struct MultiTermDocEnum {
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
497
|
+
IndexReader **irs;
|
498
|
+
int *starts;
|
499
|
+
int ir_cnt;
|
500
|
+
Term *term;
|
501
|
+
int base;
|
502
|
+
int pointer;
|
503
|
+
TermDocEnum **irs_tde;
|
504
|
+
TermDocEnum *curr_tde;
|
505
|
+
TermDocEnum *(*term_docs_from_reader)(IndexReader *ir);
|
489
506
|
};
|
490
507
|
|
491
508
|
TermDocEnum *mtde_create(IndexReader **readers, int *starts, int ir_cnt);
|
@@ -499,33 +516,16 @@ TermDocEnum *mtpe_create(IndexReader **readers, int *starts, int ir_cnt);
|
|
499
516
|
|
500
517
|
#define MTDPE_POS_QUEUE_INIT_CAPA 8
|
501
518
|
typedef struct {
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
519
|
+
int doc_num;
|
520
|
+
int freq;
|
521
|
+
PriorityQueue *pq;
|
522
|
+
int *pos_queue;
|
523
|
+
int pos_queue_index;
|
524
|
+
int pos_queue_capa;
|
508
525
|
} MultipleTermDocPosEnum;
|
509
526
|
|
510
527
|
TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
|
511
528
|
|
512
|
-
/****************************************************************************
|
513
|
-
*
|
514
|
-
* FieldsReader
|
515
|
-
*
|
516
|
-
****************************************************************************/
|
517
|
-
|
518
|
-
typedef struct FieldsReader {
|
519
|
-
int len;
|
520
|
-
FieldInfos *fis;
|
521
|
-
InStream *fields_in;
|
522
|
-
InStream *index_in;
|
523
|
-
} FieldsReader;
|
524
|
-
|
525
|
-
FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
|
526
|
-
void fr_close(FieldsReader *fr);
|
527
|
-
Document *fr_get_doc(FieldsReader *fr, int doc_num);
|
528
|
-
|
529
529
|
/****************************************************************************
|
530
530
|
*
|
531
531
|
* Posting
|
@@ -533,11 +533,11 @@ Document *fr_get_doc(FieldsReader *fr, int doc_num);
|
|
533
533
|
****************************************************************************/
|
534
534
|
|
535
535
|
typedef struct Posting {
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
536
|
+
Term *term;
|
537
|
+
int freq;
|
538
|
+
int size;
|
539
|
+
int *positions;
|
540
|
+
TVOffsetInfo **offsets;
|
541
541
|
} Posting;
|
542
542
|
|
543
543
|
Posting *p_create(Term *term, int position, TVOffsetInfo *offset);
|
@@ -552,22 +552,22 @@ void p_add_occurance(Posting *self, int position, TVOffsetInfo *offset);
|
|
552
552
|
****************************************************************************/
|
553
553
|
|
554
554
|
typedef struct DocumentWriter {
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
555
|
+
Store *store;
|
556
|
+
Analyzer *analyzer;
|
557
|
+
Similarity *similarity;
|
558
|
+
HshTable *postingtable;
|
559
|
+
int pcnt;
|
560
|
+
FieldInfos *fis;
|
561
|
+
float *field_boosts;
|
562
|
+
int *field_lengths;
|
563
|
+
int *field_positions;
|
564
|
+
int *field_offsets;
|
565
|
+
int max_field_length;
|
566
|
+
int term_index_interval;
|
567
567
|
} DocumentWriter;
|
568
568
|
|
569
569
|
DocumentWriter *dw_open(Store *store, Analyzer *analyzer,
|
570
|
-
|
570
|
+
Similarity *similarity, int max_field_length, int term_index_interval);
|
571
571
|
void dw_close(DocumentWriter *dw);
|
572
572
|
void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
|
573
573
|
|
@@ -578,9 +578,9 @@ void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
|
|
578
578
|
****************************************************************************/
|
579
579
|
|
580
580
|
typedef struct SegmentInfo {
|
581
|
-
|
582
|
-
|
583
|
-
|
581
|
+
char *name;
|
582
|
+
int doc_cnt;
|
583
|
+
Store *store;
|
584
584
|
} SegmentInfo;
|
585
585
|
|
586
586
|
SegmentInfo *si_create(char *name, int doc_cnt, Store *store);
|
@@ -596,13 +596,13 @@ bool si_has_separate_norms(SegmentInfo *si);
|
|
596
596
|
****************************************************************************/
|
597
597
|
|
598
598
|
typedef struct SegmentInfos {
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
599
|
+
Store *store;
|
600
|
+
SegmentInfo **segs;
|
601
|
+
int scnt;
|
602
|
+
int size;
|
603
|
+
int counter;
|
604
|
+
int version;
|
605
|
+
int format;
|
606
606
|
} SegmentInfos;
|
607
607
|
|
608
608
|
SegmentInfos *sis_create();
|
@@ -622,65 +622,63 @@ int sis_read_current_version(Store *store);
|
|
622
622
|
****************************************************************************/
|
623
623
|
|
624
624
|
enum FIELD_TYPE {
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
625
|
+
/* all fields */
|
626
|
+
IR_ALL,
|
627
|
+
/* all indexed fields */
|
628
|
+
IR_INDEXED,
|
629
|
+
/* all fields which are not indexed */
|
630
|
+
IR_UNINDEXED,
|
631
|
+
/* all fields which are indexed with termvectors enabled */
|
632
|
+
IR_INDEXED_WITH_TERM_VECTOR,
|
633
|
+
/* all fields which are indexed but don't have termvectors enabled */
|
634
|
+
IR_INDEXED_NO_TERM_VECTOR,
|
635
|
+
/* all fields where termvectors are enabled. Please note that only standard */
|
636
|
+
/* termvector fields are returned */
|
637
|
+
IR_TERM_VECTOR,
|
638
|
+
/* all fields with termvectors with positions enabled */
|
639
|
+
IR_TERM_VECTOR_WITH_POSITION,
|
640
|
+
/* all fields where termvectors with offset position are set */
|
641
|
+
IR_TERM_VECTOR_WITH_OFFSET,
|
642
|
+
/* all fields where termvectors with offset and position values set */
|
643
|
+
IR_TERM_VECTOR_WITH_POSITION_OFFSET
|
644
644
|
};
|
645
645
|
|
646
646
|
struct IndexReader {
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
int (*write_fields_i)(IndexReader *ir, OutStream *fdt_out,
|
683
|
-
OutStream *fdx_out);
|
647
|
+
mutex_t mutex;
|
648
|
+
HshTable *cache;
|
649
|
+
HshTable *sort_cache;
|
650
|
+
void *data;
|
651
|
+
Store *store;
|
652
|
+
Lock *write_lock;
|
653
|
+
SegmentInfos *sis;
|
654
|
+
bool has_changes : 1;
|
655
|
+
bool is_stale : 1;
|
656
|
+
bool is_owner : 1;
|
657
|
+
TermVector *(*get_term_vector)(IndexReader *ir, int doc_num, char *field);
|
658
|
+
Array *(*get_term_vectors)(IndexReader *ir, int doc_num);
|
659
|
+
int (*num_docs)(IndexReader *ir);
|
660
|
+
int (*max_doc)(IndexReader *ir);
|
661
|
+
Document *(*get_doc)(IndexReader *ir, int doc_num);
|
662
|
+
uchar *(*get_norms)(IndexReader *ir, char *field);
|
663
|
+
uchar *(*get_norms_always)(IndexReader *ir, char *field);
|
664
|
+
void (*do_set_norm)(IndexReader *ir, int doc_num, char *field,
|
665
|
+
uchar val);
|
666
|
+
void (*get_norms_into)(IndexReader *ir, char *field, uchar *buf,
|
667
|
+
int offset);
|
668
|
+
TermEnum *(*terms)(IndexReader *ir);
|
669
|
+
TermEnum *(*terms_from)(IndexReader *ir, Term *term);
|
670
|
+
int (*doc_freq)(IndexReader *ir, Term *t);
|
671
|
+
TermDocEnum *(*term_docs)(IndexReader *ir);
|
672
|
+
TermDocEnum *(*term_positions)(IndexReader *ir);
|
673
|
+
void (*do_delete_doc)(IndexReader *ir, int doc_num);
|
674
|
+
void (*do_undelete_all)(IndexReader *ir);
|
675
|
+
bool (*is_deleted)(IndexReader *ir, int doc_num);
|
676
|
+
bool (*has_deletions)(IndexReader *ir);
|
677
|
+
bool (*has_norms)(IndexReader *ir, char *field);
|
678
|
+
HashSet *(*get_field_names)(IndexReader *ir, int field_type);
|
679
|
+
void (*do_commit)(IndexReader *ir);
|
680
|
+
void (*do_close)(IndexReader *ir);
|
681
|
+
void (*acquire_write_lock)(IndexReader *ir);
|
684
682
|
};
|
685
683
|
|
686
684
|
IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner);
|
@@ -705,10 +703,10 @@ bool ir_is_latest(IndexReader *ir);
|
|
705
703
|
****************************************************************************/
|
706
704
|
|
707
705
|
typedef struct Norm {
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
706
|
+
bool is_dirty : 1;
|
707
|
+
int field_num;
|
708
|
+
InStream *is;
|
709
|
+
uchar *bytes;
|
712
710
|
} Norm;
|
713
711
|
|
714
712
|
/****************************************************************************
|
@@ -718,22 +716,22 @@ typedef struct Norm {
|
|
718
716
|
****************************************************************************/
|
719
717
|
|
720
718
|
struct SegmentReader {
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
719
|
+
FieldInfos *fis;
|
720
|
+
FieldsReader *fr;
|
721
|
+
char *segment;
|
722
|
+
BitVector *deleted_docs;
|
723
|
+
bool deleted_docs_dirty : 1;
|
724
|
+
bool undelete_all : 1;
|
725
|
+
bool norms_dirty : 1;
|
726
|
+
InStream *freq_in;
|
727
|
+
InStream *prox_in;
|
728
|
+
TermInfosReader *tir;
|
729
|
+
TermVectorsReader *orig_tvr;
|
730
|
+
thread_key_t thread_tvr;
|
731
|
+
Array *tvr_bucket;
|
732
|
+
HshTable *norms;
|
733
|
+
Store *cfs_store;
|
734
|
+
uchar *fake_norms;
|
737
735
|
};
|
738
736
|
|
739
737
|
IndexReader *sr_open(SegmentInfos *sis, int si_num, bool is_owner);
|
@@ -746,17 +744,17 @@ IndexReader *sr_open_si(SegmentInfo *si);
|
|
746
744
|
****************************************************************************/
|
747
745
|
|
748
746
|
typedef struct MultiReader {
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
747
|
+
bool has_deletions : 1;
|
748
|
+
int max_doc;
|
749
|
+
int num_docs_cache;
|
750
|
+
int rcnt;
|
751
|
+
int *starts;
|
752
|
+
IndexReader **sub_readers;
|
753
|
+
HshTable *norms_cache;
|
756
754
|
} MultiReader;
|
757
755
|
|
758
756
|
IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
|
759
|
-
|
757
|
+
int rcnt);
|
760
758
|
|
761
759
|
/****************************************************************************
|
762
760
|
*
|
@@ -765,12 +763,12 @@ IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
|
|
765
763
|
****************************************************************************/
|
766
764
|
|
767
765
|
typedef struct SegmentMergeInfo {
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
766
|
+
int base;
|
767
|
+
IndexReader *ir;
|
768
|
+
TermEnum *te;
|
769
|
+
TermBuffer *tb;
|
770
|
+
TermDocEnum *postings;
|
771
|
+
int *doc_map;
|
774
772
|
} SegmentMergeInfo;
|
775
773
|
|
776
774
|
SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir);
|
@@ -785,24 +783,24 @@ bool smi_lt(SegmentMergeInfo *smi1, SegmentMergeInfo *smi2);
|
|
785
783
|
****************************************************************************/
|
786
784
|
|
787
785
|
typedef struct SegmentMerger {
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
786
|
+
Store *store;
|
787
|
+
char *name;
|
788
|
+
Array *readers;
|
789
|
+
FieldInfos *fis;
|
790
|
+
OutStream *freq_out;
|
791
|
+
OutStream *prox_out;
|
792
|
+
TermInfosWriter *tiw;
|
793
|
+
Term *terms_buf;
|
794
|
+
int terms_buf_pointer;
|
795
|
+
int terms_buf_size;
|
796
|
+
PriorityQueue *queue;
|
797
|
+
TermInfo *ti;
|
798
|
+
int term_index_interval;
|
799
|
+
OutStream *skip_buffer;
|
800
|
+
int skip_interval;
|
801
|
+
int last_skip_doc;
|
802
|
+
int last_skip_freq_pointer;
|
803
|
+
int last_skip_prox_pointer;
|
806
804
|
} SegmentMerger;
|
807
805
|
|
808
806
|
SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
|
@@ -821,25 +819,25 @@ Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
|
|
821
819
|
#define WRITE_LOCK_NAME "write"
|
822
820
|
#define COMMIT_LOCK_NAME "commit"
|
823
821
|
struct IndexWriter {
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
822
|
+
mutex_t mutex;
|
823
|
+
HshTable *postings;
|
824
|
+
FieldInfos *fis;
|
825
|
+
int merge_factor;
|
826
|
+
int min_merge_docs;
|
827
|
+
int max_merge_docs;
|
828
|
+
int max_field_length;
|
829
|
+
int term_index_interval;
|
830
|
+
Store *store;
|
831
|
+
Analyzer *analyzer;
|
832
|
+
Similarity *similarity;
|
833
|
+
SegmentInfos *sis;
|
834
|
+
Store *ram_store;
|
835
|
+
Lock *write_lock;
|
836
|
+
bool use_compound_file : 1;
|
839
837
|
};
|
840
838
|
|
841
839
|
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
842
|
-
|
840
|
+
bool create);
|
843
841
|
void iw_flush_ram_segments(IndexWriter *iw);
|
844
842
|
void iw_close(IndexWriter *iw);
|
845
843
|
int iw_doc_count(IndexWriter *iw);
|
@@ -855,11 +853,11 @@ void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt);
|
|
855
853
|
****************************************************************************/
|
856
854
|
|
857
855
|
typedef struct CompoundWriter {
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
856
|
+
Store *store;
|
857
|
+
const char *name;
|
858
|
+
HashSet *ids;
|
859
|
+
Array *file_entries;
|
860
|
+
bool merged;
|
863
861
|
} CompoundWriter;
|
864
862
|
|
865
863
|
CompoundWriter *open_cw(Store *store, char *name);
|