ferret 0.9.4 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +1 -1
- data/Rakefile +1 -0
- data/ext/field.c +87 -87
- data/ext/index.h +253 -255
- data/ext/index_io.c +15 -6
- data/ext/index_rw.c +6 -0
- data/ext/nix_io.c +4 -6
- data/ext/q_boolean.c +0 -6
- data/ext/q_fuzzy.c +10 -7
- data/ext/q_multi_phrase.c +2 -2
- data/ext/q_term.c +2 -2
- data/ext/q_wildcard.c +5 -4
- data/ext/search.c +3 -5
- data/ext/search.h +439 -400
- data/ext/store.h +1 -0
- data/ext/termdocs.c +3 -7
- data/ext/vector.c +1 -1
- data/lib/ferret.rb +1 -1
- data/lib/ferret/store/ram_store.rb +5 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/index/tc_index_reader.rb +6 -1
- data/test/unit/search/tc_search_and_sort.rb +1 -1
- data/test/unit/store/tc_fs_store.rb +1 -1
- metadata +4 -4
data/README
CHANGED
@@ -26,7 +26,7 @@ Run the following;
|
|
26
26
|
$ rake ext
|
27
27
|
$ ruby setup.rb config
|
28
28
|
$ ruby setup.rb setup
|
29
|
-
# ruby setup.rb install
|
29
|
+
# sudo ruby setup.rb install
|
30
30
|
|
31
31
|
These simple steps install ferret in the default location of Ruby libraries.
|
32
32
|
You can also install files into your favorite directory by supplying setup.rb
|
data/Rakefile
CHANGED
@@ -211,6 +211,7 @@ else
|
|
211
211
|
#### Load-time details: library and application (you will need one or both).
|
212
212
|
|
213
213
|
s.require_path = 'lib' # Use these for libraries.
|
214
|
+
s.autorequire = 'ferret'
|
214
215
|
|
215
216
|
#s.bindir = "bin" # Use these for applications.
|
216
217
|
#s.executables = ["rake"]
|
data/ext/field.c
CHANGED
@@ -213,93 +213,6 @@ FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc)
|
|
213
213
|
return fis;
|
214
214
|
}
|
215
215
|
|
216
|
-
/****************************************************************************
|
217
|
-
*
|
218
|
-
* FieldsWriter
|
219
|
-
*
|
220
|
-
****************************************************************************/
|
221
|
-
|
222
|
-
FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis)
|
223
|
-
{
|
224
|
-
FieldsWriter *fw = ALLOC(FieldsWriter);
|
225
|
-
char buf[SEGMENT_NAME_MAX_LENGTH];
|
226
|
-
int slen = (int)strlen(segment);
|
227
|
-
|
228
|
-
strcpy(buf, segment);
|
229
|
-
|
230
|
-
fw->fis = fis;
|
231
|
-
strcpy(buf+slen, ".fdt");
|
232
|
-
fw->fields_out = store->create_output(store, buf);
|
233
|
-
strcpy(buf+slen, ".fdx");
|
234
|
-
fw->index_out = store->create_output(store, buf);
|
235
|
-
return fw;
|
236
|
-
}
|
237
|
-
|
238
|
-
void fw_close(FieldsWriter *fw)
|
239
|
-
{
|
240
|
-
os_close(fw->fields_out);
|
241
|
-
os_close(fw->index_out);
|
242
|
-
free(fw);
|
243
|
-
}
|
244
|
-
|
245
|
-
void save_data(OutStream *fout, char *data, int dlen)
|
246
|
-
{
|
247
|
-
os_write_vint(fout, dlen);
|
248
|
-
os_write_bytes(fout, (uchar *)data, dlen);
|
249
|
-
}
|
250
|
-
|
251
|
-
void fw_add_doc(FieldsWriter *fw, Document *doc)
|
252
|
-
{
|
253
|
-
int i, bits;
|
254
|
-
DocField *df;
|
255
|
-
char *data;
|
256
|
-
int stored_count = 0;
|
257
|
-
OutStream *fout = fw->fields_out, *iout = fw->index_out;
|
258
|
-
|
259
|
-
os_write_long(iout, os_pos(fout));
|
260
|
-
|
261
|
-
for (i = 0; i < doc->dfcnt; i++) {
|
262
|
-
if (doc->df_arr[i]->is_stored)
|
263
|
-
stored_count++;
|
264
|
-
}
|
265
|
-
os_write_vint(fout, stored_count);
|
266
|
-
|
267
|
-
for (i = 0; i < doc->dfcnt; i++) {
|
268
|
-
df = doc->df_arr[i];
|
269
|
-
if (df->is_stored) {
|
270
|
-
os_write_vint(fout, ((FieldInfo *)ht_get(fw->fis->by_name, df->name))->number);
|
271
|
-
|
272
|
-
bits = 0;
|
273
|
-
if (df->is_tokenized) {
|
274
|
-
bits |= FIELD_IS_TOKENIZED;
|
275
|
-
}
|
276
|
-
if (df->is_binary) {
|
277
|
-
bits |= FIELD_IS_BINARY;
|
278
|
-
}
|
279
|
-
if (df->is_compressed) {
|
280
|
-
bits |= FIELD_IS_COMPRESSED;
|
281
|
-
}
|
282
|
-
os_write_byte(fout, bits);
|
283
|
-
|
284
|
-
data = NULL;
|
285
|
-
if (df->is_compressed) {
|
286
|
-
/* Not compressing just yet but we'll save it anyway */
|
287
|
-
if (df->is_binary) {
|
288
|
-
save_data(fout, df->data, df->blen);
|
289
|
-
} else {
|
290
|
-
os_write_string(fout, df->data);
|
291
|
-
}
|
292
|
-
} else {
|
293
|
-
if (df->is_binary) {
|
294
|
-
save_data(fout, df->data, df->blen);
|
295
|
-
} else {
|
296
|
-
os_write_string(fout, df->data);
|
297
|
-
}
|
298
|
-
}
|
299
|
-
}
|
300
|
-
}
|
301
|
-
}
|
302
|
-
|
303
216
|
/****************************************************************************
|
304
217
|
*
|
305
218
|
* FieldsReader
|
@@ -406,3 +319,90 @@ Document *fr_get_doc(FieldsReader *fr, int doc_num)
|
|
406
319
|
return doc;
|
407
320
|
}
|
408
321
|
|
322
|
+
/****************************************************************************
|
323
|
+
*
|
324
|
+
* FieldsWriter
|
325
|
+
*
|
326
|
+
****************************************************************************/
|
327
|
+
|
328
|
+
FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis)
|
329
|
+
{
|
330
|
+
FieldsWriter *fw = ALLOC(FieldsWriter);
|
331
|
+
char buf[SEGMENT_NAME_MAX_LENGTH];
|
332
|
+
int slen = (int)strlen(segment);
|
333
|
+
|
334
|
+
strcpy(buf, segment);
|
335
|
+
|
336
|
+
fw->fis = fis;
|
337
|
+
strcpy(buf+slen, ".fdt");
|
338
|
+
fw->fields_out = store->create_output(store, buf);
|
339
|
+
strcpy(buf+slen, ".fdx");
|
340
|
+
fw->index_out = store->create_output(store, buf);
|
341
|
+
return fw;
|
342
|
+
}
|
343
|
+
|
344
|
+
void fw_close(FieldsWriter *fw)
|
345
|
+
{
|
346
|
+
os_close(fw->fields_out);
|
347
|
+
os_close(fw->index_out);
|
348
|
+
free(fw);
|
349
|
+
}
|
350
|
+
|
351
|
+
void save_data(OutStream *fout, char *data, int dlen)
|
352
|
+
{
|
353
|
+
os_write_vint(fout, dlen);
|
354
|
+
os_write_bytes(fout, (uchar *)data, dlen);
|
355
|
+
}
|
356
|
+
|
357
|
+
void fw_add_doc(FieldsWriter *fw, Document *doc)
|
358
|
+
{
|
359
|
+
int i, bits;
|
360
|
+
DocField *df;
|
361
|
+
char *data;
|
362
|
+
int stored_count = 0;
|
363
|
+
OutStream *fout = fw->fields_out, *iout = fw->index_out;
|
364
|
+
|
365
|
+
os_write_long(iout, os_pos(fout));
|
366
|
+
|
367
|
+
for (i = 0; i < doc->dfcnt; i++) {
|
368
|
+
if (doc->df_arr[i]->is_stored)
|
369
|
+
stored_count++;
|
370
|
+
}
|
371
|
+
os_write_vint(fout, stored_count);
|
372
|
+
|
373
|
+
for (i = 0; i < doc->dfcnt; i++) {
|
374
|
+
df = doc->df_arr[i];
|
375
|
+
if (df->is_stored) {
|
376
|
+
os_write_vint(fout, ((FieldInfo *)ht_get(fw->fis->by_name, df->name))->number);
|
377
|
+
|
378
|
+
bits = 0;
|
379
|
+
if (df->is_tokenized) {
|
380
|
+
bits |= FIELD_IS_TOKENIZED;
|
381
|
+
}
|
382
|
+
if (df->is_binary) {
|
383
|
+
bits |= FIELD_IS_BINARY;
|
384
|
+
}
|
385
|
+
if (df->is_compressed) {
|
386
|
+
bits |= FIELD_IS_COMPRESSED;
|
387
|
+
}
|
388
|
+
os_write_byte(fout, bits);
|
389
|
+
|
390
|
+
data = NULL;
|
391
|
+
if (df->is_compressed) {
|
392
|
+
/* Not compressing just yet but we'll save it anyway */
|
393
|
+
if (df->is_binary) {
|
394
|
+
save_data(fout, df->data, df->blen);
|
395
|
+
} else {
|
396
|
+
os_write_string(fout, df->data);
|
397
|
+
}
|
398
|
+
} else {
|
399
|
+
if (df->is_binary) {
|
400
|
+
save_data(fout, df->data, df->blen);
|
401
|
+
} else {
|
402
|
+
os_write_string(fout, df->data);
|
403
|
+
}
|
404
|
+
}
|
405
|
+
}
|
406
|
+
}
|
407
|
+
}
|
408
|
+
|
data/ext/index.h
CHANGED
@@ -162,7 +162,6 @@ int ti_eq(TermInfo *ti1, TermInfo *ti2);
|
|
162
162
|
*
|
163
163
|
****************************************************************************/
|
164
164
|
|
165
|
-
typedef struct TermEnumFilter TermEnumFilter;
|
166
165
|
typedef struct TermEnum TermEnum;
|
167
166
|
struct TermEnum {
|
168
167
|
void *data;
|
@@ -301,10 +300,11 @@ void tvf_destroy(void *p);
|
|
301
300
|
****************************************************************************/
|
302
301
|
|
303
302
|
typedef struct TVTerm {
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
303
|
+
int field_num;
|
304
|
+
char *text;
|
305
|
+
int freq;
|
306
|
+
int *positions;
|
307
|
+
TVOffsetInfo **offsets;
|
308
308
|
} TVTerm;
|
309
309
|
|
310
310
|
TVTerm *tvt_create(char *text,
|
@@ -320,12 +320,12 @@ void tvt_destroy(void *p);
|
|
320
320
|
****************************************************************************/
|
321
321
|
|
322
322
|
typedef struct TermVector {
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
323
|
+
char *field;
|
324
|
+
char **terms;
|
325
|
+
int tcnt;
|
326
|
+
int *freqs;
|
327
|
+
int **positions;
|
328
|
+
TVOffsetInfo ***offsets;
|
329
329
|
} TermVector;
|
330
330
|
|
331
331
|
TermVector *tv_create(const char *field,
|
@@ -344,27 +344,27 @@ void tv_destroy(TermVector *tv);
|
|
344
344
|
|
345
345
|
#define STORE_POSITIONS_WITH_TERMVECTOR 0x1
|
346
346
|
#define STORE_OFFSET_WITH_TERMVECTOR 0x2
|
347
|
-
|
347
|
+
|
348
348
|
#define FORMAT_VERSION 2
|
349
349
|
#define FORMAT_SIZE 4
|
350
|
-
|
350
|
+
|
351
351
|
#define TVX_EXTENSION ".tvx"
|
352
352
|
#define TVD_EXTENSION ".tvd"
|
353
353
|
#define TVF_EXTENSION ".tvf"
|
354
354
|
|
355
355
|
typedef struct TermVectorsWriter {
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
356
|
+
TVField *curr_field;
|
357
|
+
int curr_doc_pointer;
|
358
|
+
OutStream *tvx;
|
359
|
+
OutStream *tvd;
|
360
|
+
OutStream *tvf;
|
361
|
+
FieldInfos *fis;
|
362
|
+
TVField **fields;
|
363
|
+
int fcnt;
|
364
|
+
int fsize;
|
365
|
+
TVTerm **terms;
|
366
|
+
int tcnt;
|
367
|
+
int tsize;
|
368
368
|
} TermVectorsWriter;
|
369
369
|
|
370
370
|
TermVectorsWriter *tvw_open(Store *store, char *segment, FieldInfos *fis);
|
@@ -384,23 +384,40 @@ void tvw_add_all_doc_vectors(TermVectorsWriter *tvw, Array *vectors);
|
|
384
384
|
****************************************************************************/
|
385
385
|
|
386
386
|
typedef struct TermVectorsReader {
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
387
|
+
int size;
|
388
|
+
InStream *tvx;
|
389
|
+
InStream *tvd;
|
390
|
+
InStream *tvf;
|
391
|
+
FieldInfos *fis;
|
392
|
+
int tvd_format;
|
393
|
+
int tvf_format;
|
394
394
|
} TermVectorsReader;
|
395
395
|
|
396
396
|
TermVectorsReader *tvr_open(Store *store, char *segment, FieldInfos *fis);
|
397
397
|
TermVectorsReader *tvr_clone(TermVectorsReader *orig);
|
398
398
|
void tvr_close(TermVectorsReader *tvr);
|
399
399
|
TermVector *tvr_read_term_vector(TermVectorsReader *tvr,
|
400
|
-
|
400
|
+
char *field, int tvf_pointer);
|
401
401
|
Array *tvr_get_tv(TermVectorsReader *tvr, int doc_num);
|
402
402
|
TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
|
403
403
|
|
404
|
+
/****************************************************************************
|
405
|
+
*
|
406
|
+
* FieldsReader
|
407
|
+
*
|
408
|
+
****************************************************************************/
|
409
|
+
|
410
|
+
typedef struct FieldsReader {
|
411
|
+
int len;
|
412
|
+
FieldInfos *fis;
|
413
|
+
InStream *fields_in;
|
414
|
+
InStream *index_in;
|
415
|
+
} FieldsReader;
|
416
|
+
|
417
|
+
FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
|
418
|
+
void fr_close(FieldsReader *fr);
|
419
|
+
Document *fr_get_doc(FieldsReader *fr, int doc_num);
|
420
|
+
|
404
421
|
/****************************************************************************
|
405
422
|
*
|
406
423
|
* FieldsWriter
|
@@ -412,9 +429,9 @@ TermVector *tvr_get_field_tv(TermVectorsReader *tvr, int doc_num, char *field);
|
|
412
429
|
#define FIELD_IS_COMPRESSED 0X4
|
413
430
|
|
414
431
|
typedef struct FieldsWriter {
|
415
|
-
|
416
|
-
|
417
|
-
|
432
|
+
FieldInfos *fis;
|
433
|
+
OutStream *fields_out;
|
434
|
+
OutStream *index_out;
|
418
435
|
} FieldsWriter;
|
419
436
|
|
420
437
|
FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis);
|
@@ -429,15 +446,15 @@ void fw_add_doc(FieldsWriter *fw, Document *doc);
|
|
429
446
|
|
430
447
|
typedef struct TermDocEnum TermDocEnum;
|
431
448
|
struct TermDocEnum {
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
449
|
+
void *data;
|
450
|
+
void (*seek)(TermDocEnum *tde, Term *term);
|
451
|
+
int (*doc_num)(TermDocEnum *tde);
|
452
|
+
int (*freq)(TermDocEnum *tde);
|
453
|
+
bool (*next)(TermDocEnum *tde);
|
454
|
+
int (*read)(TermDocEnum *tde, int *docs, int *freqs, int req_num);
|
455
|
+
bool (*skip_to)(TermDocEnum *tde, int target);
|
456
|
+
int (*next_position)(TermDocEnum *tde);
|
457
|
+
void (*close)(TermDocEnum *tde);
|
441
458
|
};
|
442
459
|
|
443
460
|
/* * SegmentTermDocEnum * */
|
@@ -445,27 +462,27 @@ struct TermDocEnum {
|
|
445
462
|
typedef struct SegmentTermDocEnum SegmentTermDocEnum;
|
446
463
|
|
447
464
|
struct SegmentTermDocEnum {
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
465
|
+
SegmentReader *parent;
|
466
|
+
InStream *freq_in;
|
467
|
+
int count; /* number of docs for this term skipped */
|
468
|
+
int doc_freq; /* number of doc this term appears in */
|
469
|
+
BitVector *deleted_docs;
|
470
|
+
int doc_num;
|
471
|
+
int freq;
|
472
|
+
int skip_interval;
|
473
|
+
int num_skips;
|
474
|
+
int skip_count;
|
475
|
+
InStream *skip_in;
|
476
|
+
int skip_doc;
|
477
|
+
int freq_pointer;
|
478
|
+
int prox_pointer;
|
479
|
+
int skip_pointer;
|
480
|
+
unsigned int have_skipped : 1;
|
481
|
+
void (*skip_prox)(SegmentTermDocEnum *stde);
|
482
|
+
InStream *prox_in;
|
483
|
+
int prox_cnt;
|
484
|
+
int position;
|
485
|
+
void (*seek_prox)(SegmentTermDocEnum *stde, int prox_pointer);
|
469
486
|
};
|
470
487
|
|
471
488
|
TermDocEnum *stde_create(IndexReader *ir);
|
@@ -477,15 +494,15 @@ TermDocEnum *stpe_create(IndexReader *ir);
|
|
477
494
|
/* * MultiTermDocEnum * */
|
478
495
|
typedef struct MultiTermDocEnum MultiTermDocEnum;
|
479
496
|
struct MultiTermDocEnum {
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
497
|
+
IndexReader **irs;
|
498
|
+
int *starts;
|
499
|
+
int ir_cnt;
|
500
|
+
Term *term;
|
501
|
+
int base;
|
502
|
+
int pointer;
|
503
|
+
TermDocEnum **irs_tde;
|
504
|
+
TermDocEnum *curr_tde;
|
505
|
+
TermDocEnum *(*term_docs_from_reader)(IndexReader *ir);
|
489
506
|
};
|
490
507
|
|
491
508
|
TermDocEnum *mtde_create(IndexReader **readers, int *starts, int ir_cnt);
|
@@ -499,33 +516,16 @@ TermDocEnum *mtpe_create(IndexReader **readers, int *starts, int ir_cnt);
|
|
499
516
|
|
500
517
|
#define MTDPE_POS_QUEUE_INIT_CAPA 8
|
501
518
|
typedef struct {
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
519
|
+
int doc_num;
|
520
|
+
int freq;
|
521
|
+
PriorityQueue *pq;
|
522
|
+
int *pos_queue;
|
523
|
+
int pos_queue_index;
|
524
|
+
int pos_queue_capa;
|
508
525
|
} MultipleTermDocPosEnum;
|
509
526
|
|
510
527
|
TermDocEnum *mtdpe_create(IndexReader *ir, Term **terms, int t_cnt);
|
511
528
|
|
512
|
-
/****************************************************************************
|
513
|
-
*
|
514
|
-
* FieldsReader
|
515
|
-
*
|
516
|
-
****************************************************************************/
|
517
|
-
|
518
|
-
typedef struct FieldsReader {
|
519
|
-
int len;
|
520
|
-
FieldInfos *fis;
|
521
|
-
InStream *fields_in;
|
522
|
-
InStream *index_in;
|
523
|
-
} FieldsReader;
|
524
|
-
|
525
|
-
FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis);
|
526
|
-
void fr_close(FieldsReader *fr);
|
527
|
-
Document *fr_get_doc(FieldsReader *fr, int doc_num);
|
528
|
-
|
529
529
|
/****************************************************************************
|
530
530
|
*
|
531
531
|
* Posting
|
@@ -533,11 +533,11 @@ Document *fr_get_doc(FieldsReader *fr, int doc_num);
|
|
533
533
|
****************************************************************************/
|
534
534
|
|
535
535
|
typedef struct Posting {
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
536
|
+
Term *term;
|
537
|
+
int freq;
|
538
|
+
int size;
|
539
|
+
int *positions;
|
540
|
+
TVOffsetInfo **offsets;
|
541
541
|
} Posting;
|
542
542
|
|
543
543
|
Posting *p_create(Term *term, int position, TVOffsetInfo *offset);
|
@@ -552,22 +552,22 @@ void p_add_occurance(Posting *self, int position, TVOffsetInfo *offset);
|
|
552
552
|
****************************************************************************/
|
553
553
|
|
554
554
|
typedef struct DocumentWriter {
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
555
|
+
Store *store;
|
556
|
+
Analyzer *analyzer;
|
557
|
+
Similarity *similarity;
|
558
|
+
HshTable *postingtable;
|
559
|
+
int pcnt;
|
560
|
+
FieldInfos *fis;
|
561
|
+
float *field_boosts;
|
562
|
+
int *field_lengths;
|
563
|
+
int *field_positions;
|
564
|
+
int *field_offsets;
|
565
|
+
int max_field_length;
|
566
|
+
int term_index_interval;
|
567
567
|
} DocumentWriter;
|
568
568
|
|
569
569
|
DocumentWriter *dw_open(Store *store, Analyzer *analyzer,
|
570
|
-
|
570
|
+
Similarity *similarity, int max_field_length, int term_index_interval);
|
571
571
|
void dw_close(DocumentWriter *dw);
|
572
572
|
void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
|
573
573
|
|
@@ -578,9 +578,9 @@ void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc);
|
|
578
578
|
****************************************************************************/
|
579
579
|
|
580
580
|
typedef struct SegmentInfo {
|
581
|
-
|
582
|
-
|
583
|
-
|
581
|
+
char *name;
|
582
|
+
int doc_cnt;
|
583
|
+
Store *store;
|
584
584
|
} SegmentInfo;
|
585
585
|
|
586
586
|
SegmentInfo *si_create(char *name, int doc_cnt, Store *store);
|
@@ -596,13 +596,13 @@ bool si_has_separate_norms(SegmentInfo *si);
|
|
596
596
|
****************************************************************************/
|
597
597
|
|
598
598
|
typedef struct SegmentInfos {
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
599
|
+
Store *store;
|
600
|
+
SegmentInfo **segs;
|
601
|
+
int scnt;
|
602
|
+
int size;
|
603
|
+
int counter;
|
604
|
+
int version;
|
605
|
+
int format;
|
606
606
|
} SegmentInfos;
|
607
607
|
|
608
608
|
SegmentInfos *sis_create();
|
@@ -622,65 +622,63 @@ int sis_read_current_version(Store *store);
|
|
622
622
|
****************************************************************************/
|
623
623
|
|
624
624
|
enum FIELD_TYPE {
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
625
|
+
/* all fields */
|
626
|
+
IR_ALL,
|
627
|
+
/* all indexed fields */
|
628
|
+
IR_INDEXED,
|
629
|
+
/* all fields which are not indexed */
|
630
|
+
IR_UNINDEXED,
|
631
|
+
/* all fields which are indexed with termvectors enables */
|
632
|
+
IR_INDEXED_WITH_TERM_VECTOR,
|
633
|
+
/* all fields which are indexed but don't have termvectors enabled */
|
634
|
+
IR_INDEXED_NO_TERM_VECTOR,
|
635
|
+
/* all fields where termvectors are enabled. Please note that only standard */
|
636
|
+
/* termvector fields are returned */
|
637
|
+
IR_TERM_VECTOR,
|
638
|
+
/* all field with termvectors wiht positions enabled */
|
639
|
+
IR_TERM_VECTOR_WITH_POSITION,
|
640
|
+
/* all fields where termvectors with offset position are set */
|
641
|
+
IR_TERM_VECTOR_WITH_OFFSET,
|
642
|
+
/* all fields where termvectors with offset and position values set */
|
643
|
+
IR_TERM_VECTOR_WITH_POSITION_OFFSET
|
644
644
|
};
|
645
645
|
|
646
646
|
struct IndexReader {
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
int (*write_fields_i)(IndexReader *ir, OutStream *fdt_out,
|
683
|
-
OutStream *fdx_out);
|
647
|
+
mutex_t mutex;
|
648
|
+
HshTable *cache;
|
649
|
+
HshTable *sort_cache;
|
650
|
+
void *data;
|
651
|
+
Store *store;
|
652
|
+
Lock *write_lock;
|
653
|
+
SegmentInfos *sis;
|
654
|
+
bool has_changes : 1;
|
655
|
+
bool is_stale : 1;
|
656
|
+
bool is_owner : 1;
|
657
|
+
TermVector *(*get_term_vector)(IndexReader *ir, int doc_num, char *field);
|
658
|
+
Array *(*get_term_vectors)(IndexReader *ir, int doc_num);
|
659
|
+
int (*num_docs)(IndexReader *ir);
|
660
|
+
int (*max_doc)(IndexReader *ir);
|
661
|
+
Document *(*get_doc)(IndexReader *ir, int doc_num);
|
662
|
+
uchar *(*get_norms)(IndexReader *ir, char *field);
|
663
|
+
uchar *(*get_norms_always)(IndexReader *ir, char *field);
|
664
|
+
void (*do_set_norm)(IndexReader *ir, int doc_num, char *field,
|
665
|
+
uchar val);
|
666
|
+
void (*get_norms_into)(IndexReader *ir, char *field, uchar *buf,
|
667
|
+
int offset);
|
668
|
+
TermEnum *(*terms)(IndexReader *ir);
|
669
|
+
TermEnum *(*terms_from)(IndexReader *ir, Term *term);
|
670
|
+
int (*doc_freq)(IndexReader *ir, Term *t);
|
671
|
+
TermDocEnum *(*term_docs)(IndexReader *ir);
|
672
|
+
TermDocEnum *(*term_positions)(IndexReader *ir);
|
673
|
+
void (*do_delete_doc)(IndexReader *ir, int doc_num);
|
674
|
+
void (*do_undelete_all)(IndexReader *ir);
|
675
|
+
bool (*is_deleted)(IndexReader *ir, int doc_num);
|
676
|
+
bool (*has_deletions)(IndexReader *ir);
|
677
|
+
bool (*has_norms)(IndexReader *ir, char *field);
|
678
|
+
HashSet *(*get_field_names)(IndexReader *ir, int field_type);
|
679
|
+
void (*do_commit)(IndexReader *ir);
|
680
|
+
void (*do_close)(IndexReader *ir);
|
681
|
+
void (*acquire_write_lock)(IndexReader *ir);
|
684
682
|
};
|
685
683
|
|
686
684
|
IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner);
|
@@ -705,10 +703,10 @@ bool ir_is_latest(IndexReader *ir);
|
|
705
703
|
****************************************************************************/
|
706
704
|
|
707
705
|
typedef struct Norm {
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
706
|
+
bool is_dirty : 1;
|
707
|
+
int field_num;
|
708
|
+
InStream *is;
|
709
|
+
uchar *bytes;
|
712
710
|
} Norm;
|
713
711
|
|
714
712
|
/****************************************************************************
|
@@ -718,22 +716,22 @@ typedef struct Norm {
|
|
718
716
|
****************************************************************************/
|
719
717
|
|
720
718
|
struct SegmentReader {
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
719
|
+
FieldInfos *fis;
|
720
|
+
FieldsReader *fr;
|
721
|
+
char *segment;
|
722
|
+
BitVector *deleted_docs;
|
723
|
+
bool deleted_docs_dirty : 1;
|
724
|
+
bool undelete_all : 1;
|
725
|
+
bool norms_dirty : 1;
|
726
|
+
InStream *freq_in;
|
727
|
+
InStream *prox_in;
|
728
|
+
TermInfosReader *tir;
|
729
|
+
TermVectorsReader *orig_tvr;
|
730
|
+
thread_key_t thread_tvr;
|
731
|
+
Array *tvr_bucket;
|
732
|
+
HshTable *norms;
|
733
|
+
Store *cfs_store;
|
734
|
+
uchar *fake_norms;
|
737
735
|
};
|
738
736
|
|
739
737
|
IndexReader *sr_open(SegmentInfos *sis, int si_num, bool is_owner);
|
@@ -746,17 +744,17 @@ IndexReader *sr_open_si(SegmentInfo *si);
|
|
746
744
|
****************************************************************************/
|
747
745
|
|
748
746
|
typedef struct MultiReader {
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
747
|
+
bool has_deletions : 1;
|
748
|
+
int max_doc;
|
749
|
+
int num_docs_cache;
|
750
|
+
int rcnt;
|
751
|
+
int *starts;
|
752
|
+
IndexReader **sub_readers;
|
753
|
+
HshTable *norms_cache;
|
756
754
|
} MultiReader;
|
757
755
|
|
758
756
|
IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
|
759
|
-
|
757
|
+
int rcnt);
|
760
758
|
|
761
759
|
/****************************************************************************
|
762
760
|
*
|
@@ -765,12 +763,12 @@ IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
|
|
765
763
|
****************************************************************************/
|
766
764
|
|
767
765
|
typedef struct SegmentMergeInfo {
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
766
|
+
int base;
|
767
|
+
IndexReader *ir;
|
768
|
+
TermEnum *te;
|
769
|
+
TermBuffer *tb;
|
770
|
+
TermDocEnum *postings;
|
771
|
+
int *doc_map;
|
774
772
|
} SegmentMergeInfo;
|
775
773
|
|
776
774
|
SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir);
|
@@ -785,24 +783,24 @@ bool smi_lt(SegmentMergeInfo *smi1, SegmentMergeInfo *smi2);
|
|
785
783
|
****************************************************************************/
|
786
784
|
|
787
785
|
typedef struct SegmentMerger {
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
786
|
+
Store *store;
|
787
|
+
char *name;
|
788
|
+
Array *readers;
|
789
|
+
FieldInfos *fis;
|
790
|
+
OutStream *freq_out;
|
791
|
+
OutStream *prox_out;
|
792
|
+
TermInfosWriter *tiw;
|
793
|
+
Term *terms_buf;
|
794
|
+
int terms_buf_pointer;
|
795
|
+
int terms_buf_size;
|
796
|
+
PriorityQueue *queue;
|
797
|
+
TermInfo *ti;
|
798
|
+
int term_index_interval;
|
799
|
+
OutStream *skip_buffer;
|
800
|
+
int skip_interval;
|
801
|
+
int last_skip_doc;
|
802
|
+
int last_skip_freq_pointer;
|
803
|
+
int last_skip_prox_pointer;
|
806
804
|
} SegmentMerger;
|
807
805
|
|
808
806
|
SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
|
@@ -821,25 +819,25 @@ Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
|
|
821
819
|
#define WRITE_LOCK_NAME "write"
|
822
820
|
#define COMMIT_LOCK_NAME "commit"
|
823
821
|
struct IndexWriter {
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
822
|
+
mutex_t mutex;
|
823
|
+
HshTable *postings;
|
824
|
+
FieldInfos *fis;
|
825
|
+
int merge_factor;
|
826
|
+
int min_merge_docs;
|
827
|
+
int max_merge_docs;
|
828
|
+
int max_field_length;
|
829
|
+
int term_index_interval;
|
830
|
+
Store *store;
|
831
|
+
Analyzer *analyzer;
|
832
|
+
Similarity *similarity;
|
833
|
+
SegmentInfos *sis;
|
834
|
+
Store *ram_store;
|
835
|
+
Lock *write_lock;
|
836
|
+
bool use_compound_file : 1;
|
839
837
|
};
|
840
838
|
|
841
839
|
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
842
|
-
|
840
|
+
bool create);
|
843
841
|
void iw_flush_ram_segments(IndexWriter *iw);
|
844
842
|
void iw_close(IndexWriter *iw);
|
845
843
|
int iw_doc_count(IndexWriter *iw);
|
@@ -855,11 +853,11 @@ void iw_add_readers(IndexWriter *iw, IndexReader **stores, int cnt);
|
|
855
853
|
****************************************************************************/
|
856
854
|
|
857
855
|
typedef struct CompoundWriter {
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
856
|
+
Store *store;
|
857
|
+
const char *name;
|
858
|
+
HashSet *ids;
|
859
|
+
Array *file_entries;
|
860
|
+
bool merged;
|
863
861
|
} CompoundWriter;
|
864
862
|
|
865
863
|
CompoundWriter *open_cw(Store *store, char *name);
|