ferret 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -5
- data/Rakefile +34 -13
- data/TODO +1 -0
- data/TUTORIAL +1 -1
- data/ext/analysis.c +87 -70
- data/ext/analysis.h +18 -6
- data/ext/array.c +1 -2
- data/ext/array.h +1 -1
- data/ext/bitvector.c +10 -6
- data/ext/bitvector.h +2 -2
- data/ext/compound_io.c +30 -27
- data/ext/document.c +15 -15
- data/ext/document.h +5 -5
- data/ext/except.c +2 -0
- data/ext/except.h +25 -23
- data/ext/extconf.rb +1 -0
- data/ext/ferret.c +10 -8
- data/ext/ferret.h +9 -8
- data/ext/field.c +29 -25
- data/ext/filter.c +52 -14
- data/ext/frtio.h +13 -0
- data/ext/fs_store.c +115 -170
- data/ext/global.c +9 -8
- data/ext/global.h +17 -13
- data/ext/hash.c +13 -19
- data/ext/hash.h +11 -11
- data/ext/hashset.c +5 -7
- data/ext/hashset.h +9 -8
- data/ext/helper.c +1 -1
- data/ext/helper.h +2 -1
- data/ext/inc/except.h +25 -23
- data/ext/inc/lang.h +11 -1
- data/ext/ind.c +33 -21
- data/ext/index.h +44 -39
- data/ext/index_io.c +61 -57
- data/ext/index_rw.c +418 -361
- data/ext/lang.c +10 -0
- data/ext/lang.h +11 -1
- data/ext/nix_io.c +135 -0
- data/ext/priorityqueue.c +16 -16
- data/ext/priorityqueue.h +9 -6
- data/ext/q_boolean.c +128 -76
- data/ext/q_const_score.c +20 -20
- data/ext/q_filtered_query.c +20 -20
- data/ext/q_fuzzy.c +37 -23
- data/ext/q_match_all.c +15 -19
- data/ext/q_multi_phrase.c +87 -46
- data/ext/q_parser.c +247 -119
- data/ext/q_phrase.c +86 -52
- data/ext/q_prefix.c +25 -14
- data/ext/q_range.c +59 -14
- data/ext/q_span.c +263 -172
- data/ext/q_term.c +62 -51
- data/ext/q_wildcard.c +24 -13
- data/ext/r_analysis.c +328 -80
- data/ext/r_doc.c +11 -6
- data/ext/r_index_io.c +40 -32
- data/ext/r_qparser.c +15 -14
- data/ext/r_search.c +270 -152
- data/ext/r_store.c +32 -17
- data/ext/ram_store.c +38 -22
- data/ext/search.c +617 -87
- data/ext/search.h +227 -163
- data/ext/similarity.c +54 -45
- data/ext/similarity.h +3 -3
- data/ext/sort.c +132 -53
- data/ext/store.c +21 -2
- data/ext/store.h +14 -14
- data/ext/tags +4322 -232
- data/ext/term.c +140 -109
- data/ext/termdocs.c +74 -60
- data/ext/vector.c +181 -152
- data/ext/w32_io.c +150 -0
- data/lib/ferret.rb +1 -1
- data/lib/ferret/analysis/standard_tokenizer.rb +4 -3
- data/lib/ferret/document/field.rb +1 -1
- data/lib/ferret/index/field_infos.rb +1 -1
- data/lib/ferret/index/term.rb +1 -1
- data/lib/ferret/query_parser/query_parser.tab.rb +8 -24
- data/lib/ferret/search.rb +1 -0
- data/lib/ferret/search/boolean_query.rb +0 -4
- data/lib/ferret/search/index_searcher.rb +21 -8
- data/lib/ferret/search/multi_phrase_query.rb +7 -0
- data/lib/ferret/search/multi_searcher.rb +261 -0
- data/lib/ferret/search/phrase_query.rb +1 -1
- data/lib/ferret/search/query.rb +34 -5
- data/lib/ferret/search/sort.rb +7 -3
- data/lib/ferret/search/sort_field.rb +8 -4
- data/lib/ferret/store/fs_store.rb +13 -6
- data/lib/ferret/store/index_io.rb +0 -14
- data/lib/ferret/store/ram_store.rb +3 -2
- data/lib/rferret.rb +1 -1
- data/test/unit/analysis/ctc_analyzer.rb +131 -0
- data/test/unit/analysis/ctc_tokenstream.rb +98 -9
- data/test/unit/index/tc_index.rb +40 -1
- data/test/unit/index/tc_term.rb +7 -0
- data/test/unit/index/th_doc.rb +8 -0
- data/test/unit/query_parser/tc_query_parser.rb +6 -4
- data/test/unit/search/rtc_sort_field.rb +6 -6
- data/test/unit/search/tc_index_searcher.rb +8 -0
- data/test/unit/search/tc_multi_searcher.rb +275 -0
- data/test/unit/search/tc_multi_searcher2.rb +126 -0
- data/test/unit/search/tc_search_and_sort.rb +66 -0
- metadata +31 -26
- data/test/unit/query_parser/rtc_query_parser.rb +0 -138
data/ext/ind.c
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
#include <string.h>
|
2
1
|
#include "search.h"
|
2
|
+
#include <string.h>
|
3
|
+
|
3
4
|
|
4
5
|
static char * const NON_UNIQUE_KEY_ERROR_MSG = "Tried to use a key that was not unique";
|
5
6
|
|
@@ -45,23 +46,22 @@ Index *index_create(Store *store, Analyzer *analyzer, HashSet *def_fields,
|
|
45
46
|
self->has_writes = false;
|
46
47
|
if (store) {
|
47
48
|
self->store = store;
|
48
|
-
|
49
|
+
ref(store);
|
49
50
|
} else {
|
50
51
|
self->store = open_ram_store();
|
51
52
|
create = true;
|
52
|
-
self->close_store = true;
|
53
53
|
}
|
54
54
|
if (analyzer) {
|
55
55
|
self->analyzer = analyzer;
|
56
|
-
|
56
|
+
ref(analyzer);
|
57
57
|
} else {
|
58
58
|
self->analyzer = mb_standard_analyzer_create(true);
|
59
|
-
self->close_analyzer = true;
|
60
59
|
}
|
61
60
|
self->use_compound_file = true;
|
62
61
|
|
63
62
|
if (create) {
|
64
|
-
|
63
|
+
ref(self->analyzer);
|
64
|
+
self->iw = iw_open(self->store, self->analyzer, create);
|
65
65
|
iw_close(self->iw);
|
66
66
|
self->iw = NULL;
|
67
67
|
}
|
@@ -73,6 +73,7 @@ Index *index_create(Store *store, Analyzer *analyzer, HashSet *def_fields,
|
|
73
73
|
self->auto_flush = false;
|
74
74
|
self->check_latest = true;
|
75
75
|
|
76
|
+
ref(self->analyzer);
|
76
77
|
self->qp = qp_create(all_fields, def_fields, self->analyzer);
|
77
78
|
/* Index is a convenience class so set qp convenience options */
|
78
79
|
self->qp->allow_any_fields = true;
|
@@ -87,8 +88,8 @@ void index_destroy(Index *self)
|
|
87
88
|
mutex_destroy(&self->mutex);
|
88
89
|
INDEX_CLOSE_READER(self);
|
89
90
|
if (self->iw) iw_close(self->iw);
|
90
|
-
|
91
|
-
|
91
|
+
store_deref(self->store);
|
92
|
+
a_deref(self->analyzer);
|
92
93
|
if (self->qp) qp_destroy(self->qp);
|
93
94
|
if (self->id_field != ((char *)ID_STRING)) free(self->id_field);
|
94
95
|
if (self->def_field != ((char *)ID_STRING)) free(self->def_field);
|
@@ -106,13 +107,19 @@ void index_flush(Index *self)
|
|
106
107
|
}
|
107
108
|
self->has_writes = false;
|
108
109
|
}
|
110
|
+
|
109
111
|
inline void ensure_writer_open(Index *self)
|
110
112
|
{
|
111
113
|
if (!self->iw) {
|
112
114
|
INDEX_CLOSE_READER(self);
|
113
|
-
|
115
|
+
|
116
|
+
/* make sure the analyzer isn't deleted by the IndexWriter */
|
117
|
+
ref(self->analyzer);
|
118
|
+
self->iw = iw_open(self->store, self->analyzer, false);
|
114
119
|
self->iw->use_compound_file = self->use_compound_file;
|
115
|
-
} else {
|
120
|
+
} else if (self->analyzer != self->iw->analyzer) {
|
121
|
+
a_deref(self->iw->analyzer);
|
122
|
+
ref(self->analyzer);
|
116
123
|
self->iw->analyzer = self->analyzer; /* in case it has changed */
|
117
124
|
}
|
118
125
|
}
|
@@ -122,14 +129,14 @@ inline void ensure_reader_open(Index *self)
|
|
122
129
|
if (self->ir) {
|
123
130
|
if (self->check_latest && !ir_is_latest(self->ir)) {
|
124
131
|
INDEX_CLOSE_READER(self);
|
125
|
-
self->ir = ir_open(self->store
|
132
|
+
self->ir = ir_open(self->store);
|
126
133
|
}
|
127
134
|
} else {
|
128
135
|
if (self->iw) {
|
129
136
|
iw_close(self->iw);
|
130
137
|
self->iw = NULL;
|
131
138
|
}
|
132
|
-
self->ir = ir_open(self->store
|
139
|
+
self->ir = ir_open(self->store);
|
133
140
|
}
|
134
141
|
}
|
135
142
|
|
@@ -203,7 +210,7 @@ static void inline index_add_doc_i(Index *self, Document *doc)
|
|
203
210
|
} else if (td->total_hits == 1) {
|
204
211
|
ir_delete_doc(self->ir, td->hits[0]->doc);
|
205
212
|
}
|
206
|
-
|
213
|
+
q_deref(q);
|
207
214
|
td_destroy(td);
|
208
215
|
}
|
209
216
|
ensure_writer_open(self);
|
@@ -215,11 +222,16 @@ void index_add_doc_a(Index *self, Document *doc, Analyzer *analyzer)
|
|
215
222
|
{
|
216
223
|
Analyzer *tmp_analyzer;
|
217
224
|
mutex_lock(&self->store->ext_mutex);
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
225
|
+
if (analyzer != self->analyzer) {
|
226
|
+
ref(analyzer);
|
227
|
+
tmp_analyzer = self->analyzer;
|
228
|
+
self->analyzer = analyzer;
|
229
|
+
index_add_doc_i(self, doc);
|
230
|
+
self->analyzer = tmp_analyzer;
|
231
|
+
a_deref(analyzer);
|
232
|
+
} else {
|
233
|
+
index_add_doc_i(self, doc);
|
234
|
+
}
|
223
235
|
mutex_unlock(&self->store->ext_mutex);
|
224
236
|
}
|
225
237
|
|
@@ -272,7 +284,7 @@ TopDocs *index_search_str(Index *self, char *qstr, int first_doc,
|
|
272
284
|
TopDocs *td;
|
273
285
|
query = index_get_query(self, qstr); /* will ensure_searcher is open */
|
274
286
|
td = sea_search(self->sea, query, first_doc, num_docs, filter, sort);
|
275
|
-
|
287
|
+
q_deref(query);
|
276
288
|
return td;
|
277
289
|
}
|
278
290
|
|
@@ -363,7 +375,7 @@ void index_delete_id(Index *self, char *id)
|
|
363
375
|
index_delete_term(self, &t);
|
364
376
|
}
|
365
377
|
|
366
|
-
static void index_qdel_i(Searcher *sea, int doc_num, void *arg)
|
378
|
+
static void index_qdel_i(Searcher *sea, int doc_num, float score, void *arg)
|
367
379
|
{
|
368
380
|
ir_delete_doc(sea->ir, doc_num);
|
369
381
|
}
|
@@ -381,7 +393,7 @@ void index_delete_query_str(Index *self, char *qstr, Filter *f)
|
|
381
393
|
{
|
382
394
|
Query *q = index_get_query(self, qstr);
|
383
395
|
index_delete_query(self, q, f);
|
384
|
-
|
396
|
+
q_deref(q);
|
385
397
|
}
|
386
398
|
|
387
399
|
Explanation *index_explain(Index *self, Query *q, int doc_num)
|
data/ext/index.h
CHANGED
@@ -15,6 +15,7 @@
|
|
15
15
|
|
16
16
|
|
17
17
|
#define SEGMENT_NAME_MAX_LENGTH 100
|
18
|
+
#define NOT_A_FIELD 0xFFFFFFFF
|
18
19
|
|
19
20
|
typedef struct Config {
|
20
21
|
int merge_factor;
|
@@ -75,7 +76,7 @@ FieldInfo *fi_create(char *name,
|
|
75
76
|
bool store_pos,
|
76
77
|
bool store_offset,
|
77
78
|
bool omit_norms);
|
78
|
-
void fi_destroy(
|
79
|
+
void fi_destroy(FieldInfo *fi);
|
79
80
|
|
80
81
|
/****************************************************************************
|
81
82
|
*
|
@@ -91,7 +92,7 @@ typedef struct FieldInfos {
|
|
91
92
|
|
92
93
|
FieldInfos *fis_create();
|
93
94
|
FieldInfos *fis_open(Store *store, char *filename);
|
94
|
-
void fis_destroy(
|
95
|
+
void fis_destroy(FieldInfos *fis);
|
95
96
|
FieldInfo *fis_add(FieldInfos *fis,
|
96
97
|
char *name,
|
97
98
|
bool is_indexed,
|
@@ -111,8 +112,9 @@ bool fis_has_vectors(FieldInfos *fis);
|
|
111
112
|
void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext);
|
112
113
|
FieldInfos *fis_read(FieldInfos *fis, InStream *is);
|
113
114
|
FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc);
|
114
|
-
|
115
|
+
ullong fis_get_number(FieldInfos *fis, char *name);
|
115
116
|
FieldInfo *fis_get_fi(FieldInfos *fis, char *name);
|
117
|
+
bool fis_reorder_required(FieldInfos *fis, Document *doc);
|
116
118
|
|
117
119
|
/****************************************************************************
|
118
120
|
*
|
@@ -126,7 +128,7 @@ typedef struct TermBuffer {
|
|
126
128
|
} TermBuffer;
|
127
129
|
|
128
130
|
TermBuffer *tb_create();
|
129
|
-
void tb_destroy(
|
131
|
+
void tb_destroy(TermBuffer *tb);
|
130
132
|
TermBuffer *tb_set_term(TermBuffer *tb, Term *t);
|
131
133
|
Term *tb_get_term(TermBuffer *tb);
|
132
134
|
int tb_cmp(TermBuffer *tb1, TermBuffer *tb2);
|
@@ -149,7 +151,7 @@ typedef struct TermInfo {
|
|
149
151
|
|
150
152
|
TermInfo *ti_create(int doc_freq, int freq_pointer, int prox_pointer, int skip_offset);
|
151
153
|
TermInfo *ti_set(TermInfo *ti, int df, int fp, int pp, int so);
|
152
|
-
void ti_destroy(
|
154
|
+
void ti_destroy(TermInfo *ti);
|
153
155
|
TermInfo *ti_cpy(TermInfo *ti1, TermInfo *ti2);
|
154
156
|
TermInfo *ti_clone(TermInfo *other);
|
155
157
|
int ti_eq(TermInfo *ti1, TermInfo *ti2);
|
@@ -226,7 +228,7 @@ typedef struct TermInfosWriter {
|
|
226
228
|
TermInfo *last_term_info;
|
227
229
|
FieldInfos *fis;
|
228
230
|
char *curr_field;
|
229
|
-
|
231
|
+
ullong curr_field_num;
|
230
232
|
} TermInfosWriter;
|
231
233
|
|
232
234
|
TermInfosWriter *tiw_open(Store *store,
|
@@ -332,7 +334,7 @@ TermVector *tv_create(const char *field,
|
|
332
334
|
int *freqs,
|
333
335
|
int **positions,
|
334
336
|
TVOffsetInfo ***offsets);
|
335
|
-
void tv_destroy(
|
337
|
+
void tv_destroy(TermVector *tv);
|
336
338
|
|
337
339
|
/****************************************************************************
|
338
340
|
*
|
@@ -441,11 +443,12 @@ struct TermDocEnum {
|
|
441
443
|
/* * SegmentTermDocEnum * */
|
442
444
|
|
443
445
|
typedef struct SegmentTermDocEnum SegmentTermDocEnum;
|
446
|
+
|
444
447
|
struct SegmentTermDocEnum {
|
445
448
|
SegmentReader *parent;
|
446
449
|
InStream *freq_in;
|
447
|
-
int count;
|
448
|
-
int doc_freq;
|
450
|
+
int count; /* number of docs for this term skipped */
|
451
|
+
int doc_freq; /* number of docs this term appears in */
|
449
452
|
BitVector *deleted_docs;
|
450
453
|
int doc_num;
|
451
454
|
int freq;
|
@@ -538,8 +541,8 @@ typedef struct Posting {
|
|
538
541
|
} Posting;
|
539
542
|
|
540
543
|
Posting *p_create(Term *term, int position, TVOffsetInfo *offset);
|
541
|
-
void p_destroy(
|
542
|
-
void p_add_occurance(Posting *
|
544
|
+
void p_destroy(Posting *self);
|
545
|
+
void p_add_occurance(Posting *self, int position, TVOffsetInfo *offset);
|
543
546
|
|
544
547
|
|
545
548
|
/****************************************************************************
|
@@ -581,7 +584,7 @@ typedef struct SegmentInfo {
|
|
581
584
|
} SegmentInfo;
|
582
585
|
|
583
586
|
SegmentInfo *si_create(char *name, int doc_cnt, Store *store);
|
584
|
-
void si_destroy(
|
587
|
+
void si_destroy(SegmentInfo *si);
|
585
588
|
bool si_has_deletions(SegmentInfo *si);
|
586
589
|
bool si_uses_compound_file(SegmentInfo *si);
|
587
590
|
bool si_has_separate_norms(SegmentInfo *si);
|
@@ -598,12 +601,12 @@ typedef struct SegmentInfos {
|
|
598
601
|
int scnt;
|
599
602
|
int size;
|
600
603
|
int counter;
|
601
|
-
|
604
|
+
int version;
|
602
605
|
int format;
|
603
606
|
} SegmentInfos;
|
604
607
|
|
605
608
|
SegmentInfos *sis_create();
|
606
|
-
void sis_destroy(
|
609
|
+
void sis_destroy(SegmentInfos *sis);
|
607
610
|
void sis_add_si(SegmentInfos *sis, SegmentInfo *si);
|
608
611
|
void sis_del_at(SegmentInfos *sis, int at);
|
609
612
|
void sis_del_from_to(SegmentInfos *sis, int from, int to);
|
@@ -619,24 +622,24 @@ int sis_read_current_version(Store *store);
|
|
619
622
|
****************************************************************************/
|
620
623
|
|
621
624
|
enum FIELD_TYPE {
|
622
|
-
|
625
|
+
/* all fields */
|
623
626
|
IR_ALL,
|
624
|
-
|
627
|
+
/* all indexed fields */
|
625
628
|
IR_INDEXED,
|
626
|
-
|
629
|
+
/* all fields which are not indexed */
|
627
630
|
IR_UNINDEXED,
|
628
|
-
|
631
|
+
/* all fields which are indexed with termvectors enabled */
|
629
632
|
IR_INDEXED_WITH_TERM_VECTOR,
|
630
|
-
|
633
|
+
/* all fields which are indexed but don't have termvectors enabled */
|
631
634
|
IR_INDEXED_NO_TERM_VECTOR,
|
632
|
-
|
633
|
-
|
635
|
+
/* all fields where termvectors are enabled. Please note that only standard */
|
636
|
+
/* termvector fields are returned */
|
634
637
|
IR_TERM_VECTOR,
|
635
|
-
|
638
|
+
/* all fields with termvectors with positions enabled */
|
636
639
|
IR_TERM_VECTOR_WITH_POSITION,
|
637
|
-
|
640
|
+
/* all fields where termvectors with offset position are set */
|
638
641
|
IR_TERM_VECTOR_WITH_OFFSET,
|
639
|
-
|
642
|
+
/* all fields where termvectors with offset and position values set */
|
640
643
|
IR_TERM_VECTOR_WITH_POSITION_OFFSET
|
641
644
|
};
|
642
645
|
|
@@ -651,7 +654,6 @@ struct IndexReader {
|
|
651
654
|
bool has_changes : 1;
|
652
655
|
bool is_stale : 1;
|
653
656
|
bool is_owner : 1;
|
654
|
-
bool close_store : 1;
|
655
657
|
TermVector *(*get_term_vector)(IndexReader *ir, int doc_num, char *field);
|
656
658
|
Array *(*get_term_vectors)(IndexReader *ir, int doc_num);
|
657
659
|
int (*num_docs)(IndexReader *ir);
|
@@ -659,8 +661,10 @@ struct IndexReader {
|
|
659
661
|
Document *(*get_doc)(IndexReader *ir, int doc_num);
|
660
662
|
uchar *(*get_norms)(IndexReader *ir, char *field);
|
661
663
|
uchar *(*get_norms_always)(IndexReader *ir, char *field);
|
662
|
-
void (*do_set_norm)(IndexReader *ir, int doc_num, char *field,
|
663
|
-
|
664
|
+
void (*do_set_norm)(IndexReader *ir, int doc_num, char *field,
|
665
|
+
uchar val);
|
666
|
+
void (*get_norms_into)(IndexReader *ir, char *field, uchar *buf,
|
667
|
+
int offset);
|
664
668
|
TermEnum *(*terms)(IndexReader *ir);
|
665
669
|
TermEnum *(*terms_from)(IndexReader *ir, Term *term);
|
666
670
|
int (*doc_freq)(IndexReader *ir, Term *t);
|
@@ -675,17 +679,19 @@ struct IndexReader {
|
|
675
679
|
void (*do_commit)(IndexReader *ir);
|
676
680
|
void (*do_close)(IndexReader *ir);
|
677
681
|
void (*acquire_write_lock)(IndexReader *ir);
|
682
|
+
int (*write_fields_i)(IndexReader *ir, OutStream *fdt_out,
|
683
|
+
OutStream *fdx_out);
|
678
684
|
};
|
679
685
|
|
680
|
-
IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner
|
681
|
-
IndexReader *ir_open(Store *store
|
686
|
+
IndexReader *ir_create(Store *store, SegmentInfos *sis, int is_owner);
|
687
|
+
IndexReader *ir_open(Store *store);
|
682
688
|
bool ir_index_exists(Store *store);
|
683
689
|
void ir_close(IndexReader *ir);
|
684
690
|
void ir_commit(IndexReader *ir);
|
685
691
|
void ir_delete_doc(IndexReader *ir, int doc_num);
|
686
692
|
void ir_undelete_all(IndexReader *ir);
|
687
693
|
void ir_set_norm(IndexReader *ir, int doc_num, char *field, uchar val);
|
688
|
-
void ir_destroy(
|
694
|
+
void ir_destroy(IndexReader *self);
|
689
695
|
Document *ir_get_doc_with_term(IndexReader *ir, Term *term);
|
690
696
|
TermDocEnum *ir_term_docs_for(IndexReader *ir, Term *term);
|
691
697
|
TermDocEnum *ir_term_positions_for(IndexReader *ir, Term *term);
|
@@ -730,9 +736,8 @@ struct SegmentReader {
|
|
730
736
|
uchar *fake_norms;
|
731
737
|
};
|
732
738
|
|
733
|
-
IndexReader *sr_open(SegmentInfos *sis, int si_num,
|
739
|
+
IndexReader *sr_open(SegmentInfos *sis, int si_num, bool is_owner);
|
734
740
|
IndexReader *sr_open_si(SegmentInfo *si);
|
735
|
-
//int sr_has_deletions(IndexReader *ir);
|
736
741
|
|
737
742
|
/****************************************************************************
|
738
743
|
*
|
@@ -751,7 +756,7 @@ typedef struct MultiReader {
|
|
751
756
|
} MultiReader;
|
752
757
|
|
753
758
|
IndexReader *mr_open(Store *store, SegmentInfos *sis, IndexReader **readers,
|
754
|
-
int rcnt
|
759
|
+
int rcnt);
|
755
760
|
|
756
761
|
/****************************************************************************
|
757
762
|
*
|
@@ -769,9 +774,9 @@ typedef struct SegmentMergeInfo {
|
|
769
774
|
} SegmentMergeInfo;
|
770
775
|
|
771
776
|
SegmentMergeInfo *smi_create(int base, TermEnum *te, IndexReader *ir);
|
772
|
-
void smi_destroy(
|
777
|
+
void smi_destroy(SegmentMergeInfo *smi);
|
773
778
|
TermBuffer *smi_next(SegmentMergeInfo *smi);
|
774
|
-
bool smi_lt(
|
779
|
+
bool smi_lt(SegmentMergeInfo *smi1, SegmentMergeInfo *smi2);
|
775
780
|
|
776
781
|
/****************************************************************************
|
777
782
|
*
|
@@ -801,7 +806,7 @@ typedef struct SegmentMerger {
|
|
801
806
|
} SegmentMerger;
|
802
807
|
|
803
808
|
SegmentMerger *sm_create(Store *store, char *name, int term_index_interval);
|
804
|
-
void sm_destroy(
|
809
|
+
void sm_destroy(SegmentMerger *sm);
|
805
810
|
void sm_add(SegmentMerger *sm, IndexReader *ir);
|
806
811
|
int sm_merge(SegmentMerger *sm);
|
807
812
|
Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
|
@@ -817,6 +822,8 @@ Array *sm_create_compound_file(SegmentMerger *sm, char *fname);
|
|
817
822
|
#define COMMIT_LOCK_NAME "commit"
|
818
823
|
struct IndexWriter {
|
819
824
|
mutex_t mutex;
|
825
|
+
HshTable *postings;
|
826
|
+
FieldInfos *fis;
|
820
827
|
int merge_factor;
|
821
828
|
int min_merge_docs;
|
822
829
|
int max_merge_docs;
|
@@ -828,13 +835,11 @@ struct IndexWriter {
|
|
828
835
|
SegmentInfos *sis;
|
829
836
|
Store *ram_store;
|
830
837
|
Lock *write_lock;
|
831
|
-
bool close_store : 1;
|
832
|
-
bool close_analyzer : 1;
|
833
838
|
bool use_compound_file : 1;
|
834
839
|
};
|
835
840
|
|
836
841
|
IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
837
|
-
bool create
|
842
|
+
bool create);
|
838
843
|
void iw_flush_ram_segments(IndexWriter *iw);
|
839
844
|
void iw_close(IndexWriter *iw);
|
840
845
|
int iw_doc_count(IndexWriter *iw);
|
data/ext/index_io.c
CHANGED
@@ -56,15 +56,17 @@ void os_seek(OutStream *os, int new_pos)
|
|
56
56
|
|
57
57
|
inline void os_write_byte(OutStream *os, uchar b)
|
58
58
|
{
|
59
|
-
if (os->buf.pos >= BUFFER_SIZE)
|
59
|
+
if (os->buf.pos >= BUFFER_SIZE) {
|
60
60
|
os_flush(os);
|
61
|
+
}
|
61
62
|
write_byte(os, b);
|
62
63
|
}
|
63
64
|
|
64
65
|
void os_write_bytes(OutStream *os, uchar *b, int len)
|
65
66
|
{
|
66
|
-
if (os->buf.pos > 0)
|
67
|
+
if (os->buf.pos > 0) { /* flush buffer */
|
67
68
|
os_flush(os);
|
69
|
+
}
|
68
70
|
|
69
71
|
if (len < BUFFER_SIZE) {
|
70
72
|
os->flush_internal(os, b, len);
|
@@ -99,8 +101,9 @@ void is_refill(InStream *is)
|
|
99
101
|
int start = is->buf.start + is->buf.pos;
|
100
102
|
int last = start + BUFFER_SIZE;
|
101
103
|
int flen = is->length_internal(is);
|
102
|
-
if (last > flen)
|
104
|
+
if (last > flen) { /* don't read past EOF */
|
103
105
|
last = flen;
|
106
|
+
}
|
104
107
|
|
105
108
|
is->buf.len = last - start;
|
106
109
|
if (is->buf.len <= 0) {
|
@@ -116,8 +119,9 @@ void is_refill(InStream *is)
|
|
116
119
|
#define read_byte(is) is->buf.buf[is->buf.pos++]
|
117
120
|
inline uchar is_read_byte(InStream *is)
|
118
121
|
{
|
119
|
-
if (is->buf.pos >= is->buf.len)
|
122
|
+
if (is->buf.pos >= is->buf.len) {
|
120
123
|
is_refill(is);
|
124
|
+
}
|
121
125
|
|
122
126
|
return read_byte(is);
|
123
127
|
}
|
@@ -182,17 +186,17 @@ is_read_int(InStream *is)
|
|
182
186
|
(int)is_read_byte(is);
|
183
187
|
}
|
184
188
|
|
185
|
-
|
189
|
+
llong
|
186
190
|
is_read_long(InStream *is)
|
187
191
|
{
|
188
|
-
return ((
|
189
|
-
((
|
190
|
-
((
|
191
|
-
((
|
192
|
-
((
|
193
|
-
((
|
194
|
-
((
|
195
|
-
(
|
192
|
+
return ((llong)is_read_byte(is) << 56) |
|
193
|
+
((llong)is_read_byte(is) << 48) |
|
194
|
+
((llong)is_read_byte(is) << 40) |
|
195
|
+
((llong)is_read_byte(is) << 32) |
|
196
|
+
((llong)is_read_byte(is) << 24) |
|
197
|
+
((llong)is_read_byte(is) << 16) |
|
198
|
+
((llong)is_read_byte(is) << 8) |
|
199
|
+
(llong)is_read_byte(is);
|
196
200
|
}
|
197
201
|
|
198
202
|
unsigned int
|
@@ -204,24 +208,24 @@ is_read_uint(InStream *is)
|
|
204
208
|
(unsigned int)is_read_byte(is);
|
205
209
|
}
|
206
210
|
|
207
|
-
|
211
|
+
ullong
|
208
212
|
is_read_ulong(InStream *is)
|
209
213
|
{
|
210
|
-
return ((
|
211
|
-
((
|
212
|
-
((
|
213
|
-
((
|
214
|
-
((
|
215
|
-
((
|
216
|
-
((
|
217
|
-
(
|
214
|
+
return ((ullong)is_read_byte(is) << 56) |
|
215
|
+
((ullong)is_read_byte(is) << 48) |
|
216
|
+
((ullong)is_read_byte(is) << 40) |
|
217
|
+
((ullong)is_read_byte(is) << 32) |
|
218
|
+
((ullong)is_read_byte(is) << 24) |
|
219
|
+
((ullong)is_read_byte(is) << 16) |
|
220
|
+
((ullong)is_read_byte(is) << 8) |
|
221
|
+
(ullong)is_read_byte(is);
|
218
222
|
}
|
219
223
|
|
220
224
|
/* optimized to use unchecked read_byte if there is definitely space */
|
221
|
-
inline
|
225
|
+
inline ullong
|
222
226
|
is_read_vint(InStream *is)
|
223
227
|
{
|
224
|
-
register
|
228
|
+
register ullong res, b;
|
225
229
|
register int shift = 7;
|
226
230
|
|
227
231
|
if (is->buf.pos > (is->buf.len - VINT_MAX_LEN)) {
|
@@ -283,63 +287,63 @@ is_read_string(InStream *is)
|
|
283
287
|
void
|
284
288
|
os_write_int(OutStream *os, int l)
|
285
289
|
{
|
286
|
-
os_write_byte(os, (l >> 24) & 0xFF);
|
287
|
-
os_write_byte(os, (l >> 16) & 0xFF);
|
288
|
-
os_write_byte(os, (l >> 8) & 0xFF);
|
289
|
-
os_write_byte(os, l & 0xFF);
|
290
|
+
os_write_byte(os, (uchar)((l >> 24) & 0xFF));
|
291
|
+
os_write_byte(os, (uchar)((l >> 16) & 0xFF));
|
292
|
+
os_write_byte(os, (uchar)((l >> 8) & 0xFF));
|
293
|
+
os_write_byte(os, (uchar)(l & 0xFF));
|
290
294
|
}
|
291
295
|
|
292
296
|
void
|
293
|
-
os_write_long(OutStream *os,
|
297
|
+
os_write_long(OutStream *os, llong l)
|
294
298
|
{
|
295
|
-
os_write_byte(os, (l >> 56) & 0xFF);
|
296
|
-
os_write_byte(os, (l >> 48) & 0xFF);
|
297
|
-
os_write_byte(os, (l >> 40) & 0xFF);
|
298
|
-
os_write_byte(os, (l >> 32) & 0xFF);
|
299
|
-
os_write_byte(os, (l >> 24) & 0xFF);
|
300
|
-
os_write_byte(os, (l >> 16) & 0xFF);
|
301
|
-
os_write_byte(os, (l >> 8) & 0xFF);
|
302
|
-
os_write_byte(os, l & 0xFF);
|
299
|
+
os_write_byte(os, (uchar)((l >> 56) & 0xFF));
|
300
|
+
os_write_byte(os, (uchar)((l >> 48) & 0xFF));
|
301
|
+
os_write_byte(os, (uchar)((l >> 40) & 0xFF));
|
302
|
+
os_write_byte(os, (uchar)((l >> 32) & 0xFF));
|
303
|
+
os_write_byte(os, (uchar)((l >> 24) & 0xFF));
|
304
|
+
os_write_byte(os, (uchar)((l >> 16) & 0xFF));
|
305
|
+
os_write_byte(os, (uchar)((l >> 8) & 0xFF));
|
306
|
+
os_write_byte(os, (uchar)(l & 0xFF));
|
303
307
|
}
|
304
308
|
|
305
309
|
void
|
306
310
|
os_write_uint(OutStream *os, unsigned int l)
|
307
311
|
{
|
308
|
-
os_write_byte(os, (l >> 24) & 0xFF);
|
309
|
-
os_write_byte(os, (l >> 16) & 0xFF);
|
310
|
-
os_write_byte(os, (l >> 8) & 0xFF);
|
311
|
-
os_write_byte(os, l & 0xFF);
|
312
|
+
os_write_byte(os, (uchar)((l >> 24) & 0xFF));
|
313
|
+
os_write_byte(os, (uchar)((l >> 16) & 0xFF));
|
314
|
+
os_write_byte(os, (uchar)((l >> 8) & 0xFF));
|
315
|
+
os_write_byte(os, (uchar)(l & 0xFF));
|
312
316
|
}
|
313
317
|
|
314
318
|
void
|
315
|
-
os_write_ulong(OutStream *os,
|
319
|
+
os_write_ulong(OutStream *os, ullong l)
|
316
320
|
{
|
317
|
-
os_write_byte(os, (l >> 56) & 0xFF);
|
318
|
-
os_write_byte(os, (l >> 48) & 0xFF);
|
319
|
-
os_write_byte(os, (l >> 40) & 0xFF);
|
320
|
-
os_write_byte(os, (l >> 32) & 0xFF);
|
321
|
-
os_write_byte(os, (l >> 24) & 0xFF);
|
322
|
-
os_write_byte(os, (l >> 16) & 0xFF);
|
323
|
-
os_write_byte(os, (l >> 8) & 0xFF);
|
324
|
-
os_write_byte(os, l & 0xFF);
|
321
|
+
os_write_byte(os, (uchar)((l >> 56) & 0xFF));
|
322
|
+
os_write_byte(os, (uchar)((l >> 48) & 0xFF));
|
323
|
+
os_write_byte(os, (uchar)((l >> 40) & 0xFF));
|
324
|
+
os_write_byte(os, (uchar)((l >> 32) & 0xFF));
|
325
|
+
os_write_byte(os, (uchar)((l >> 24) & 0xFF));
|
326
|
+
os_write_byte(os, (uchar)((l >> 16) & 0xFF));
|
327
|
+
os_write_byte(os, (uchar)((l >> 8) & 0xFF));
|
328
|
+
os_write_byte(os, (uchar)(l & 0xFF));
|
325
329
|
}
|
326
330
|
|
327
331
|
/* optimized to use an unchecked write if there is space */
|
328
332
|
inline void
|
329
|
-
os_write_vint(OutStream *os, register
|
333
|
+
os_write_vint(OutStream *os, register ullong i)
|
330
334
|
{
|
331
335
|
if (os->buf.pos > VINT_END) {
|
332
336
|
while (i > 127) {
|
333
|
-
os_write_byte(os, (i & 0x7f) | 0x80);
|
337
|
+
os_write_byte(os, (uchar)((i & 0x7f) | 0x80));
|
334
338
|
i >>= 7;
|
335
339
|
}
|
336
|
-
os_write_byte(os, i);
|
340
|
+
os_write_byte(os, (uchar)(i));
|
337
341
|
} else {
|
338
342
|
while (i > 127) {
|
339
|
-
write_byte(os, (i & 0x7f) | 0x80);
|
343
|
+
write_byte(os, (uchar)((i & 0x7f) | 0x80));
|
340
344
|
i >>= 7;
|
341
345
|
}
|
342
|
-
write_byte(os, i);
|
346
|
+
write_byte(os, (uchar)(i));
|
343
347
|
}
|
344
348
|
}
|
345
349
|
|
@@ -356,7 +360,7 @@ os_write_chars(OutStream *os, char *buf, int start, int length)
|
|
356
360
|
void
|
357
361
|
os_write_string(OutStream *os, char *str)
|
358
362
|
{
|
359
|
-
int len = strlen(str);
|
363
|
+
int len = (int)strlen(str);
|
360
364
|
os_write_vint(os, len);
|
361
365
|
|
362
366
|
os_write_chars(os, str, 0, len);
|
@@ -364,6 +368,6 @@ os_write_string(OutStream *os, char *str)
|
|
364
368
|
|
365
369
|
int file_is_lock(char *filename)
|
366
370
|
{
|
367
|
-
int start = strlen(filename) - 4;
|
371
|
+
int start = (int)strlen(filename) - 4;
|
368
372
|
return ((start > 0) && (strcmp(".lck", &filename[start]) == 0));
|
369
373
|
}
|