ferret 0.9.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +23 -5
- data/TODO +2 -1
- data/ext/analysis.c +838 -177
- data/ext/analysis.h +55 -7
- data/ext/api.c +69 -0
- data/ext/api.h +27 -0
- data/ext/array.c +8 -5
- data/ext/compound_io.c +132 -96
- data/ext/document.c +58 -28
- data/ext/except.c +59 -0
- data/ext/except.h +88 -0
- data/ext/ferret.c +47 -3
- data/ext/ferret.h +3 -0
- data/ext/field.c +15 -9
- data/ext/filter.c +1 -1
- data/ext/fs_store.c +215 -34
- data/ext/global.c +72 -3
- data/ext/global.h +4 -3
- data/ext/hash.c +44 -3
- data/ext/hash.h +9 -0
- data/ext/header.h +58 -0
- data/ext/inc/except.h +88 -0
- data/ext/inc/lang.h +23 -13
- data/ext/ind.c +16 -10
- data/ext/index.h +2 -22
- data/ext/index_io.c +3 -11
- data/ext/index_rw.c +245 -193
- data/ext/lang.h +23 -13
- data/ext/libstemmer.c +92 -0
- data/ext/libstemmer.h +79 -0
- data/ext/modules.h +162 -0
- data/ext/q_boolean.c +34 -21
- data/ext/q_const_score.c +6 -12
- data/ext/q_filtered_query.c +206 -0
- data/ext/q_fuzzy.c +18 -15
- data/ext/q_match_all.c +3 -7
- data/ext/q_multi_phrase.c +10 -14
- data/ext/q_parser.c +29 -2
- data/ext/q_phrase.c +14 -21
- data/ext/q_prefix.c +15 -12
- data/ext/q_range.c +30 -28
- data/ext/q_span.c +13 -21
- data/ext/q_term.c +17 -26
- data/ext/r_analysis.c +693 -21
- data/ext/r_doc.c +11 -12
- data/ext/r_index_io.c +4 -1
- data/ext/r_qparser.c +21 -2
- data/ext/r_search.c +285 -18
- data/ext/ram_store.c +5 -2
- data/ext/search.c +11 -17
- data/ext/search.h +21 -45
- data/ext/similarity.h +67 -0
- data/ext/sort.c +30 -25
- data/ext/stem_ISO_8859_1_danish.c +338 -0
- data/ext/stem_ISO_8859_1_danish.h +16 -0
- data/ext/stem_ISO_8859_1_dutch.c +635 -0
- data/ext/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/stem_ISO_8859_1_english.c +1156 -0
- data/ext/stem_ISO_8859_1_english.h +16 -0
- data/ext/stem_ISO_8859_1_finnish.c +792 -0
- data/ext/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/stem_ISO_8859_1_french.c +1276 -0
- data/ext/stem_ISO_8859_1_french.h +16 -0
- data/ext/stem_ISO_8859_1_german.c +512 -0
- data/ext/stem_ISO_8859_1_german.h +16 -0
- data/ext/stem_ISO_8859_1_italian.c +1091 -0
- data/ext/stem_ISO_8859_1_italian.h +16 -0
- data/ext/stem_ISO_8859_1_norwegian.c +296 -0
- data/ext/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/stem_ISO_8859_1_porter.c +776 -0
- data/ext/stem_ISO_8859_1_porter.h +16 -0
- data/ext/stem_ISO_8859_1_portuguese.c +1035 -0
- data/ext/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/stem_ISO_8859_1_spanish.c +1119 -0
- data/ext/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/stem_KOI8_R_russian.c +701 -0
- data/ext/stem_KOI8_R_russian.h +16 -0
- data/ext/stem_UTF_8_danish.c +344 -0
- data/ext/stem_UTF_8_danish.h +16 -0
- data/ext/stem_UTF_8_dutch.c +653 -0
- data/ext/stem_UTF_8_dutch.h +16 -0
- data/ext/stem_UTF_8_english.c +1176 -0
- data/ext/stem_UTF_8_english.h +16 -0
- data/ext/stem_UTF_8_finnish.c +808 -0
- data/ext/stem_UTF_8_finnish.h +16 -0
- data/ext/stem_UTF_8_french.c +1296 -0
- data/ext/stem_UTF_8_french.h +16 -0
- data/ext/stem_UTF_8_german.c +526 -0
- data/ext/stem_UTF_8_german.h +16 -0
- data/ext/stem_UTF_8_italian.c +1113 -0
- data/ext/stem_UTF_8_italian.h +16 -0
- data/ext/stem_UTF_8_norwegian.c +302 -0
- data/ext/stem_UTF_8_norwegian.h +16 -0
- data/ext/stem_UTF_8_porter.c +794 -0
- data/ext/stem_UTF_8_porter.h +16 -0
- data/ext/stem_UTF_8_portuguese.c +1055 -0
- data/ext/stem_UTF_8_portuguese.h +16 -0
- data/ext/stem_UTF_8_russian.c +709 -0
- data/ext/stem_UTF_8_russian.h +16 -0
- data/ext/stem_UTF_8_spanish.c +1137 -0
- data/ext/stem_UTF_8_spanish.h +16 -0
- data/ext/stem_UTF_8_swedish.c +313 -0
- data/ext/stem_UTF_8_swedish.h +16 -0
- data/ext/stopwords.c +325 -0
- data/ext/store.c +34 -2
- data/ext/tags +2953 -0
- data/ext/term.c +21 -15
- data/ext/termdocs.c +5 -3
- data/ext/utilities.c +446 -0
- data/ext/vector.c +27 -13
- data/lib/ferret/document/document.rb +1 -1
- data/lib/ferret/index/index.rb +44 -6
- data/lib/ferret/query_parser/query_parser.tab.rb +7 -3
- data/lib/rferret.rb +2 -1
- data/test/test_helper.rb +2 -2
- data/test/unit/analysis/ctc_analyzer.rb +401 -0
- data/test/unit/analysis/ctc_tokenstream.rb +423 -0
- data/test/unit/analysis/{tc_letter_tokenizer.rb → rtc_letter_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_lower_case_filter.rb → rtc_lower_case_filter.rb} +0 -0
- data/test/unit/analysis/{tc_lower_case_tokenizer.rb → rtc_lower_case_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_per_field_analyzer_wrapper.rb → rtc_per_field_analyzer_wrapper.rb} +0 -0
- data/test/unit/analysis/{tc_porter_stem_filter.rb → rtc_porter_stem_filter.rb} +0 -0
- data/test/unit/analysis/{tc_standard_analyzer.rb → rtc_standard_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_standard_tokenizer.rb → rtc_standard_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_stop_analyzer.rb → rtc_stop_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_stop_filter.rb → rtc_stop_filter.rb} +0 -0
- data/test/unit/analysis/{tc_white_space_analyzer.rb → rtc_white_space_analyzer.rb} +0 -0
- data/test/unit/analysis/{tc_white_space_tokenizer.rb → rtc_white_space_tokenizer.rb} +0 -0
- data/test/unit/analysis/{tc_word_list_loader.rb → rtc_word_list_loader.rb} +0 -0
- data/test/unit/analysis/tc_analyzer.rb +1 -2
- data/test/unit/analysis/{c_token.rb → tc_token.rb} +0 -0
- data/test/unit/document/rtc_field.rb +28 -0
- data/test/unit/document/{c_document.rb → tc_document.rb} +0 -0
- data/test/unit/document/tc_field.rb +82 -12
- data/test/unit/index/{tc_compound_file_io.rb → rtc_compound_file_io.rb} +0 -0
- data/test/unit/index/{tc_field_infos.rb → rtc_field_infos.rb} +0 -0
- data/test/unit/index/{tc_fields_io.rb → rtc_fields_io.rb} +0 -0
- data/test/unit/index/{tc_multiple_term_doc_pos_enum.rb → rtc_multiple_term_doc_pos_enum.rb} +0 -0
- data/test/unit/index/{tc_segment_infos.rb → rtc_segment_infos.rb} +0 -0
- data/test/unit/index/{tc_segment_term_docs.rb → rtc_segment_term_docs.rb} +0 -0
- data/test/unit/index/{tc_segment_term_enum.rb → rtc_segment_term_enum.rb} +0 -0
- data/test/unit/index/{tc_segment_term_vector.rb → rtc_segment_term_vector.rb} +0 -0
- data/test/unit/index/{tc_term_buffer.rb → rtc_term_buffer.rb} +0 -0
- data/test/unit/index/{tc_term_info.rb → rtc_term_info.rb} +0 -0
- data/test/unit/index/{tc_term_infos_io.rb → rtc_term_infos_io.rb} +0 -0
- data/test/unit/index/{tc_term_vectors_io.rb → rtc_term_vectors_io.rb} +0 -0
- data/test/unit/index/{c_index.rb → tc_index.rb} +26 -6
- data/test/unit/index/{c_index_reader.rb → tc_index_reader.rb} +0 -0
- data/test/unit/index/{c_index_writer.rb → tc_index_writer.rb} +0 -0
- data/test/unit/index/{c_term.rb → tc_term.rb} +0 -0
- data/test/unit/index/{c_term_voi.rb → tc_term_voi.rb} +0 -0
- data/test/unit/query_parser/{c_query_parser.rb → rtc_query_parser.rb} +14 -14
- data/test/unit/query_parser/tc_query_parser.rb +24 -16
- data/test/unit/search/{tc_similarity.rb → rtc_similarity.rb} +0 -0
- data/test/unit/search/rtc_sort_field.rb +14 -0
- data/test/unit/search/{c_filter.rb → tc_filter.rb} +11 -11
- data/test/unit/search/{c_fuzzy_query.rb → tc_fuzzy_query.rb} +0 -0
- data/test/unit/search/{c_index_searcher.rb → tc_index_searcher.rb} +0 -0
- data/test/unit/search/{c_search_and_sort.rb → tc_search_and_sort.rb} +0 -0
- data/test/unit/search/{c_sort.rb → tc_sort.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +20 -7
- data/test/unit/search/{c_spans.rb → tc_spans.rb} +0 -0
- data/test/unit/store/rtc_fs_store.rb +62 -0
- data/test/unit/store/rtc_ram_store.rb +15 -0
- data/test/unit/store/rtm_store.rb +150 -0
- data/test/unit/store/rtm_store_lock.rb +2 -0
- data/test/unit/store/tc_fs_store.rb +54 -40
- data/test/unit/store/tc_ram_store.rb +20 -0
- data/test/unit/store/tm_store.rb +30 -146
- data/test/unit/store/tm_store_lock.rb +66 -0
- data/test/unit/utils/{tc_bit_vector.rb → rtc_bit_vector.rb} +0 -0
- data/test/unit/utils/{tc_date_tools.rb → rtc_date_tools.rb} +0 -0
- data/test/unit/utils/{tc_number_tools.rb → rtc_number_tools.rb} +0 -0
- data/test/unit/utils/{tc_parameter.rb → rtc_parameter.rb} +0 -0
- data/test/unit/utils/{tc_priority_queue.rb → rtc_priority_queue.rb} +0 -0
- data/test/unit/utils/{tc_string_helper.rb → rtc_string_helper.rb} +0 -0
- data/test/unit/utils/{tc_thread.rb → rtc_thread.rb} +0 -0
- data/test/unit/utils/{tc_weak_key_hash.rb → rtc_weak_key_hash.rb} +0 -0
- metadata +360 -289
- data/test/unit/document/c_field.rb +0 -98
- data/test/unit/search/c_sort_field.rb +0 -27
- data/test/unit/store/c_fs_store.rb +0 -76
- data/test/unit/store/c_ram_store.rb +0 -35
- data/test/unit/store/m_store.rb +0 -34
- data/test/unit/store/m_store_lock.rb +0 -68
data/ext/index.h
CHANGED
@@ -11,6 +11,8 @@
|
|
11
11
|
#include "store.h"
|
12
12
|
#include "document.h"
|
13
13
|
#include "analysis.h"
|
14
|
+
#include "similarity.h"
|
15
|
+
|
14
16
|
|
15
17
|
#define SEGMENT_NAME_MAX_LENGTH 100
|
16
18
|
|
@@ -112,26 +114,6 @@ FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc);
|
|
112
114
|
unsigned long long fis_get_number(FieldInfos *fis, char *name);
|
113
115
|
FieldInfo *fis_get_fi(FieldInfos *fis, char *name);
|
114
116
|
|
115
|
-
|
116
|
-
/****************************************************************************
|
117
|
-
*
|
118
|
-
* Term
|
119
|
-
*
|
120
|
-
****************************************************************************/
|
121
|
-
|
122
|
-
typedef struct Term {
|
123
|
-
char *field;
|
124
|
-
char *text;
|
125
|
-
} Term;
|
126
|
-
|
127
|
-
Term *term_clone(Term *term);
|
128
|
-
Term *term_create(const char *field, char *text);
|
129
|
-
void term_destroy(void *p);
|
130
|
-
int term_cmp(void *t1, void *t2);
|
131
|
-
int term_eq(const void *t1, const void *t2);
|
132
|
-
unsigned int term_hash(const void *t);
|
133
|
-
char *term_to_s(Term *term);
|
134
|
-
|
135
117
|
/****************************************************************************
|
136
118
|
*
|
137
119
|
* TermBuffer
|
@@ -566,8 +548,6 @@ void p_add_occurance(Posting *p, int position, TVOffsetInfo *offset);
|
|
566
548
|
*
|
567
549
|
****************************************************************************/
|
568
550
|
|
569
|
-
#include "search.h"
|
570
|
-
|
571
551
|
typedef struct DocumentWriter {
|
572
552
|
Store *store;
|
573
553
|
Analyzer *analyzer;
|
data/ext/index_io.c
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
#include <store.h>
|
2
2
|
#include <string.h>
|
3
3
|
|
4
|
+
static char * const STORE_EOF_ERROR_MSG = "EOF Error when trying to refill";
|
5
|
+
|
4
6
|
Buffer *buf_create()
|
5
7
|
{
|
6
8
|
Buffer *buf = ALLOC(Buffer);
|
@@ -102,7 +104,7 @@ void is_refill(InStream *is)
|
|
102
104
|
|
103
105
|
is->buf.len = last - start;
|
104
106
|
if (is->buf.len <= 0) {
|
105
|
-
|
107
|
+
RAISE(EOF_ERROR, STORE_EOF_ERROR_MSG);
|
106
108
|
}
|
107
109
|
|
108
110
|
is->read_internal(is, is->buf.buf, 0, is->buf.len);
|
@@ -166,16 +168,6 @@ InStream *is_clone(InStream *is)
|
|
166
168
|
{
|
167
169
|
InStream *new_index_i = ALLOC(InStream);
|
168
170
|
memcpy(new_index_i, is, sizeof(InStream));
|
169
|
-
//new_index_i->buf.start = is->buf.start;
|
170
|
-
//new_index_i->buf.pos = is->buf.pos;
|
171
|
-
//new_index_i->buf.len = is->buf.len;
|
172
|
-
//new_index_i->file = is->file;
|
173
|
-
//new_index_i->d = is->d;
|
174
|
-
//new_index_i->read_internal = is->read_internal;
|
175
|
-
//new_index_i->seek_internal = is->seek_internal;
|
176
|
-
//new_index_i->length_internal = is->length_internal;
|
177
|
-
//new_index_i->clone_internal = is->clone_internal;
|
178
|
-
//new_index_i->close_internal = is->close_internal;
|
179
171
|
new_index_i->is_clone = true;
|
180
172
|
is->clone_internal(is, new_index_i);
|
181
173
|
return new_index_i;
|
data/ext/index_rw.c
CHANGED
@@ -2,6 +2,13 @@
|
|
2
2
|
#include <stdlib.h>
|
3
3
|
#include <string.h>
|
4
4
|
#include <array.h>
|
5
|
+
static char * const FORMAT_VERSION_ERROR_MSG = "Unknown format version";
|
6
|
+
static char * const WRITE_LOCK_ERROR_MSG = "Could not obtain write lock when trying to write index";
|
7
|
+
static char * const COMMIT_LOCK_ERROR_MSG = "Could not obtain commit lock when trying to write index";
|
8
|
+
static char * const DELETED_DOC_ERROR_MSG = "Tried to get doc that has already been deleted";
|
9
|
+
static char * const INVALID_FIELD_TYPE_MSG = "Invalid field-type";
|
10
|
+
static char * const DOC_ORDER_ERROR_MSG = "docs out of order curent";
|
11
|
+
static char * const STALE_READER_ERROR_MSG = "IndexReader out of date and no longer valid for delete, undelete, or set_norm operations";
|
5
12
|
|
6
13
|
const char *INDEX_EXTENSIONS[] = {
|
7
14
|
"cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
|
@@ -260,81 +267,84 @@ Posting **dw_sort_posting_table(DocumentWriter *dw)
|
|
260
267
|
|
261
268
|
void dw_write_postings(DocumentWriter *dw, Posting **postings, char *segment)
|
262
269
|
{
|
263
|
-
OutStream *freq_out, *prox_out;
|
264
|
-
TermInfosWriter *tiw;
|
265
|
-
TermVectorsWriter *tvw = NULL;
|
266
|
-
FieldInfo *fi;
|
270
|
+
OutStream * volatile freq_out = NULL, * volatile prox_out = NULL;
|
271
|
+
TermInfosWriter * volatile tiw = NULL;
|
272
|
+
TermVectorsWriter * volatile tvw = NULL;
|
267
273
|
Store *store = dw->store;
|
268
|
-
TermInfo *ti;
|
274
|
+
TermInfo * volatile ti = NULL;
|
269
275
|
Posting *posting;
|
270
276
|
int i, j, posting_freq, position, last_position;
|
271
277
|
char fname[SEGMENT_NAME_MAX_LENGTH], *curr_field = NULL, *term_field;
|
272
278
|
strcpy(fname, segment);
|
273
279
|
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
280
|
+
TRY
|
281
|
+
//open files for inverse index storage
|
282
|
+
sprintf(fname, "%s.frq", segment);
|
283
|
+
freq_out = store->create_output(store, fname);
|
284
|
+
sprintf(fname, "%s.prx", segment);
|
285
|
+
prox_out = store->create_output(store, fname);
|
286
|
+
tiw = tiw_open(store, segment, dw->fis, dw->term_index_interval);
|
287
|
+
ti = ti_create(0, 0, 0, 0);
|
288
|
+
|
289
|
+
for (i = 0; i < dw->pcnt; i++) {
|
290
|
+
posting = postings[i];
|
291
|
+
|
292
|
+
// add an entry to the dictionary with pointers to prox and freq_out files
|
293
|
+
ti_set(ti, 1, os_pos(freq_out), os_pos(prox_out), -1);
|
294
|
+
tiw_add(tiw, posting->term, ti);
|
295
|
+
|
296
|
+
// add an entry to the freq_out file
|
297
|
+
posting_freq = posting->freq;
|
298
|
+
if (posting_freq == 1) { // optimize freq=1
|
299
|
+
os_write_vint(freq_out, 1); // set low bit of doc num.
|
300
|
+
} else {
|
301
|
+
os_write_vint(freq_out, 0); // the doc number
|
302
|
+
os_write_vint(freq_out, posting_freq); // frequency in doc
|
303
|
+
}
|
297
304
|
|
298
|
-
|
305
|
+
last_position = 0; // write positions
|
299
306
|
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
307
|
+
for (j = 0; j < posting_freq; j++) {
|
308
|
+
position = posting->positions[j];
|
309
|
+
os_write_vint(prox_out, position - last_position);
|
310
|
+
last_position = position;
|
311
|
+
}
|
305
312
|
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
if (
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
313
|
+
// check to see if we switched to a new field
|
314
|
+
term_field = posting->term->field;
|
315
|
+
if (curr_field != term_field) {
|
316
|
+
FieldInfo *fi;
|
317
|
+
// changing field - see if there is something to save
|
318
|
+
curr_field = term_field;
|
319
|
+
fi = (FieldInfo *)ht_get(dw->fis->by_name, curr_field);
|
320
|
+
if (fi->store_tv) {
|
321
|
+
if (tvw == NULL) {
|
322
|
+
tvw = tvw_open(store, segment, dw->fis);
|
323
|
+
tvw_open_doc(tvw);
|
324
|
+
}
|
325
|
+
tvw_open_field(tvw, curr_field);
|
318
326
|
|
319
|
-
|
320
|
-
|
327
|
+
} else if (tvw != NULL) {
|
328
|
+
tvw_close_field(tvw);
|
329
|
+
}
|
330
|
+
}
|
331
|
+
// tvw->curr_field != NULL implies field is still open
|
332
|
+
if (tvw != NULL && tvw->curr_field != NULL) {
|
333
|
+
tvw_add_term(tvw, posting->term->text, posting_freq, posting->positions, posting->offsets);
|
321
334
|
}
|
322
335
|
}
|
323
|
-
|
324
|
-
if (tvw
|
325
|
-
|
336
|
+
XFINALLY
|
337
|
+
if (tvw) {
|
338
|
+
tvw_close_doc(tvw);
|
339
|
+
tvw_close(tvw);
|
326
340
|
}
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
os_close(freq_out);
|
335
|
-
os_close(prox_out);
|
336
|
-
tiw_close(tiw);
|
337
|
-
ti_destroy(ti);
|
341
|
+
// make an effort to close all streams we can but remember and re-raise
|
342
|
+
// the last exception encountered in this process
|
343
|
+
if (freq_out) os_close(freq_out);
|
344
|
+
if (prox_out) os_close(prox_out);
|
345
|
+
if (tiw) tiw_close(tiw);
|
346
|
+
if (ti) ti_destroy(ti);
|
347
|
+
XENDTRY
|
338
348
|
}
|
339
349
|
|
340
350
|
void dw_write_norms(DocumentWriter *dw, char *segment)
|
@@ -353,8 +363,11 @@ void dw_write_norms(DocumentWriter *dw, char *segment)
|
|
353
363
|
norm = dw->field_boosts[i] * sim_length_norm(dw->similarity, fi->name, dw->field_lengths[i]);
|
354
364
|
sprintf(fname, "%s.f%d", segment, i);
|
355
365
|
norms_out = dw->store->create_output(dw->store, fname);
|
356
|
-
|
357
|
-
|
366
|
+
TRY
|
367
|
+
os_write_byte(norms_out, sim_encode_norm(dw->similarity, norm));
|
368
|
+
XFINALLY
|
369
|
+
os_close(norms_out);
|
370
|
+
XENDTRY
|
358
371
|
}
|
359
372
|
}
|
360
373
|
}
|
@@ -369,8 +382,11 @@ void dw_add_doc(DocumentWriter *dw, char *segment, Document *doc)
|
|
369
382
|
|
370
383
|
// write field values
|
371
384
|
FieldsWriter *fw = fw_open(dw->store, segment, dw->fis);
|
372
|
-
|
373
|
-
|
385
|
+
TRY
|
386
|
+
fw_add_doc(fw, doc);
|
387
|
+
XFINALLY
|
388
|
+
fw_close(fw);
|
389
|
+
XENDTRY
|
374
390
|
|
375
391
|
// invert doc into posting_table
|
376
392
|
h_clear(dw->postingtable); // clear posting_table
|
@@ -547,34 +563,38 @@ void sis_read(SegmentInfos *sis, Store *store)
|
|
547
563
|
int doc_cnt;
|
548
564
|
char *name;
|
549
565
|
InStream *is = store->open_input(store, SEGMENT_FILENAME);
|
550
|
-
sis->format = is_read_int(is);
|
551
566
|
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
567
|
+
TRY
|
568
|
+
|
569
|
+
sis->format = is_read_int(is);
|
570
|
+
if (sis->format < 0) { // file contains explicit format info
|
571
|
+
// check that it is a format we can understand
|
572
|
+
if (sis->format < FORMAT)
|
573
|
+
RAISE(ERROR, FORMAT_VERSION_ERROR_MSG);
|
574
|
+
sis->version = is_read_long(is);
|
575
|
+
sis->counter = is_read_int(is);
|
576
|
+
} else { // file is in old format without explicit format info
|
577
|
+
sis->counter = sis->format;
|
578
|
+
}
|
579
|
+
|
580
|
+
int seg_count = is_read_int(is);
|
581
|
+
int i;
|
582
|
+
for (i = 0; i < seg_count; i++) {
|
583
|
+
name = is_read_string(is);
|
584
|
+
doc_cnt = is_read_int(is);
|
585
|
+
sis_add_si(sis, si_create(name, doc_cnt, store));
|
586
|
+
}
|
587
|
+
|
588
|
+
if (sis->format >= 0) {
|
589
|
+
// in old format the version number may be at the end of the file
|
590
|
+
if (is_pos(is) >= is_length(is))
|
591
|
+
sis->version = 0; // old file format without version number
|
592
|
+
else
|
593
|
+
sis->version = is_read_long(is); // read version
|
594
|
+
}
|
595
|
+
XFINALLY
|
596
|
+
is_close(is);
|
597
|
+
XENDTRY
|
578
598
|
}
|
579
599
|
|
580
600
|
void sis_write(SegmentInfos *sis, Store *store)
|
@@ -582,17 +602,20 @@ void sis_write(SegmentInfos *sis, Store *store)
|
|
582
602
|
int i;
|
583
603
|
SegmentInfo *si;
|
584
604
|
OutStream *os = store->create_output(store, TEMPORARY_SEGMENT_FILENAME);
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
605
|
+
TRY
|
606
|
+
os_write_int(os, FORMAT);
|
607
|
+
os_write_long(os, ++(sis->version)); // every write changes the index
|
608
|
+
os_write_int(os, sis->counter);
|
609
|
+
os_write_int(os, sis->scnt);
|
610
|
+
for (i = 0; i < sis->scnt; i++) {
|
611
|
+
si = sis->segs[i];
|
612
|
+
os_write_string(os, si->name);
|
613
|
+
os_write_int(os, si->doc_cnt);
|
614
|
+
}
|
594
615
|
|
595
|
-
|
616
|
+
XFINALLY
|
617
|
+
os_close(os);
|
618
|
+
XENDTRY
|
596
619
|
|
597
620
|
//install new segment info
|
598
621
|
store->rename(store, TEMPORARY_SEGMENT_FILENAME, SEGMENT_FILENAME);
|
@@ -605,13 +628,17 @@ int sis_read_current_version(Store *store)
|
|
605
628
|
InStream *is = store->open_input(store, SEGMENT_FILENAME);
|
606
629
|
int format = 0;
|
607
630
|
int version = 0;
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
631
|
+
|
632
|
+
TRY
|
633
|
+
format = is_read_int(is);
|
634
|
+
if (format < 0) {
|
635
|
+
if (format < FORMAT)
|
636
|
+
RAISE(ERROR, FORMAT_VERSION_ERROR_MSG);
|
637
|
+
version = is_read_long(is);
|
638
|
+
}
|
639
|
+
XFINALLY
|
640
|
+
is_close(is);
|
641
|
+
XENDTRY
|
615
642
|
|
616
643
|
if (format < 0)
|
617
644
|
return version;
|
@@ -658,22 +685,23 @@ IndexWriter *iw_open(Store *store, Analyzer *analyzer,
|
|
658
685
|
// keep the write_lock obtained until the IndexWriter is closed.
|
659
686
|
iw->write_lock = store->open_lock(store, WRITE_LOCK_NAME);
|
660
687
|
if (!iw->write_lock->obtain(iw->write_lock)) {
|
661
|
-
|
662
|
-
"Could not obtain write lock when trying to write index");
|
688
|
+
RAISE(STATE_ERROR, WRITE_LOCK_ERROR_MSG);
|
663
689
|
}
|
664
690
|
|
665
691
|
if (create) {
|
666
692
|
Lock *commit_lock = store->open_lock(store, COMMIT_LOCK_NAME);
|
667
693
|
if (!commit_lock->obtain(commit_lock)) {
|
668
|
-
|
669
|
-
|
694
|
+
store->close_lock(commit_lock);
|
695
|
+
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
670
696
|
}
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
697
|
+
TRY
|
698
|
+
// commit the index
|
699
|
+
store->clear(store);
|
700
|
+
sis_write(iw->sis, store);
|
701
|
+
XFINALLY
|
702
|
+
commit_lock->release(commit_lock);
|
703
|
+
store->close_lock(commit_lock);
|
704
|
+
XENDTRY
|
677
705
|
} else {
|
678
706
|
sis_read(iw->sis, store);
|
679
707
|
}
|
@@ -743,8 +771,7 @@ void make_compound_file(IndexWriter *iw, char *merged_name, SegmentMerger *merge
|
|
743
771
|
Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
|
744
772
|
|
745
773
|
if (!commit_lock->obtain(commit_lock)) {
|
746
|
-
|
747
|
-
"Could not obtain commit lock when trying to commit index");
|
774
|
+
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
748
775
|
}
|
749
776
|
|
750
777
|
// make compound file visible for SegmentReaders
|
@@ -789,8 +816,7 @@ void iw_merge_segments_with_max(IndexWriter *iw, int min_segment, int max_segmen
|
|
789
816
|
mutex_lock(&iw->store->mutex);
|
790
817
|
Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
|
791
818
|
if (!commit_lock->obtain(commit_lock)) {
|
792
|
-
|
793
|
-
"Could not obtain commit lock when trying to commit index");
|
819
|
+
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
794
820
|
}
|
795
821
|
// commit the index
|
796
822
|
sis_write(iw->sis, iw->store);
|
@@ -998,7 +1024,7 @@ void iw_add_readers(IndexWriter *iw, IndexReader **irs, int cnt)
|
|
998
1024
|
|
999
1025
|
Lock *commit_lock = iw->store->open_lock(iw->store, COMMIT_LOCK_NAME);
|
1000
1026
|
if (!commit_lock->obtain(commit_lock)) // obtain write lock
|
1001
|
-
|
1027
|
+
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
1002
1028
|
|
1003
1029
|
sis_write(iw->sis, iw->store); // commit changes
|
1004
1030
|
iw_delete_segments(iw, &ir, del_cnt);
|
@@ -1051,8 +1077,11 @@ void norm_rewrite(Norm *norm, Store *store, char *segment,
|
|
1051
1077
|
char norm_fname[SEGMENT_NAME_MAX_LENGTH];
|
1052
1078
|
sprintf(tmp_fname, "%s.tmp", segment);
|
1053
1079
|
OutStream *os = store->create_output(store, tmp_fname);
|
1054
|
-
|
1055
|
-
|
1080
|
+
TRY
|
1081
|
+
os_write_bytes(os, norm->bytes, doc_count);
|
1082
|
+
XFINALLY
|
1083
|
+
os_close(os);
|
1084
|
+
XENDTRY
|
1056
1085
|
if (cfs_store) {
|
1057
1086
|
sprintf(norm_fname, "%s.s%d", segment, norm->field_num);
|
1058
1087
|
} else {
|
@@ -1190,8 +1219,7 @@ Document *sr_get_doc(IndexReader *ir, int doc_num)
|
|
1190
1219
|
mutex_lock(&ir->mutex);
|
1191
1220
|
if (sr_is_deleted_internal(ir, doc_num)) {
|
1192
1221
|
mutex_unlock(&ir->mutex);
|
1193
|
-
|
1194
|
-
"Tried to get doc <%ld> that has already been deleted", doc_num);
|
1222
|
+
RAISE(STATE_ERROR, DELETED_DOC_ERROR_MSG);
|
1195
1223
|
}
|
1196
1224
|
GET_SR;
|
1197
1225
|
doc = fr_get_doc(sr->fr, doc_num);
|
@@ -1359,7 +1387,7 @@ HashSet *sr_get_field_names(IndexReader *ir, int field_type)
|
|
1359
1387
|
if (fi->store_pos && fi->store_offset) hs_add(field_set, fi->name);
|
1360
1388
|
break;
|
1361
1389
|
default:
|
1362
|
-
|
1390
|
+
RAISE(ARG_ERROR, INVALID_FIELD_TYPE_MSG);
|
1363
1391
|
}
|
1364
1392
|
}
|
1365
1393
|
return field_set;
|
@@ -1500,6 +1528,7 @@ IndexReader *sr_open_internal(IndexReader *ir, SegmentInfo *si)
|
|
1500
1528
|
}
|
1501
1529
|
|
1502
1530
|
sprintf(fname, "%s.fnm", sr->segment);
|
1531
|
+
|
1503
1532
|
sr->fis = fis_open(store, fname);
|
1504
1533
|
sr->fr = fr_open(store, sr->segment, sr->fis);
|
1505
1534
|
|
@@ -2023,19 +2052,22 @@ int sm_merge_fields(SegmentMerger *sm)
|
|
2023
2052
|
// merge field values
|
2024
2053
|
FieldsWriter *fw = fw_open(sm->store, sm->name, fis);
|
2025
2054
|
|
2026
|
-
|
2027
|
-
|
2028
|
-
|
2029
|
-
|
2030
|
-
|
2031
|
-
|
2032
|
-
|
2033
|
-
|
2034
|
-
|
2055
|
+
TRY
|
2056
|
+
for (i = 0; i < sm->readers->size; i++) {
|
2057
|
+
IndexReader *ir = sm->readers->elems[i];
|
2058
|
+
maxdoc = ir->max_doc(ir);
|
2059
|
+
for (j = 0; j < maxdoc; j++) {
|
2060
|
+
if (!ir->is_deleted(ir, j)) { // skip deleted docs
|
2061
|
+
doc = ir->get_doc(ir, j);
|
2062
|
+
fw_add_doc(fw, doc);
|
2063
|
+
doc_destroy(doc);
|
2064
|
+
doc_count++;
|
2065
|
+
}
|
2035
2066
|
}
|
2036
2067
|
}
|
2037
|
-
|
2038
|
-
|
2068
|
+
XFINALLY
|
2069
|
+
fw_close(fw);
|
2070
|
+
XENDTRY
|
2039
2071
|
return doc_count;
|
2040
2072
|
}
|
2041
2073
|
|
@@ -2084,9 +2116,7 @@ int sm_append_postings(SegmentMerger *sm, SegmentMergeInfo **smis, int cnt)
|
|
2084
2116
|
doc += base; // convert to merged space
|
2085
2117
|
|
2086
2118
|
if (doc < last_doc)
|
2087
|
-
|
2088
|
-
"docs out of order curent doc = %ld and previous doc = %ld",
|
2089
|
-
doc, last_doc);
|
2119
|
+
RAISE(STATE_ERROR, DOC_ORDER_ERROR_MSG);
|
2090
2120
|
|
2091
2121
|
df++;
|
2092
2122
|
|
@@ -2205,28 +2235,32 @@ void sm_merge_terms(SegmentMerger *sm)
|
|
2205
2235
|
{
|
2206
2236
|
int i;
|
2207
2237
|
char fname[SEGMENT_NAME_MAX_LENGTH];
|
2208
|
-
sprintf(fname, "%s.frq", sm->name);
|
2209
|
-
sm->freq_out = sm->store->create_output(sm->store, fname);
|
2210
|
-
sprintf(fname, "%s.prx", sm->name);
|
2211
|
-
sm->prox_out = sm->store->create_output(sm->store, fname);
|
2212
|
-
sm->tiw = tiw_open(sm->store, sm->name, sm->fis, sm->term_index_interval);
|
2213
|
-
// terms_buf_pointer holds a buffer of terms since the TermInfosWriter needs
|
2214
|
-
// to keep the last index_interval terms so that it can compare the last term
|
2215
|
-
// put in the index with the next one. So the size of the buffer must by
|
2216
|
-
// index_interval + 2.
|
2217
|
-
sm->terms_buf_pointer = 0;
|
2218
|
-
sm->terms_buf_size = sm->tiw->index_interval + 2;
|
2219
|
-
sm->terms_buf = ALLOC_N(Term, sm->terms_buf_size);
|
2220
|
-
for (i = 0; i < sm->terms_buf_size; i++) {
|
2221
|
-
sm->terms_buf[i].field = NULL;
|
2222
|
-
sm->terms_buf[i].text = ALLOC_N(char, MAX_WORD_SIZE);
|
2223
|
-
}
|
2224
|
-
sm->skip_interval = sm->tiw->skip_interval;
|
2225
|
-
sm->queue = pq_create(sm->readers->size, &smi_lt);
|
2226
|
-
|
2227
|
-
sm_merge_term_infos(sm);
|
2228
2238
|
|
2229
|
-
|
2239
|
+
TRY
|
2240
|
+
sprintf(fname, "%s.frq", sm->name);
|
2241
|
+
sm->freq_out = sm->store->create_output(sm->store, fname);
|
2242
|
+
sprintf(fname, "%s.prx", sm->name);
|
2243
|
+
sm->prox_out = sm->store->create_output(sm->store, fname);
|
2244
|
+
sm->tiw = tiw_open(sm->store, sm->name, sm->fis, sm->term_index_interval);
|
2245
|
+
// terms_buf_pointer holds a buffer of terms since the TermInfosWriter needs
|
2246
|
+
// to keep the last index_interval terms so that it can compare the last term
|
2247
|
+
// put in the index with the next one. So the size of the buffer must by
|
2248
|
+
// index_interval + 2.
|
2249
|
+
sm->terms_buf_pointer = 0;
|
2250
|
+
sm->terms_buf_size = sm->tiw->index_interval + 2;
|
2251
|
+
sm->terms_buf = ALLOC_N(Term, sm->terms_buf_size);
|
2252
|
+
for (i = 0; i < sm->terms_buf_size; i++) {
|
2253
|
+
sm->terms_buf[i].field = NULL;
|
2254
|
+
sm->terms_buf[i].text = ALLOC_N(char, MAX_WORD_SIZE);
|
2255
|
+
}
|
2256
|
+
sm->skip_interval = sm->tiw->skip_interval;
|
2257
|
+
sm->queue = pq_create(sm->readers->size, &smi_lt);
|
2258
|
+
|
2259
|
+
sm_merge_term_infos(sm);
|
2260
|
+
|
2261
|
+
XFINALLY
|
2262
|
+
sm_close(sm);
|
2263
|
+
XENDTRY
|
2230
2264
|
}
|
2231
2265
|
|
2232
2266
|
void sm_merge_norms(SegmentMerger *sm)
|
@@ -2242,20 +2276,23 @@ void sm_merge_norms(SegmentMerger *sm)
|
|
2242
2276
|
if (fi->is_indexed && !fi->omit_norms) {
|
2243
2277
|
sprintf(fname, "%s.f%d", sm->name, i);
|
2244
2278
|
os = sm->store->create_output(sm->store, fname);
|
2245
|
-
|
2246
|
-
|
2247
|
-
|
2248
|
-
|
2249
|
-
|
2250
|
-
|
2251
|
-
|
2252
|
-
|
2253
|
-
|
2279
|
+
TRY
|
2280
|
+
for (j = 0; j < sm->readers->size; j++) {
|
2281
|
+
ir = sm->readers->elems[j];
|
2282
|
+
max_doc = ir->max_doc(ir);
|
2283
|
+
norm_buf = ALLOC_N(uchar, max_doc);
|
2284
|
+
memset(norm_buf, 0, sizeof(uchar) * max_doc);
|
2285
|
+
ir->get_norms_into(ir, fi->name, norm_buf, 0);
|
2286
|
+
for (k = 0; k < max_doc; k++) {
|
2287
|
+
if (!ir->is_deleted(ir, k)) {
|
2288
|
+
os_write_byte(os, norm_buf[k]);
|
2289
|
+
}
|
2254
2290
|
}
|
2291
|
+
free(norm_buf);
|
2255
2292
|
}
|
2256
|
-
|
2257
|
-
|
2258
|
-
|
2293
|
+
XFINALLY
|
2294
|
+
os_close(os);
|
2295
|
+
XENDTRY
|
2259
2296
|
}
|
2260
2297
|
}
|
2261
2298
|
}
|
@@ -2266,19 +2303,22 @@ void sm_merge_vectors(SegmentMerger *sm)
|
|
2266
2303
|
TermVectorsWriter *tvw = tvw_open(sm->store, sm->name, sm->fis);
|
2267
2304
|
IndexReader *ir;
|
2268
2305
|
Array *tvs;
|
2269
|
-
|
2270
|
-
|
2271
|
-
|
2272
|
-
|
2273
|
-
|
2274
|
-
|
2275
|
-
|
2276
|
-
|
2277
|
-
|
2306
|
+
TRY
|
2307
|
+
for (i = 0; i < sm->readers->size; i++) {
|
2308
|
+
ir = sm->readers->elems[i];
|
2309
|
+
max_doc = ir->max_doc(ir);
|
2310
|
+
for (j = 0; j < max_doc; j++) {
|
2311
|
+
// skip deleted docs
|
2312
|
+
if (! ir->is_deleted(ir, j)) {
|
2313
|
+
tvs = ir->get_term_vectors(ir, j);
|
2314
|
+
tvw_add_all_doc_vectors(tvw, tvs);
|
2315
|
+
ary_destroy(tvs);
|
2316
|
+
}
|
2278
2317
|
}
|
2279
2318
|
}
|
2280
|
-
|
2281
|
-
|
2319
|
+
XFINALLY
|
2320
|
+
tvw_close(tvw);
|
2321
|
+
XENDTRY
|
2282
2322
|
}
|
2283
2323
|
|
2284
2324
|
int sm_merge(SegmentMerger *sm)
|
@@ -2342,12 +2382,12 @@ void ir_acquire_not_necessary(IndexReader *ir) {}
|
|
2342
2382
|
void ir_acquire_write_lock(IndexReader *ir)
|
2343
2383
|
{
|
2344
2384
|
if (ir->is_stale)
|
2345
|
-
|
2385
|
+
RAISE(STATE_ERROR, STALE_READER_ERROR_MSG);
|
2346
2386
|
|
2347
2387
|
if (ir->write_lock == NULL) {
|
2348
2388
|
ir->write_lock = ir->store->open_lock(ir->store, WRITE_LOCK_NAME);
|
2349
2389
|
if (!ir->write_lock->obtain(ir->write_lock)) // obtain write lock
|
2350
|
-
|
2390
|
+
RAISE(STATE_ERROR, WRITE_LOCK_ERROR_MSG);
|
2351
2391
|
|
2352
2392
|
// we have to check whether index has changed since this reader was opened.
|
2353
2393
|
// if so, this reader is no longer valid for deletion
|
@@ -2356,7 +2396,7 @@ void ir_acquire_write_lock(IndexReader *ir)
|
|
2356
2396
|
ir->write_lock->release(ir->write_lock);
|
2357
2397
|
ir->store->close_lock(ir->write_lock);
|
2358
2398
|
ir->write_lock = NULL;
|
2359
|
-
|
2399
|
+
RAISE(STATE_ERROR, STALE_READER_ERROR_MSG);
|
2360
2400
|
}
|
2361
2401
|
}
|
2362
2402
|
}
|
@@ -2472,7 +2512,7 @@ void ir_commit_internal(IndexReader *ir)
|
|
2472
2512
|
mutex_lock(&ir->store->mutex);
|
2473
2513
|
Lock *commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
|
2474
2514
|
if (!commit_lock->obtain(commit_lock)) // obtain write lock
|
2475
|
-
|
2515
|
+
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
2476
2516
|
|
2477
2517
|
ir->do_commit(ir);
|
2478
2518
|
sis_write(ir->sis, ir->store);
|
@@ -2538,6 +2578,18 @@ void ir_add_cache(IndexReader *ir)
|
|
2538
2578
|
|
2539
2579
|
bool ir_is_latest(IndexReader *ir)
|
2540
2580
|
{
|
2541
|
-
|
2581
|
+
bool is_latest = false;
|
2582
|
+
Lock *commit_lock = ir->store->open_lock(ir->store, COMMIT_LOCK_NAME);
|
2583
|
+
if (!commit_lock->obtain(commit_lock)) {
|
2584
|
+
ir->store->close_lock(commit_lock);
|
2585
|
+
RAISE(STATE_ERROR, COMMIT_LOCK_ERROR_MSG);
|
2586
|
+
}
|
2587
|
+
TRY
|
2588
|
+
is_latest = (sis_read_current_version(ir->store) == ir->sis->version);
|
2589
|
+
XFINALLY
|
2590
|
+
commit_lock->release(commit_lock);
|
2591
|
+
ir->store->close_lock(commit_lock);
|
2592
|
+
XENDTRY
|
2593
|
+
return is_latest;
|
2542
2594
|
}
|
2543
2595
|
|