ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
- data/Rakefile +51 -25
- data/ext/analysis.c +553 -0
- data/ext/analysis.h +76 -0
- data/ext/array.c +83 -0
- data/ext/array.h +19 -0
- data/ext/bitvector.c +164 -0
- data/ext/bitvector.h +29 -0
- data/ext/compound_io.c +335 -0
- data/ext/document.c +336 -0
- data/ext/document.h +87 -0
- data/ext/ferret.c +88 -47
- data/ext/ferret.h +43 -109
- data/ext/field.c +395 -0
- data/ext/filter.c +103 -0
- data/ext/fs_store.c +352 -0
- data/ext/global.c +219 -0
- data/ext/global.h +73 -0
- data/ext/hash.c +446 -0
- data/ext/hash.h +80 -0
- data/ext/hashset.c +141 -0
- data/ext/hashset.h +37 -0
- data/ext/helper.c +11 -0
- data/ext/helper.h +5 -0
- data/ext/inc/lang.h +41 -0
- data/ext/ind.c +389 -0
- data/ext/index.h +884 -0
- data/ext/index_io.c +269 -415
- data/ext/index_rw.c +2543 -0
- data/ext/lang.c +31 -0
- data/ext/lang.h +41 -0
- data/ext/priorityqueue.c +228 -0
- data/ext/priorityqueue.h +44 -0
- data/ext/q_boolean.c +1331 -0
- data/ext/q_const_score.c +154 -0
- data/ext/q_fuzzy.c +287 -0
- data/ext/q_match_all.c +142 -0
- data/ext/q_multi_phrase.c +343 -0
- data/ext/q_parser.c +2180 -0
- data/ext/q_phrase.c +657 -0
- data/ext/q_prefix.c +75 -0
- data/ext/q_range.c +247 -0
- data/ext/q_span.c +1566 -0
- data/ext/q_term.c +308 -0
- data/ext/q_wildcard.c +146 -0
- data/ext/r_analysis.c +255 -0
- data/ext/r_doc.c +578 -0
- data/ext/r_index_io.c +996 -0
- data/ext/r_qparser.c +158 -0
- data/ext/r_search.c +2321 -0
- data/ext/r_store.c +263 -0
- data/ext/r_term.c +219 -0
- data/ext/ram_store.c +447 -0
- data/ext/search.c +524 -0
- data/ext/search.h +1065 -0
- data/ext/similarity.c +143 -39
- data/ext/sort.c +661 -0
- data/ext/store.c +35 -0
- data/ext/store.h +152 -0
- data/ext/term.c +704 -143
- data/ext/termdocs.c +599 -0
- data/ext/vector.c +594 -0
- data/lib/ferret.rb +9 -10
- data/lib/ferret/analysis/analyzers.rb +2 -2
- data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
- data/lib/ferret/analysis/token.rb +14 -14
- data/lib/ferret/analysis/token_filters.rb +3 -3
- data/lib/ferret/document/field.rb +16 -17
- data/lib/ferret/index/document_writer.rb +4 -4
- data/lib/ferret/index/index.rb +39 -23
- data/lib/ferret/index/index_writer.rb +2 -2
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
- data/lib/ferret/index/segment_term_vector.rb +4 -4
- data/lib/ferret/index/term.rb +5 -1
- data/lib/ferret/index/term_vector_offset_info.rb +6 -6
- data/lib/ferret/index/term_vectors_io.rb +5 -5
- data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
- data/lib/ferret/search.rb +1 -1
- data/lib/ferret/search/boolean_query.rb +2 -1
- data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
- data/lib/ferret/search/fuzzy_query.rb +2 -1
- data/lib/ferret/search/index_searcher.rb +3 -0
- data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
- data/lib/ferret/search/multi_phrase_query.rb +6 -5
- data/lib/ferret/search/phrase_query.rb +3 -6
- data/lib/ferret/search/prefix_query.rb +4 -4
- data/lib/ferret/search/sort.rb +3 -1
- data/lib/ferret/search/sort_field.rb +9 -9
- data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
- data/lib/ferret/search/spans/span_near_query.rb +1 -1
- data/lib/ferret/search/spans/span_weight.rb +1 -1
- data/lib/ferret/search/spans/spans_enum.rb +7 -7
- data/lib/ferret/store/fs_store.rb +10 -6
- data/lib/ferret/store/ram_store.rb +3 -3
- data/lib/rferret.rb +36 -0
- data/test/functional/thread_safety_index_test.rb +2 -2
- data/test/test_helper.rb +16 -2
- data/test/unit/analysis/c_token.rb +25 -0
- data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
- data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
- data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
- data/test/unit/document/c_field.rb +98 -0
- data/test/unit/document/tc_field.rb +0 -66
- data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
- data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
- data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
- data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
- data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
- data/test/unit/index/tc_segment_term_vector.rb +2 -2
- data/test/unit/index/tc_term_vectors_io.rb +4 -4
- data/test/unit/query_parser/c_query_parser.rb +138 -0
- data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
- data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
- data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
- data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
- data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
- data/test/unit/search/c_sort_field.rb +27 -0
- data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
- data/test/unit/search/tc_sort_field.rb +7 -20
- data/test/unit/store/c_fs_store.rb +76 -0
- data/test/unit/store/c_ram_store.rb +35 -0
- data/test/unit/store/m_store.rb +34 -0
- data/test/unit/store/m_store_lock.rb +68 -0
- data/test/unit/store/tc_fs_store.rb +0 -53
- data/test/unit/store/tc_ram_store.rb +0 -20
- data/test/unit/store/tm_store.rb +0 -30
- data/test/unit/store/tm_store_lock.rb +0 -66
- metadata +84 -31
- data/ext/Makefile +0 -140
- data/ext/ferret_ext.so +0 -0
- data/ext/priority_queue.c +0 -232
- data/ext/ram_directory.c +0 -321
- data/ext/segment_merge_queue.c +0 -37
- data/ext/segment_term_enum.c +0 -326
- data/ext/string_helper.c +0 -42
- data/ext/tags +0 -344
- data/ext/term_buffer.c +0 -230
- data/ext/term_infos_reader.c +0 -54
- data/ext/terminfo.c +0 -160
- data/ext/token.c +0 -93
- data/ext/util.c +0 -12
data/ext/ferret.h
CHANGED
@@ -1,128 +1,62 @@
|
|
1
1
|
#ifndef __FERRET_H_
|
2
2
|
#define __FERRET_H_
|
3
3
|
|
4
|
-
#include
|
5
|
-
|
6
|
-
#define BUFFER_SIZE 1024
|
7
|
-
|
8
|
-
typedef unsigned char byte_t;
|
9
|
-
|
10
|
-
typedef struct IndexBuffer {
|
11
|
-
long start;
|
12
|
-
int len;
|
13
|
-
int pos;
|
14
|
-
byte_t *buffer;
|
15
|
-
} IndexBuffer;
|
16
|
-
|
17
|
-
typedef struct Term {
|
18
|
-
VALUE field;
|
19
|
-
char *text;
|
20
|
-
int tlen;
|
21
|
-
} Term;
|
22
|
-
|
23
|
-
typedef struct PriorityQueue {
|
24
|
-
VALUE *heap;
|
25
|
-
int len;
|
26
|
-
int size;
|
27
|
-
} PriorityQueue;
|
28
|
-
|
29
|
-
typedef struct TermInfo {
|
30
|
-
int doc_freq;
|
31
|
-
long freq_pointer;
|
32
|
-
long prox_pointer;
|
33
|
-
int skip_offset;
|
34
|
-
} TermInfo;
|
35
|
-
|
36
|
-
typedef struct RAMFile {
|
37
|
-
void **buffers;
|
38
|
-
int bufcnt;
|
39
|
-
VALUE mtime;
|
40
|
-
char *name;
|
41
|
-
int length;
|
42
|
-
} RAMFile;
|
43
|
-
|
44
|
-
typedef struct SegmentTermEnum {
|
45
|
-
VALUE input;
|
46
|
-
IndexBuffer *buf;
|
47
|
-
VALUE field_infos;
|
48
|
-
VALUE rtb_curr;
|
49
|
-
Term *tb_curr;
|
50
|
-
VALUE rtb_prev;
|
51
|
-
Term *tb_prev;
|
52
|
-
TermInfo *ti;
|
53
|
-
int is_index;
|
54
|
-
int size;
|
55
|
-
int position;
|
56
|
-
int index_pointer;
|
57
|
-
int index_interval;
|
58
|
-
int skip_interval;
|
59
|
-
int format;
|
60
|
-
int format_m1skip_interval;
|
61
|
-
} SegmentTermEnum;
|
4
|
+
#include "global.h"
|
5
|
+
#include "document.h"
|
62
6
|
|
63
7
|
/* IDs */
|
64
8
|
extern ID id_new;
|
65
|
-
extern ID id_close;
|
66
|
-
extern ID id_size;
|
67
|
-
extern ID id_iv_size;
|
68
9
|
|
69
10
|
/* Modules */
|
70
11
|
extern VALUE mFerret;
|
71
|
-
extern VALUE mStore;
|
72
|
-
extern VALUE mIndex;
|
73
|
-
extern VALUE mUtils;
|
74
12
|
extern VALUE mAnalysis;
|
13
|
+
extern VALUE mDocument;
|
14
|
+
extern VALUE mIndex;
|
75
15
|
extern VALUE mSearch;
|
16
|
+
extern VALUE mStore;
|
76
17
|
extern VALUE mStringHelper;
|
18
|
+
extern VALUE mUtils;
|
19
|
+
extern VALUE mSpans;
|
77
20
|
|
78
21
|
/* Classes */
|
79
|
-
extern VALUE
|
80
|
-
extern VALUE cIndexIn;
|
81
|
-
extern VALUE cBufferedIndexIn;
|
82
|
-
extern VALUE cFSIndexIn;
|
83
|
-
extern VALUE cIndexOut;
|
84
|
-
extern VALUE cBufferedIndexOut;
|
85
|
-
extern VALUE cFSIndexOut;
|
86
|
-
extern VALUE cRAMIndexOut;
|
87
|
-
extern VALUE cRAMIndexIn;
|
88
|
-
extern VALUE cTerm;
|
89
|
-
extern VALUE cTermBuffer;
|
90
|
-
extern VALUE cTermInfo;
|
91
|
-
extern VALUE cToken;
|
92
|
-
extern VALUE cPriorityQueue;
|
93
|
-
extern VALUE cSegmentMergeQueue;
|
94
|
-
extern VALUE cTermEnum;
|
95
|
-
extern VALUE cTermInfosReader;
|
96
|
-
extern VALUE cSegmentTermEnum;
|
97
|
-
extern VALUE cSimilarity;
|
98
|
-
extern VALUE cDefaultSimilarity;
|
22
|
+
extern VALUE cDirectory;
|
99
23
|
|
100
24
|
/* Ferret Inits */
|
101
|
-
extern void Init_indexio();
|
102
25
|
extern void Init_term();
|
103
|
-
extern void
|
104
|
-
extern void
|
105
|
-
extern void
|
106
|
-
extern void
|
107
|
-
extern void
|
108
|
-
extern void
|
109
|
-
extern void
|
110
|
-
|
111
|
-
extern void
|
112
|
-
extern void
|
113
|
-
|
114
|
-
|
115
|
-
extern
|
116
|
-
extern
|
117
|
-
extern VALUE
|
118
|
-
extern VALUE
|
119
|
-
extern
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
26
|
+
extern void Init_dir();
|
27
|
+
extern void Init_analysis();
|
28
|
+
extern void Init_doc();
|
29
|
+
extern void Init_index_io();
|
30
|
+
extern void Init_search();
|
31
|
+
extern void Init_qparser();
|
32
|
+
//extern void object_add(void *key, VALUE obj);
|
33
|
+
#define object_add(key, obj) object_add2(key, obj, __FILE__, __LINE__, __func__)
|
34
|
+
extern void object_add2(void *key, VALUE obj, const char *file, int line, const char *func);
|
35
|
+
//extern void object_del(void *key);
|
36
|
+
#define object_del(key) object_del2(key, __FILE__, __LINE__, __func__)
|
37
|
+
extern void object_del2(void *key, const char *file, int line, const char *func);
|
38
|
+
extern void frt_gc_mark(void *key);
|
39
|
+
extern VALUE object_get(void *key);
|
40
|
+
extern VALUE frt_data_alloc(VALUE klass);
|
41
|
+
extern VALUE frt_get_doc(Document *doc);
|
42
|
+
extern void frt_deref_free(void *p);
|
43
|
+
|
44
|
+
|
45
|
+
/*
 * Allocate a Ruby data object of class `klass` with a NULL data pointer and
 * NULL mark/free callbacks (all three are passed as NULL here).
 */
#define Frt_Make_Struct(klass) \
    rb_data_object_alloc((klass), NULL, \
                         (RUBY_DATA_FUNC)NULL, (RUBY_DATA_FUNC)NULL)
|
47
|
+
|
48
|
+
/*
 * Attach a payload and its GC callbacks to an existing data object.
 *
 *   self  - object whose storage is treated as a struct RData
 *   mmark - stored into the dmark slot
 *   mfree - stored into the dfree slot
 *   mdata - stored into the data slot
 *
 * `self` is evaluated exactly once and every argument is parenthesized, so
 * the macro is safe to call with expressions that have side effects.
 */
#define Frt_Wrap_Struct(self, mmark, mfree, mdata) \
    do { \
        struct RData *frt_rdata_ = (struct RData *)(self); \
        frt_rdata_->data = (mdata); \
        frt_rdata_->dmark = (mmark); \
        frt_rdata_->dfree = (mfree); \
    } while (0)
|
54
|
+
|
55
|
+
/*
 * Detach the payload from a data object: the data, dmark and dfree slots of
 * the struct RData behind `self` are all reset to NULL.
 *
 * `self` is evaluated exactly once, so it may be an expression with side
 * effects.
 */
#define Frt_Unwrap_Struct(self) \
    do { \
        struct RData *frt_rdata_ = (struct RData *)(self); \
        frt_rdata_->data = NULL; \
        frt_rdata_->dmark = NULL; \
        frt_rdata_->dfree = NULL; \
    } while (0)
|
127
61
|
|
128
62
|
#endif
|
data/ext/field.c
ADDED
@@ -0,0 +1,395 @@
|
|
1
|
+
#include <index.h>
|
2
|
+
#include <string.h>
|
3
|
+
|
4
|
+
/****************************************************************************
|
5
|
+
*
|
6
|
+
* FieldInfo
|
7
|
+
*
|
8
|
+
****************************************************************************/
|
9
|
+
|
10
|
+
FieldInfo *fi_create(char *name, int number, bool is_indexed,
|
11
|
+
bool store_tv, bool store_pos, bool store_offset, bool omit_norms)
|
12
|
+
{
|
13
|
+
FieldInfo *fi = ALLOC(FieldInfo);
|
14
|
+
fi->name = estrdup(name);
|
15
|
+
fi->number = number;
|
16
|
+
fi->is_indexed = is_indexed;
|
17
|
+
fi->store_tv = store_tv;
|
18
|
+
fi->store_offset = store_offset;
|
19
|
+
fi->store_pos = store_pos;
|
20
|
+
fi->omit_norms = omit_norms;
|
21
|
+
return fi;
|
22
|
+
}
|
23
|
+
|
24
|
+
void fi_destroy(void *p)
|
25
|
+
{
|
26
|
+
FieldInfo *fi = (FieldInfo *)p;
|
27
|
+
free(fi->name);
|
28
|
+
free(fi);
|
29
|
+
}
|
30
|
+
|
31
|
+
/****************************************************************************
|
32
|
+
*
|
33
|
+
* FieldInfos
|
34
|
+
*
|
35
|
+
****************************************************************************/
|
36
|
+
|
37
|
+
FieldInfos *fis_create()
|
38
|
+
{
|
39
|
+
FieldInfos *fis = ALLOC(FieldInfos);
|
40
|
+
fis->by_name = ht_create();
|
41
|
+
fis->by_number = NULL;
|
42
|
+
fis->fcnt = 0;
|
43
|
+
return fis;
|
44
|
+
}
|
45
|
+
|
46
|
+
/*
 * Build a field table from a file in `store`.
 *
 * A new table is created, `filename` is opened through the store's
 * open_input hook, its contents are loaded with fis_read(), and the input
 * stream is closed again before the table is returned.
 */
FieldInfos *fis_open(Store *store, char *filename)
{
    FieldInfos *infos = fis_create();
    InStream *in = store->open_input(store, filename);

    fis_read(infos, in);
    is_close(in);

    return infos;
}
|
54
|
+
|
55
|
+
void fis_destroy(void *p)
|
56
|
+
{
|
57
|
+
int i;
|
58
|
+
FieldInfos *fis = (FieldInfos *)p;
|
59
|
+
for (i = 0; i < fis->fcnt; i++) {
|
60
|
+
fi_destroy(fis->by_number[i]);
|
61
|
+
}
|
62
|
+
ht_destroy(fis->by_name);
|
63
|
+
free(fis->by_number);
|
64
|
+
free(fis);
|
65
|
+
}
|
66
|
+
|
67
|
+
/*
 * Register a field by name, or merge flags into the entry that already
 * exists for that name.
 *
 * Unknown name: a FieldInfo is created whose number is the current field
 * count; it is appended to by_number and entered into by_name.
 *
 * Known name: each flag that differs from the stored one is forced to its
 * "sticky" value -- is_indexed, store_tv, store_pos and store_offset become
 * true, omit_norms becomes false.
 *
 * Returns the new or existing FieldInfo, which remains owned by `fis`.
 */
FieldInfo *fis_add(FieldInfos *fis,
                   char *name,
                   bool is_indexed,
                   bool store_tv,
                   bool store_pos,
                   bool store_offset,
                   bool omit_norms)
{
    FieldInfo *fi = ht_get(fis->by_name, name);

    if (fi == NULL) {
        fi = fi_create(name, fis->fcnt, is_indexed, store_tv,
                       store_pos, store_offset, omit_norms);
        fis->fcnt++;
        REALLOC_N(fis->by_number, FieldInfo *, fis->fcnt);
        fis->by_number[fi->number] = fi;

        /*
         * Key the entry on the FieldInfo's own copy of the name, not on the
         * caller's pointer: fis_read() frees its name buffer right after
         * calling this function, so the caller's string cannot be assumed to
         * outlive the table entry.
         * NOTE(review): assumes ht_set() stores the key pointer without
         * taking ownership of it -- confirm against hash.c.
         */
        ht_set(fis->by_name, fi->name, fi);
    } else {
        if (fi->is_indexed != is_indexed) {
            fi->is_indexed = true;      // once indexed, always index
        }
        if (fi->store_tv != store_tv) {
            fi->store_tv = true;        // once vector, always vector
        }
        if (fi->store_pos != store_pos) {
            fi->store_pos = true;       // once vector, always vector
        }
        if (fi->store_offset != store_offset) {
            fi->store_offset = true;    // once vector, always vector
        }
        if (fi->omit_norms != omit_norms) {
            fi->omit_norms = false;     // once kept, always keep
        }
    }
    return fi;
}
|
98
|
+
|
99
|
+
void fis_add_fields(FieldInfos *fis,
|
100
|
+
HashSet *field_names,
|
101
|
+
bool is_indexed,
|
102
|
+
bool store_tv,
|
103
|
+
bool store_pos,
|
104
|
+
bool store_offset,
|
105
|
+
bool omit_norms)
|
106
|
+
{
|
107
|
+
int i;
|
108
|
+
for (i = 0; i < field_names->size; i++) {
|
109
|
+
fis_add(fis, field_names->elems[i], is_indexed, store_tv, store_pos,
|
110
|
+
store_offset, omit_norms);
|
111
|
+
}
|
112
|
+
hs_destroy(field_names);
|
113
|
+
}
|
114
|
+
|
115
|
+
/*
 * Report whether at least one field in the table has store_tv set.
 * Scans by_number in order and stops at the first match.
 */
bool fis_has_vectors(FieldInfos *fis)
{
    bool found = false;
    int idx;

    for (idx = 0; !found && idx < fis->fcnt; idx++) {
        if (fis->by_number[idx]->store_tv) {
            found = true;
        }
    }
    return found;
}
|
124
|
+
|
125
|
+
/*
 * Look a field up by name in the by_name hash.
 * Returns whatever ht_get() yields for that key (fis_add() treats a NULL
 * result as "no such field").
 */
FieldInfo *fis_get_fi(FieldInfos *fis, char *name)
{
    FieldInfo *found = ht_get(fis->by_name, name);

    return found;
}
|
129
|
+
|
130
|
+
/*
 * Return the number assigned to the field called `name`.
 *
 * When the name is not in the table the sentinel 0xFFFFFFFF is returned
 * instead, to be compatible with the Java version.
 */
unsigned long long fis_get_number(FieldInfos *fis, char *name)
{
    FieldInfo *found = ht_get(fis->by_name, name);

    if (found != NULL) {
        return found->number;
    }
    return 0xFFFFFFFFull;   // sentinel kept for Java compatibility
}
|
138
|
+
|
139
|
+
#define IS_INDEXED 0x01
|
140
|
+
#define STORE_TV 0x02
|
141
|
+
#define STORE_POS 0x04
|
142
|
+
#define STORE_OFFSET 0x08
|
143
|
+
#define OMIT_NORMS 0x10
|
144
|
+
|
145
|
+
int fi_field_info_byte(FieldInfo *fi)
|
146
|
+
{
|
147
|
+
int bits = 0x0;
|
148
|
+
if (fi->is_indexed)
|
149
|
+
bits |= IS_INDEXED;
|
150
|
+
if (fi->store_tv)
|
151
|
+
bits |= STORE_TV;
|
152
|
+
if (fi->store_pos)
|
153
|
+
bits |= STORE_POS;
|
154
|
+
if (fi->store_offset)
|
155
|
+
bits |= STORE_OFFSET;
|
156
|
+
if (fi->omit_norms)
|
157
|
+
bits |= OMIT_NORMS;
|
158
|
+
return bits;
|
159
|
+
}
|
160
|
+
|
161
|
+
/*
 * Write the field table to the file named <segment><ext> in `store`.
 *
 * Layout written: a vint holding the field count, then for each field in
 * number order its name (os_write_string) followed by its packed flags
 * (fi_field_info_byte).
 *
 * The file name is assembled in a heap buffer sized from the two input
 * strings, so a long segment name or extension cannot overrun a fixed-size
 * array.
 */
void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext)
{
    int i;
    FieldInfo *fi;
    OutStream *os;
    size_t seg_len = strlen(segment);
    size_t ext_len = strlen(ext);
    char *fname = ALLOC_N(char, seg_len + ext_len + 1);

    memcpy(fname, segment, seg_len);
    memcpy(fname + seg_len, ext, ext_len + 1);  /* +1 also copies the NUL */

    os = store->create_output(store, fname);
    os_write_vint(os, fis->fcnt);
    for (i = 0; i < fis->fcnt; i++) {
        fi = fis->by_number[i];
        os_write_string(os, fi->name);
        /*
         * NOTE(review): written as a vint but read back with is_read_byte()
         * in fis_read(); presumably equivalent because the flags never
         * exceed 0x1F -- confirm against the vint encoding.
         */
        os_write_vint(os, fi_field_info_byte(fi));
    }
    os_close(os);

    /* Released only once the stream is closed, in case the store still
     * refers to the name while the stream is open. */
    free(fname);
}
|
177
|
+
|
178
|
+
/*
 * Load field entries from `is` into `fis`.
 *
 * The stream starts with a vint entry count; each entry is a string (the
 * field name) followed by one byte of flags, decoded with the IS_INDEXED /
 * STORE_* / OMIT_NORMS masks. Each entry is passed to fis_add(), after which
 * the name buffer returned by is_read_string() is freed.
 *
 * Returns `fis`.
 */
FieldInfos *fis_read(FieldInfos *fis, InStream *is)
{
    int remaining = is_read_vint(is);   // number of entries that follow

    while (remaining-- > 0) {
        char *field_name = is_read_string(is);
        int flags = is_read_byte(is);

        fis_add(fis, field_name,
                (flags & IS_INDEXED) != 0,
                (flags & STORE_TV) != 0,
                (flags & STORE_POS) != 0,
                (flags & STORE_OFFSET) != 0,
                (flags & OMIT_NORMS) != 0);
        free(field_name);
    }
    return fis;
}
|
197
|
+
|
198
|
+
/*
 * Register every field of `doc` in `fis`, taking the name and the five
 * index/term-vector flags from each DocField. Returns `fis`.
 */
FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc)
{
    int idx = 0;

    while (idx < doc->dfcnt) {
        DocField *field = doc->df_arr[idx];

        fis_add(fis, field->name,
                field->is_indexed,
                field->store_tv,
                field->store_pos,
                field->store_offset,
                field->omit_norms);
        idx++;
    }
    return fis;
}
|
209
|
+
|
210
|
+
/****************************************************************************
|
211
|
+
*
|
212
|
+
* FieldsWriter
|
213
|
+
*
|
214
|
+
****************************************************************************/
|
215
|
+
|
216
|
+
/*
 * Open a FieldsWriter for `segment`.
 *
 * Creates two output streams in `store`: <segment>.fdt (field data) and
 * <segment>.fdx (the index into it). `fis` is stored in the writer, not
 * copied. Close with fw_close().
 *
 * The file names are built in a heap buffer sized from the segment name, so
 * a long segment name cannot overrun a fixed-size array.
 */
FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis)
{
    size_t slen = strlen(segment);
    /* sizeof(".fdt") covers the 4-character extension plus the NUL; both
     * extensions used below have that length. */
    char *fname = ALLOC_N(char, slen + sizeof(".fdt"));
    FieldsWriter *fw;

    memcpy(fname, segment, slen);

    fw = ALLOC(FieldsWriter);
    fw->fis = fis;

    strcpy(fname + slen, ".fdt");
    fw->fields_out = store->create_output(store, fname);

    strcpy(fname + slen, ".fdx");
    fw->index_out = store->create_output(store, fname);

    free(fname);
    return fw;
}
|
230
|
+
|
231
|
+
/*
 * Close a FieldsWriter: the field-data stream is closed first, then the
 * index stream, and finally the writer struct is freed. The FieldInfos the
 * writer points at is left alone.
 */
void fw_close(FieldsWriter *fw)
{
    OutStream *data_stream = fw->fields_out;
    OutStream *index_stream = fw->index_out;

    os_close(data_stream);
    os_close(index_stream);
    free(fw);
}
|
237
|
+
|
238
|
+
/*
 * Write a length-prefixed blob to `fout`: `dlen` as a vint, followed by
 * `dlen` raw bytes taken from `data`.
 */
void save_data(OutStream *fout, char *data, int dlen)
{
    uchar *bytes = (uchar *)data;

    os_write_vint(fout, dlen);
    os_write_bytes(fout, bytes, dlen);
}
|
243
|
+
|
244
|
+
/*
 * Append the stored fields of `doc` to the writer.
 *
 * Index stream: one long holding the current position of the field-data
 * stream, i.e. where this document's record starts.
 *
 * Field-data stream: a vint count of stored fields, then for each stored
 * field its field number (looked up by name in fw->fis), one byte of
 * FIELD_IS_* flags, and the value -- length-prefixed bytes for binary
 * fields, os_write_string() otherwise.
 *
 * Compression is not implemented yet: fields flagged is_compressed are
 * written uncompressed, with only the flag bit recording the request.
 */
void fw_add_doc(FieldsWriter *fw, Document *doc)
{
    int i, bits;
    int stored_count = 0;
    DocField *df;
    FieldInfo *fi;
    OutStream *fout = fw->fields_out, *iout = fw->index_out;

    os_write_long(iout, os_pos(fout));

    for (i = 0; i < doc->dfcnt; i++) {
        if (doc->df_arr[i]->is_stored) {
            stored_count++;
        }
    }
    os_write_vint(fout, stored_count);

    for (i = 0; i < doc->dfcnt; i++) {
        df = doc->df_arr[i];
        if (!df->is_stored) {
            continue;
        }

        /* NOTE(review): assumes the field was registered in fw->fis
         * beforehand; an unknown name would make this lookup NULL. */
        fi = ht_get(fw->fis->by_name, df->name);
        os_write_vint(fout, fi->number);

        bits = 0;
        if (df->is_tokenized) {
            bits |= FIELD_IS_TOKENIZED;
        }
        if (df->is_binary) {
            bits |= FIELD_IS_BINARY;
        }
        if (df->is_compressed) {
            bits |= FIELD_IS_COMPRESSED;
        }
        os_write_byte(fout, bits);

        // Not compressing just yet, so compressed and plain fields are
        // saved the same way.
        if (df->is_binary) {
            save_data(fout, df->data, df->blen);
        } else {
            os_write_string(fout, df->data);
        }
    }
}
|
291
|
+
|
292
|
+
/****************************************************************************
|
293
|
+
*
|
294
|
+
* FieldsReader
|
295
|
+
*
|
296
|
+
****************************************************************************/
|
297
|
+
|
298
|
+
/*
 * Open a FieldsReader for `segment`.
 *
 * Opens <segment>.fdt (field data) and <segment>.fdx (index) from `store`.
 * `fis` is stored in the reader, not copied. fr->len is the index stream's
 * length divided by 8, i.e. the number of 8-byte entries it holds.
 * Close with fr_close().
 *
 * The file names are built in a heap buffer sized from the segment name, so
 * a long segment name cannot overrun a fixed-size array.
 */
FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis)
{
    size_t slen = strlen(segment);
    /* sizeof(".fdt") covers the 4-character extension plus the NUL; both
     * extensions used below have that length. */
    char *fname = ALLOC_N(char, slen + sizeof(".fdt"));
    FieldsReader *fr;
    InStream *iin;

    memcpy(fname, segment, slen);

    fr = ALLOC(FieldsReader);
    fr->fis = fis;

    strcpy(fname + slen, ".fdt");
    fr->fields_in = store->open_input(store, fname);

    strcpy(fname + slen, ".fdx");
    iin = fr->index_in = store->open_input(store, fname);
    fr->len = iin->length_internal(iin) / 8;

    free(fname);
    return fr;
}
|
313
|
+
|
314
|
+
/*
 * Close a FieldsReader: the field-data stream is closed first, then the
 * index stream, and finally the reader struct is freed. The FieldInfos the
 * reader points at is left alone.
 */
void fr_close(FieldsReader *fr)
{
    InStream *data_stream = fr->fields_in;
    InStream *index_stream = fr->index_in;

    is_close(data_stream);
    is_close(index_stream);
    free(fr);
}
|
320
|
+
|
321
|
+
/*
 * Read document number `doc_num` back from the stored-fields files.
 *
 * The index stream holds one 8-byte entry per document: the offset of that
 * document's record in the field-data stream. The record is a vint field
 * count followed, per field, by the field number, a byte of FIELD_IS_*
 * flags and the value.
 *
 * Binary fields are rebuilt with df_create_binary(); text fields with
 * df_create(), with the index and term-vector options derived from the
 * matching FieldInfo in fr->fis.
 *
 * Returns a newly created Document.
 *
 * The record offset and the index seek position are computed in 64 bits so
 * they are not truncated once the files grow beyond what an int can hold.
 * NOTE(review): assumes is_seek()/is_read_long() work on 64-bit offsets --
 * confirm against their prototypes.
 * NOTE(review): field_number is taken from the file unchecked and used to
 * index fr->fis->by_number.
 */
Document *fr_get_doc(FieldsReader *fr, int doc_num)
{
    int i, bits, dlen;
    int field_cnt, field_number;
    int store, index, stv;
    int is_compressed, is_tokenized, is_binary;
    char *data;
    long long position;
    FieldInfo *fi;
    Document *doc = doc_create();
    InStream *iin = fr->index_in;
    InStream *fin = fr->fields_in;

    is_seek(iin, (long long)doc_num * 8);
    position = is_read_long(iin);
    is_seek(fin, position);
    field_cnt = is_read_vint(fin);

    for (i = 0; i < field_cnt; i++) {
        field_number = is_read_vint(fin);
        fi = fr->fis->by_number[field_number];

        bits = is_read_byte(fin);

        is_compressed = (bits & FIELD_IS_COMPRESSED) != 0;
        is_tokenized = (bits & FIELD_IS_TOKENIZED) != 0;
        is_binary = (bits & FIELD_IS_BINARY) != 0;

        if (is_binary) {
            dlen = is_read_vint(fin);
            data = ALLOC_N(char, dlen);
            is_read_bytes(fin, (uchar *)data, 0, dlen);
            store = is_compressed ? DF_STORE_COMPRESS : DF_STORE_YES;
            doc_add_field(doc,
                          df_create_binary(fi->name, data, dlen, store));
        } else {
            store = DF_STORE_YES;
            if (!fi->is_indexed) {
                index = DF_INDEX_NO;
            } else if (is_tokenized) {
                index = DF_INDEX_TOKENIZED;
            } else if (fi->omit_norms) {
                index = DF_INDEX_NO_NORMS;
            } else {
                index = DF_INDEX_UNTOKENIZED;
            }

            data = NULL;
            if (is_compressed) {
                /* length-prefixed bytes; NUL-terminate them ourselves */
                store = DF_STORE_COMPRESS;
                dlen = is_read_vint(fin);
                data = ALLOC_N(char, (dlen + 1));
                data[dlen] = '\0';
                is_read_bytes(fin, (uchar *)data, 0, dlen);
            } else {
                data = is_read_string(fin);
            }

            stv = DF_TERM_VECTOR_NO;
            if (fi->store_tv) {
                if (fi->store_pos && fi->store_offset) {
                    stv = DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS;
                } else if (fi->store_pos) {
                    stv = DF_TERM_VECTOR_WITH_POSITIONS;
                } else if (fi->store_offset) {
                    stv = DF_TERM_VECTOR_WITH_OFFSETS;
                } else {
                    stv = DF_TERM_VECTOR_YES;
                }
            }
            doc_add_field(doc, df_create(fi->name, data, store, index, stv));
        }
    }

    return doc;
}
|
395
|
+
|