ferret 0.3.2 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/ferret.h CHANGED
@@ -1,128 +1,62 @@
1
1
  #ifndef __FERRET_H_
2
2
  #define __FERRET_H_
3
3
 
4
- #include <ruby.h>
5
-
6
- #define BUFFER_SIZE 1024
7
-
8
- typedef unsigned char byte_t;
9
-
10
- typedef struct IndexBuffer {
11
- long start;
12
- int len;
13
- int pos;
14
- byte_t *buffer;
15
- } IndexBuffer;
16
-
17
- typedef struct Term {
18
- VALUE field;
19
- char *text;
20
- int tlen;
21
- } Term;
22
-
23
- typedef struct PriorityQueue {
24
- VALUE *heap;
25
- int len;
26
- int size;
27
- } PriorityQueue;
28
-
29
- typedef struct TermInfo {
30
- int doc_freq;
31
- long freq_pointer;
32
- long prox_pointer;
33
- int skip_offset;
34
- } TermInfo;
35
-
36
- typedef struct RAMFile {
37
- void **buffers;
38
- int bufcnt;
39
- VALUE mtime;
40
- char *name;
41
- int length;
42
- } RAMFile;
43
-
44
- typedef struct SegmentTermEnum {
45
- VALUE input;
46
- IndexBuffer *buf;
47
- VALUE field_infos;
48
- VALUE rtb_curr;
49
- Term *tb_curr;
50
- VALUE rtb_prev;
51
- Term *tb_prev;
52
- TermInfo *ti;
53
- int is_index;
54
- int size;
55
- int position;
56
- int index_pointer;
57
- int index_interval;
58
- int skip_interval;
59
- int format;
60
- int format_m1skip_interval;
61
- } SegmentTermEnum;
4
+ #include "global.h"
5
+ #include "document.h"
62
6
 
63
7
  /* IDs */
64
8
  extern ID id_new;
65
- extern ID id_close;
66
- extern ID id_size;
67
- extern ID id_iv_size;
68
9
 
69
10
  /* Modules */
70
11
  extern VALUE mFerret;
71
- extern VALUE mStore;
72
- extern VALUE mIndex;
73
- extern VALUE mUtils;
74
12
  extern VALUE mAnalysis;
13
+ extern VALUE mDocument;
14
+ extern VALUE mIndex;
75
15
  extern VALUE mSearch;
16
+ extern VALUE mStore;
76
17
  extern VALUE mStringHelper;
18
+ extern VALUE mUtils;
19
+ extern VALUE mSpans;
77
20
 
78
21
  /* Classes */
79
- extern VALUE cRAMDirectory;
80
- extern VALUE cIndexIn;
81
- extern VALUE cBufferedIndexIn;
82
- extern VALUE cFSIndexIn;
83
- extern VALUE cIndexOut;
84
- extern VALUE cBufferedIndexOut;
85
- extern VALUE cFSIndexOut;
86
- extern VALUE cRAMIndexOut;
87
- extern VALUE cRAMIndexIn;
88
- extern VALUE cTerm;
89
- extern VALUE cTermBuffer;
90
- extern VALUE cTermInfo;
91
- extern VALUE cToken;
92
- extern VALUE cPriorityQueue;
93
- extern VALUE cSegmentMergeQueue;
94
- extern VALUE cTermEnum;
95
- extern VALUE cTermInfosReader;
96
- extern VALUE cSegmentTermEnum;
97
- extern VALUE cSimilarity;
98
- extern VALUE cDefaultSimilarity;
22
+ extern VALUE cDirectory;
99
23
 
100
24
  /* Ferret Inits */
101
- extern void Init_indexio();
102
25
  extern void Init_term();
103
- extern void Init_term_info();
104
- extern void Init_term_infos_reader();
105
- extern void Init_term_buffer();
106
- extern void Init_priority_queue();
107
- extern void Init_token();
108
- extern void Init_segment_merge_queue();
109
- extern void Init_segment_term_enum();
110
- extern void Init_ram_directory();
111
- extern void Init_string_helper();
112
- extern void Init_similarity();
113
-
114
- /* External functions */
115
- extern int frt_hash(register char *p, register int len);
116
- extern unsigned long long frt_read_vint(VALUE self, IndexBuffer *my_buf);
117
- extern VALUE frt_indexin_read_long(VALUE self);
118
- extern VALUE frt_indexin_read_int(VALUE self);
119
- extern VALUE frt_indexin_seek(VALUE self, VALUE pos);
120
- extern VALUE frt_termbuffer_to_term(VALUE self);
121
- extern void frt_read_chars(VALUE self, char *buf, int offset, int len);
122
- extern void frt_write_bytes(VALUE self, byte_t *buf, int len);
123
- extern int frt_term_compare_to_int(VALUE self, VALUE rother);
124
- extern VALUE frt_termbuffer_init_copy(VALUE self, VALUE rother);
125
- extern VALUE frt_termbuffer_read(VALUE self, VALUE input, VALUE info);
126
- extern inline int frt_term_cmp(Term *t1, Term *t2);
26
+ extern void Init_dir();
27
+ extern void Init_analysis();
28
+ extern void Init_doc();
29
+ extern void Init_index_io();
30
+ extern void Init_search();
31
+ extern void Init_qparser();
32
+ //extern void object_add(void *key, VALUE obj);
33
+ #define object_add(key, obj) object_add2(key, obj, __FILE__, __LINE__, __func__)
34
+ extern void object_add2(void *key, VALUE obj, const char *file, int line, const char *func);
35
+ //extern void object_del(void *key);
36
+ #define object_del(key) object_del2(key, __FILE__, __LINE__, __func__)
37
+ extern void object_del2(void *key, const char *file, int line, const char *func);
38
+ extern void frt_gc_mark(void *key);
39
+ extern VALUE object_get(void *key);
40
+ extern VALUE frt_data_alloc(VALUE klass);
41
+ extern VALUE frt_get_doc(Document *doc);
42
+ extern void frt_deref_free(void *p);
43
+
44
+
45
/*
 * Allocate a Ruby data object of class +klass+ with a NULL data pointer and
 * NULL mark/free callbacks. The object is filled in later with
 * Frt_Wrap_Struct.
 */
#define Frt_Make_Struct(klass)\
  rb_data_object_alloc((klass),NULL,(RUBY_DATA_FUNC)NULL,(RUBY_DATA_FUNC)NULL)

/*
 * Install +mdata+ and its +mmark+/+mfree+ callbacks on the RData behind
 * +self+. +self+ is evaluated exactly once, so an argument with side effects
 * is safe to pass.
 */
#define Frt_Wrap_Struct(self,mmark,mfree,mdata)\
  do {\
    struct RData *frt_rdata_ = (struct RData *)(self);\
    frt_rdata_->data = (mdata);\
    frt_rdata_->dmark = (mmark);\
    frt_rdata_->dfree = (mfree);\
  } while (0)

/*
 * Clear the data pointer and the mark/free callbacks of the RData behind
 * +self+. +self+ is evaluated exactly once.
 */
#define Frt_Unwrap_Struct(self)\
  do {\
    struct RData *frt_rdata_ = (struct RData *)(self);\
    frt_rdata_->data = NULL;\
    frt_rdata_->dmark = NULL;\
    frt_rdata_->dfree = NULL;\
  } while (0)
127
61
 
128
62
  #endif
data/ext/field.c ADDED
@@ -0,0 +1,395 @@
1
+ #include <index.h>
2
+ #include <string.h>
3
+
4
+ /****************************************************************************
5
+ *
6
+ * FieldInfo
7
+ *
8
+ ****************************************************************************/
9
+
10
+ FieldInfo *fi_create(char *name, int number, bool is_indexed,
11
+ bool store_tv, bool store_pos, bool store_offset, bool omit_norms)
12
+ {
13
+ FieldInfo *fi = ALLOC(FieldInfo);
14
+ fi->name = estrdup(name);
15
+ fi->number = number;
16
+ fi->is_indexed = is_indexed;
17
+ fi->store_tv = store_tv;
18
+ fi->store_offset = store_offset;
19
+ fi->store_pos = store_pos;
20
+ fi->omit_norms = omit_norms;
21
+ return fi;
22
+ }
23
+
24
+ void fi_destroy(void *p)
25
+ {
26
+ FieldInfo *fi = (FieldInfo *)p;
27
+ free(fi->name);
28
+ free(fi);
29
+ }
30
+
31
+ /****************************************************************************
32
+ *
33
+ * FieldInfos
34
+ *
35
+ ****************************************************************************/
36
+
37
+ FieldInfos *fis_create()
38
+ {
39
+ FieldInfos *fis = ALLOC(FieldInfos);
40
+ fis->by_name = ht_create();
41
+ fis->by_number = NULL;
42
+ fis->fcnt = 0;
43
+ return fis;
44
+ }
45
+
46
+ FieldInfos *fis_open(Store *store, char *filename)
47
+ {
48
+ FieldInfos *fis = fis_create();
49
+ InStream *is = store->open_input(store, filename);
50
+ fis_read(fis, is);
51
+ is_close(is);
52
+ return fis;
53
+ }
54
+
55
+ void fis_destroy(void *p)
56
+ {
57
+ int i;
58
+ FieldInfos *fis = (FieldInfos *)p;
59
+ for (i = 0; i < fis->fcnt; i++) {
60
+ fi_destroy(fis->by_number[i]);
61
+ }
62
+ ht_destroy(fis->by_name);
63
+ free(fis->by_number);
64
+ free(fis);
65
+ }
66
+
67
+ FieldInfo *fis_add(FieldInfos *fis,
68
+ char *name,
69
+ bool is_indexed,
70
+ bool store_tv,
71
+ bool store_pos,
72
+ bool store_offset,
73
+ bool omit_norms)
74
+ {
75
+ FieldInfo *fi = ht_get(fis->by_name, name);
76
+ if (fi == NULL) {
77
+ fi = fi_create(name, fis->fcnt, is_indexed, store_tv,
78
+ store_pos, store_offset, omit_norms);
79
+ fis->fcnt++;
80
+ REALLOC_N(fis->by_number, FieldInfo *, fis->fcnt);
81
+
82
+ fis->by_number[fi->number] = fi;
83
+ ht_set(fis->by_name, name, fi);
84
+ } else {
85
+ if (fi->is_indexed != is_indexed)
86
+ fi->is_indexed = true; // once indexed, always index
87
+ if (fi->store_tv != store_tv)
88
+ fi->store_tv = true; // once vector, always vector
89
+ if (fi->store_pos != store_pos)
90
+ fi->store_pos = true; // once vector, always vector
91
+ if (fi->store_offset != store_offset)
92
+ fi->store_offset = true; // once vector, always vector
93
+ if (fi->omit_norms != omit_norms)
94
+ fi->omit_norms = false; // once kept, always keep
95
+ }
96
+ return fi;
97
+ }
98
+
99
+ void fis_add_fields(FieldInfos *fis,
100
+ HashSet *field_names,
101
+ bool is_indexed,
102
+ bool store_tv,
103
+ bool store_pos,
104
+ bool store_offset,
105
+ bool omit_norms)
106
+ {
107
+ int i;
108
+ for (i = 0; i < field_names->size; i++) {
109
+ fis_add(fis, field_names->elems[i], is_indexed, store_tv, store_pos,
110
+ store_offset, omit_norms);
111
+ }
112
+ hs_destroy(field_names);
113
+ }
114
+
115
+ bool fis_has_vectors(FieldInfos *fis)
116
+ {
117
+ int i;
118
+ for (i = 0; i < fis->fcnt; i++) {
119
+ if (fis->by_number[i]->store_tv)
120
+ return true;
121
+ }
122
+ return false;
123
+ }
124
+
125
+ FieldInfo *fis_get_fi(FieldInfos *fis, char *name)
126
+ {
127
+ return (FieldInfo *)ht_get(fis->by_name, name);
128
+ }
129
+
130
+ unsigned long long fis_get_number(FieldInfos *fis, char *name)
131
+ {
132
+ FieldInfo *fi = (FieldInfo *)ht_get(fis->by_name, name);
133
+ if (fi == NULL)
134
+ return 0xFFFFFFFFull; // to be compatible with Jave version
135
+ else
136
+ return fi->number;
137
+ }
138
+
139
/* Bit flags packed into the per-field byte by fi_field_info_byte() and
 * unpacked again by fis_read(). */
#define IS_INDEXED   (1 << 0)
#define STORE_TV     (1 << 1)
#define STORE_POS    (1 << 2)
#define STORE_OFFSET (1 << 3)
#define OMIT_NORMS   (1 << 4)
144
+
145
+ int fi_field_info_byte(FieldInfo *fi)
146
+ {
147
+ int bits = 0x0;
148
+ if (fi->is_indexed)
149
+ bits |= IS_INDEXED;
150
+ if (fi->store_tv)
151
+ bits |= STORE_TV;
152
+ if (fi->store_pos)
153
+ bits |= STORE_POS;
154
+ if (fi->store_offset)
155
+ bits |= STORE_OFFSET;
156
+ if (fi->omit_norms)
157
+ bits |= OMIT_NORMS;
158
+ return bits;
159
+ }
160
+
161
+ void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext)
162
+ {
163
+ int i;
164
+ FieldInfo *fi;
165
+ char fname[SEGMENT_NAME_MAX_LENGTH];
166
+ strcpy(fname, segment);
167
+ strcat(fname, ext);
168
+ OutStream *os = store->create_output(store, fname);
169
+ os_write_vint(os, fis->fcnt);
170
+ for (i = 0; i < fis->fcnt; i++) {
171
+ fi = fis->by_number[i];
172
+ os_write_string(os, fi->name);
173
+ os_write_vint(os, fi_field_info_byte(fi));
174
+ }
175
+ os_close(os);
176
+ }
177
+
178
+ FieldInfos *fis_read(FieldInfos *fis, InStream *is)
179
+ {
180
+ int i, size = is_read_vint(is); //read in the size
181
+ int bits, is_indexed, store_tv, store_pos, store_offset, omit_norms;
182
+ char *name;
183
+ for (i = 0; i < size; i++) {
184
+ name = is_read_string(is);
185
+ bits = is_read_byte(is);
186
+ is_indexed = (bits & IS_INDEXED) != 0;
187
+ store_tv = (bits & STORE_TV) != 0;
188
+ store_pos = (bits & STORE_POS) != 0;
189
+ store_offset = (bits & STORE_OFFSET) != 0;
190
+ omit_norms = (bits & OMIT_NORMS) != 0;
191
+ fis_add(fis, name, is_indexed, store_tv,
192
+ store_pos, store_offset, omit_norms);
193
+ free(name);
194
+ }
195
+ return fis;
196
+ }
197
+
198
+ FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc)
199
+ {
200
+ int i;
201
+ DocField *df;
202
+ for (i = 0; i < doc->dfcnt; i++) {
203
+ df = doc->df_arr[i];
204
+ fis_add(fis, df->name, df->is_indexed, df->store_tv,
205
+ df->store_pos, df->store_offset, df->omit_norms);
206
+ }
207
+ return fis;
208
+ }
209
+
210
+ /****************************************************************************
211
+ *
212
+ * FieldsWriter
213
+ *
214
+ ****************************************************************************/
215
+
216
+ FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis)
217
+ {
218
+ char buf[SEGMENT_NAME_MAX_LENGTH];
219
+ int slen = strlen(segment);
220
+ strcpy(buf, segment);
221
+
222
+ FieldsWriter *fw = ALLOC(FieldsWriter);
223
+ fw->fis = fis;
224
+ strcpy(buf+slen, ".fdt");
225
+ fw->fields_out = store->create_output(store, buf);
226
+ strcpy(buf+slen, ".fdx");
227
+ fw->index_out = store->create_output(store, buf);
228
+ return fw;
229
+ }
230
+
231
+ void fw_close(FieldsWriter *fw)
232
+ {
233
+ os_close(fw->fields_out);
234
+ os_close(fw->index_out);
235
+ free(fw);
236
+ }
237
+
238
+ void save_data(OutStream *fout, char *data, int dlen)
239
+ {
240
+ os_write_vint(fout, dlen);
241
+ os_write_bytes(fout, (uchar *)data, dlen);
242
+ }
243
+
244
+ void fw_add_doc(FieldsWriter *fw, Document *doc)
245
+ {
246
+ int i, bits;
247
+ OutStream *fout = fw->fields_out, *iout = fw->index_out;
248
+ os_write_long(iout, os_pos(fout));
249
+ DocField *df;
250
+ char *data;
251
+
252
+ int stored_count = 0;
253
+ for (i = 0; i < doc->dfcnt; i++) {
254
+ if (doc->df_arr[i]->is_stored)
255
+ stored_count++;
256
+ }
257
+ os_write_vint(fout, stored_count);
258
+
259
+ for (i = 0; i < doc->dfcnt; i++) {
260
+ df = doc->df_arr[i];
261
+ if (df->is_stored) {
262
+ os_write_vint(fout, ((FieldInfo *)ht_get(fw->fis->by_name, df->name))->number);
263
+
264
+ bits = 0;
265
+ if (df->is_tokenized)
266
+ bits |= FIELD_IS_TOKENIZED;
267
+ if (df->is_binary)
268
+ bits |= FIELD_IS_BINARY;
269
+ if (df->is_compressed)
270
+ bits |= FIELD_IS_COMPRESSED;
271
+ os_write_byte(fout, bits);
272
+
273
+ data = NULL;
274
+ if (df->is_compressed) {
275
+ // Not compressing just yet but we'll save it anyway
276
+ if (df->is_binary) {
277
+ save_data(fout, df->data, df->blen);
278
+ } else {
279
+ os_write_string(fout, df->data);
280
+ }
281
+ } else {
282
+ if (df->is_binary) {
283
+ save_data(fout, df->data, df->blen);
284
+ } else {
285
+ os_write_string(fout, df->data);
286
+ }
287
+ }
288
+ }
289
+ }
290
+ }
291
+
292
+ /****************************************************************************
293
+ *
294
+ * FieldsReader
295
+ *
296
+ ****************************************************************************/
297
+
298
+ FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis)
299
+ {
300
+ char buf[100];
301
+ int slen = strlen(segment);
302
+ strcpy(buf, segment);
303
+
304
+ FieldsReader *fr = ALLOC(FieldsReader);
305
+ fr->fis = fis;
306
+ strcpy(buf+slen, ".fdt");
307
+ fr->fields_in = store->open_input(store, buf);
308
+ strcpy(buf+slen, ".fdx");
309
+ InStream *iin = fr->index_in = store->open_input(store, buf);
310
+ fr->len = iin->length_internal(iin)/8;
311
+ return fr;
312
+ }
313
+
314
+ void fr_close(FieldsReader *fr)
315
+ {
316
+ is_close(fr->fields_in);
317
+ is_close(fr->index_in);
318
+ free(fr);
319
+ }
320
+
321
+ Document *fr_get_doc(FieldsReader *fr, int doc_num)
322
+ {
323
+ int i, bits, dlen;
324
+ char *data;
325
+ int store, index, stv;
326
+ int is_compressed, is_tokenized, is_binary;
327
+ Document *doc = doc_create();
328
+ InStream *iin = fr->index_in;
329
+ InStream *fin = fr->fields_in;
330
+ is_seek(iin, doc_num * 8);
331
+ int position = is_read_long(iin);
332
+ is_seek(fin, position);
333
+ int field_cnt = is_read_vint(fin);
334
+ int field_number;
335
+ FieldInfo *fi;
336
+
337
+ for (i = 0; i < field_cnt; i++) {
338
+ field_number = is_read_vint(fin);
339
+ fi = fr->fis->by_number[field_number];
340
+
341
+ bits = is_read_byte(fin);
342
+
343
+ is_compressed = (bits & FIELD_IS_COMPRESSED) != 0;
344
+ is_tokenized = (bits & FIELD_IS_TOKENIZED) != 0;
345
+ is_binary = (bits & FIELD_IS_BINARY) != 0;
346
+
347
+ if (is_binary) {
348
+ dlen = is_read_vint(fin);
349
+ data = ALLOC_N(char, dlen);
350
+ is_read_bytes(fin, (uchar *)data, 0, dlen);
351
+ if (is_compressed) {
352
+ doc_add_field(doc, df_create_binary(fi->name, data, dlen, DF_STORE_COMPRESS));
353
+ } else {
354
+ doc_add_field(doc, df_create_binary(fi->name, data, dlen, DF_STORE_YES));
355
+ }
356
+ } else {
357
+ store = DF_STORE_YES;
358
+ if (!fi->is_indexed) {
359
+ index = DF_INDEX_NO;
360
+ } else if (is_tokenized) {
361
+ index = DF_INDEX_TOKENIZED;
362
+ } else if (fi->omit_norms) {
363
+ index = DF_INDEX_NO_NORMS;
364
+ } else {
365
+ index = DF_INDEX_UNTOKENIZED;
366
+ }
367
+ data = NULL;
368
+ if (is_compressed) {
369
+ store = DF_STORE_COMPRESS;
370
+ dlen = is_read_vint(fin);
371
+ data = ALLOC_N(char, (dlen + 1));
372
+ data[dlen] = '\0';
373
+ is_read_bytes(fin, (uchar *)data, 0, dlen);
374
+ } else {
375
+ data = is_read_string(fin);
376
+ }
377
+ stv = DF_TERM_VECTOR_NO;
378
+ if (fi->store_tv) {
379
+ if (fi->store_pos && fi->store_offset) {
380
+ stv = DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS;
381
+ } else if (fi->store_pos) {
382
+ stv = DF_TERM_VECTOR_WITH_POSITIONS;
383
+ } else if (fi->store_offset) {
384
+ stv = DF_TERM_VECTOR_WITH_OFFSETS;
385
+ } else {
386
+ stv = DF_TERM_VECTOR_YES;
387
+ }
388
+ }
389
+ doc_add_field(doc, df_create(fi->name, data, store, index, stv));
390
+ }
391
+ }
392
+
393
+ return doc;
394
+ }
395
+