ferret 0.3.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/ferret.h CHANGED
@@ -1,128 +1,62 @@
1
1
  #ifndef __FERRET_H_
2
2
  #define __FERRET_H_
3
3
 
4
- #include <ruby.h>
5
-
6
- #define BUFFER_SIZE 1024
7
-
8
- typedef unsigned char byte_t;
9
-
10
- typedef struct IndexBuffer {
11
- long start;
12
- int len;
13
- int pos;
14
- byte_t *buffer;
15
- } IndexBuffer;
16
-
17
- typedef struct Term {
18
- VALUE field;
19
- char *text;
20
- int tlen;
21
- } Term;
22
-
23
- typedef struct PriorityQueue {
24
- VALUE *heap;
25
- int len;
26
- int size;
27
- } PriorityQueue;
28
-
29
- typedef struct TermInfo {
30
- int doc_freq;
31
- long freq_pointer;
32
- long prox_pointer;
33
- int skip_offset;
34
- } TermInfo;
35
-
36
- typedef struct RAMFile {
37
- void **buffers;
38
- int bufcnt;
39
- VALUE mtime;
40
- char *name;
41
- int length;
42
- } RAMFile;
43
-
44
- typedef struct SegmentTermEnum {
45
- VALUE input;
46
- IndexBuffer *buf;
47
- VALUE field_infos;
48
- VALUE rtb_curr;
49
- Term *tb_curr;
50
- VALUE rtb_prev;
51
- Term *tb_prev;
52
- TermInfo *ti;
53
- int is_index;
54
- int size;
55
- int position;
56
- int index_pointer;
57
- int index_interval;
58
- int skip_interval;
59
- int format;
60
- int format_m1skip_interval;
61
- } SegmentTermEnum;
4
+ #include "global.h"
5
+ #include "document.h"
62
6
 
63
7
  /* IDs */
64
8
  extern ID id_new;
65
- extern ID id_close;
66
- extern ID id_size;
67
- extern ID id_iv_size;
68
9
 
69
10
  /* Modules */
70
11
  extern VALUE mFerret;
71
- extern VALUE mStore;
72
- extern VALUE mIndex;
73
- extern VALUE mUtils;
74
12
  extern VALUE mAnalysis;
13
+ extern VALUE mDocument;
14
+ extern VALUE mIndex;
75
15
  extern VALUE mSearch;
16
+ extern VALUE mStore;
76
17
  extern VALUE mStringHelper;
18
+ extern VALUE mUtils;
19
+ extern VALUE mSpans;
77
20
 
78
21
  /* Classes */
79
- extern VALUE cRAMDirectory;
80
- extern VALUE cIndexIn;
81
- extern VALUE cBufferedIndexIn;
82
- extern VALUE cFSIndexIn;
83
- extern VALUE cIndexOut;
84
- extern VALUE cBufferedIndexOut;
85
- extern VALUE cFSIndexOut;
86
- extern VALUE cRAMIndexOut;
87
- extern VALUE cRAMIndexIn;
88
- extern VALUE cTerm;
89
- extern VALUE cTermBuffer;
90
- extern VALUE cTermInfo;
91
- extern VALUE cToken;
92
- extern VALUE cPriorityQueue;
93
- extern VALUE cSegmentMergeQueue;
94
- extern VALUE cTermEnum;
95
- extern VALUE cTermInfosReader;
96
- extern VALUE cSegmentTermEnum;
97
- extern VALUE cSimilarity;
98
- extern VALUE cDefaultSimilarity;
22
+ extern VALUE cDirectory;
99
23
 
100
24
  /* Ferret Inits */
101
- extern void Init_indexio();
102
25
  extern void Init_term();
103
- extern void Init_term_info();
104
- extern void Init_term_infos_reader();
105
- extern void Init_term_buffer();
106
- extern void Init_priority_queue();
107
- extern void Init_token();
108
- extern void Init_segment_merge_queue();
109
- extern void Init_segment_term_enum();
110
- extern void Init_ram_directory();
111
- extern void Init_string_helper();
112
- extern void Init_similarity();
113
-
114
- /* External functions */
115
- extern int frt_hash(register char *p, register int len);
116
- extern unsigned long long frt_read_vint(VALUE self, IndexBuffer *my_buf);
117
- extern VALUE frt_indexin_read_long(VALUE self);
118
- extern VALUE frt_indexin_read_int(VALUE self);
119
- extern VALUE frt_indexin_seek(VALUE self, VALUE pos);
120
- extern VALUE frt_termbuffer_to_term(VALUE self);
121
- extern void frt_read_chars(VALUE self, char *buf, int offset, int len);
122
- extern void frt_write_bytes(VALUE self, byte_t *buf, int len);
123
- extern int frt_term_compare_to_int(VALUE self, VALUE rother);
124
- extern VALUE frt_termbuffer_init_copy(VALUE self, VALUE rother);
125
- extern VALUE frt_termbuffer_read(VALUE self, VALUE input, VALUE info);
126
- extern inline int frt_term_cmp(Term *t1, Term *t2);
26
+ extern void Init_dir();
27
+ extern void Init_analysis();
28
+ extern void Init_doc();
29
+ extern void Init_index_io();
30
+ extern void Init_search();
31
+ extern void Init_qparser();
32
+ //extern void object_add(void *key, VALUE obj);
33
+ #define object_add(key, obj) object_add2(key, obj, __FILE__, __LINE__, __func__)
34
+ extern void object_add2(void *key, VALUE obj, const char *file, int line, const char *func);
35
+ //extern void object_del(void *key);
36
+ #define object_del(key) object_del2(key, __FILE__, __LINE__, __func__)
37
+ extern void object_del2(void *key, const char *file, int line, const char *func);
38
+ extern void frt_gc_mark(void *key);
39
+ extern VALUE object_get(void *key);
40
+ extern VALUE frt_data_alloc(VALUE klass);
41
+ extern VALUE frt_get_doc(Document *doc);
42
+ extern void frt_deref_free(void *p);
43
+
44
+
45
+ #define Frt_Make_Struct(klass)\
46
+ rb_data_object_alloc(klass,NULL,(RUBY_DATA_FUNC)NULL,(RUBY_DATA_FUNC)NULL)
47
+
48
+ #define Frt_Wrap_Struct(self,mmark,mfree,mdata)\
49
+ do {\
50
+ ((struct RData *)(self))->data = mdata;\
51
+ ((struct RData *)(self))->dmark = mmark;\
52
+ ((struct RData *)(self))->dfree = mfree;\
53
+ } while (0)
54
+
55
+ #define Frt_Unwrap_Struct(self)\
56
+ do {\
57
+ ((struct RData *)(self))->data = NULL;\
58
+ ((struct RData *)(self))->dmark = NULL;\
59
+ ((struct RData *)(self))->dfree = NULL;\
60
+ } while (0)
127
61
 
128
62
  #endif
data/ext/field.c ADDED
@@ -0,0 +1,395 @@
1
+ #include <index.h>
2
+ #include <string.h>
3
+
4
+ /****************************************************************************
5
+ *
6
+ * FieldInfo
7
+ *
8
+ ****************************************************************************/
9
+
10
+ FieldInfo *fi_create(char *name, int number, bool is_indexed,
11
+ bool store_tv, bool store_pos, bool store_offset, bool omit_norms)
12
+ {
13
+ FieldInfo *fi = ALLOC(FieldInfo);
14
+ fi->name = estrdup(name);
15
+ fi->number = number;
16
+ fi->is_indexed = is_indexed;
17
+ fi->store_tv = store_tv;
18
+ fi->store_offset = store_offset;
19
+ fi->store_pos = store_pos;
20
+ fi->omit_norms = omit_norms;
21
+ return fi;
22
+ }
23
+
24
+ void fi_destroy(void *p)
25
+ {
26
+ FieldInfo *fi = (FieldInfo *)p;
27
+ free(fi->name);
28
+ free(fi);
29
+ }
30
+
31
+ /****************************************************************************
32
+ *
33
+ * FieldInfos
34
+ *
35
+ ****************************************************************************/
36
+
37
+ FieldInfos *fis_create()
38
+ {
39
+ FieldInfos *fis = ALLOC(FieldInfos);
40
+ fis->by_name = ht_create();
41
+ fis->by_number = NULL;
42
+ fis->fcnt = 0;
43
+ return fis;
44
+ }
45
+
46
+ FieldInfos *fis_open(Store *store, char *filename)
47
+ {
48
+ FieldInfos *fis = fis_create();
49
+ InStream *is = store->open_input(store, filename);
50
+ fis_read(fis, is);
51
+ is_close(is);
52
+ return fis;
53
+ }
54
+
55
+ void fis_destroy(void *p)
56
+ {
57
+ int i;
58
+ FieldInfos *fis = (FieldInfos *)p;
59
+ for (i = 0; i < fis->fcnt; i++) {
60
+ fi_destroy(fis->by_number[i]);
61
+ }
62
+ ht_destroy(fis->by_name);
63
+ free(fis->by_number);
64
+ free(fis);
65
+ }
66
+
67
+ FieldInfo *fis_add(FieldInfos *fis,
68
+ char *name,
69
+ bool is_indexed,
70
+ bool store_tv,
71
+ bool store_pos,
72
+ bool store_offset,
73
+ bool omit_norms)
74
+ {
75
+ FieldInfo *fi = ht_get(fis->by_name, name);
76
+ if (fi == NULL) {
77
+ fi = fi_create(name, fis->fcnt, is_indexed, store_tv,
78
+ store_pos, store_offset, omit_norms);
79
+ fis->fcnt++;
80
+ REALLOC_N(fis->by_number, FieldInfo *, fis->fcnt);
81
+
82
+ fis->by_number[fi->number] = fi;
83
+ ht_set(fis->by_name, name, fi);
84
+ } else {
85
+ if (fi->is_indexed != is_indexed)
86
+ fi->is_indexed = true; // once indexed, always index
87
+ if (fi->store_tv != store_tv)
88
+ fi->store_tv = true; // once vector, always vector
89
+ if (fi->store_pos != store_pos)
90
+ fi->store_pos = true; // once positions are stored, always store positions
91
+ if (fi->store_offset != store_offset)
92
+ fi->store_offset = true; // once offsets are stored, always store offsets
93
+ if (fi->omit_norms != omit_norms)
94
+ fi->omit_norms = false; // once kept, always keep
95
+ }
96
+ return fi;
97
+ }
98
+
99
+ void fis_add_fields(FieldInfos *fis,
100
+ HashSet *field_names,
101
+ bool is_indexed,
102
+ bool store_tv,
103
+ bool store_pos,
104
+ bool store_offset,
105
+ bool omit_norms)
106
+ {
107
+ int i;
108
+ for (i = 0; i < field_names->size; i++) {
109
+ fis_add(fis, field_names->elems[i], is_indexed, store_tv, store_pos,
110
+ store_offset, omit_norms);
111
+ }
112
+ hs_destroy(field_names);
113
+ }
114
+
115
+ bool fis_has_vectors(FieldInfos *fis)
116
+ {
117
+ int i;
118
+ for (i = 0; i < fis->fcnt; i++) {
119
+ if (fis->by_number[i]->store_tv)
120
+ return true;
121
+ }
122
+ return false;
123
+ }
124
+
125
+ FieldInfo *fis_get_fi(FieldInfos *fis, char *name)
126
+ {
127
+ return (FieldInfo *)ht_get(fis->by_name, name);
128
+ }
129
+
130
+ unsigned long long fis_get_number(FieldInfos *fis, char *name)
131
+ {
132
+ FieldInfo *fi = (FieldInfo *)ht_get(fis->by_name, name);
133
+ if (fi == NULL)
134
+ return 0xFFFFFFFFull; // to be compatible with Java version
135
+ else
136
+ return fi->number;
137
+ }
138
+
139
+ #define IS_INDEXED 0x01
140
+ #define STORE_TV 0x02
141
+ #define STORE_POS 0x04
142
+ #define STORE_OFFSET 0x08
143
+ #define OMIT_NORMS 0x10
144
+
145
+ int fi_field_info_byte(FieldInfo *fi)
146
+ {
147
+ int bits = 0x0;
148
+ if (fi->is_indexed)
149
+ bits |= IS_INDEXED;
150
+ if (fi->store_tv)
151
+ bits |= STORE_TV;
152
+ if (fi->store_pos)
153
+ bits |= STORE_POS;
154
+ if (fi->store_offset)
155
+ bits |= STORE_OFFSET;
156
+ if (fi->omit_norms)
157
+ bits |= OMIT_NORMS;
158
+ return bits;
159
+ }
160
+
161
+ void fis_write(FieldInfos *fis, Store *store, char *segment, char *ext)
162
+ {
163
+ int i;
164
+ FieldInfo *fi;
165
+ char fname[SEGMENT_NAME_MAX_LENGTH];
166
+ strcpy(fname, segment);
167
+ strcat(fname, ext);
168
+ OutStream *os = store->create_output(store, fname);
169
+ os_write_vint(os, fis->fcnt);
170
+ for (i = 0; i < fis->fcnt; i++) {
171
+ fi = fis->by_number[i];
172
+ os_write_string(os, fi->name);
173
+ os_write_vint(os, fi_field_info_byte(fi));
174
+ }
175
+ os_close(os);
176
+ }
177
+
178
+ FieldInfos *fis_read(FieldInfos *fis, InStream *is)
179
+ {
180
+ int i, size = is_read_vint(is); //read in the size
181
+ int bits, is_indexed, store_tv, store_pos, store_offset, omit_norms;
182
+ char *name;
183
+ for (i = 0; i < size; i++) {
184
+ name = is_read_string(is);
185
+ bits = is_read_byte(is);
186
+ is_indexed = (bits & IS_INDEXED) != 0;
187
+ store_tv = (bits & STORE_TV) != 0;
188
+ store_pos = (bits & STORE_POS) != 0;
189
+ store_offset = (bits & STORE_OFFSET) != 0;
190
+ omit_norms = (bits & OMIT_NORMS) != 0;
191
+ fis_add(fis, name, is_indexed, store_tv,
192
+ store_pos, store_offset, omit_norms);
193
+ free(name);
194
+ }
195
+ return fis;
196
+ }
197
+
198
+ FieldInfos *fis_add_doc(FieldInfos *fis, Document *doc)
199
+ {
200
+ int i;
201
+ DocField *df;
202
+ for (i = 0; i < doc->dfcnt; i++) {
203
+ df = doc->df_arr[i];
204
+ fis_add(fis, df->name, df->is_indexed, df->store_tv,
205
+ df->store_pos, df->store_offset, df->omit_norms);
206
+ }
207
+ return fis;
208
+ }
209
+
210
+ /****************************************************************************
211
+ *
212
+ * FieldsWriter
213
+ *
214
+ ****************************************************************************/
215
+
216
+ FieldsWriter *fw_open(Store *store, char *segment, FieldInfos *fis)
217
+ {
218
+ char buf[SEGMENT_NAME_MAX_LENGTH];
219
+ int slen = strlen(segment);
220
+ strcpy(buf, segment);
221
+
222
+ FieldsWriter *fw = ALLOC(FieldsWriter);
223
+ fw->fis = fis;
224
+ strcpy(buf+slen, ".fdt");
225
+ fw->fields_out = store->create_output(store, buf);
226
+ strcpy(buf+slen, ".fdx");
227
+ fw->index_out = store->create_output(store, buf);
228
+ return fw;
229
+ }
230
+
231
+ void fw_close(FieldsWriter *fw)
232
+ {
233
+ os_close(fw->fields_out);
234
+ os_close(fw->index_out);
235
+ free(fw);
236
+ }
237
+
238
+ void save_data(OutStream *fout, char *data, int dlen)
239
+ {
240
+ os_write_vint(fout, dlen);
241
+ os_write_bytes(fout, (uchar *)data, dlen);
242
+ }
243
+
244
+ void fw_add_doc(FieldsWriter *fw, Document *doc)
245
+ {
246
+ int i, bits;
247
+ OutStream *fout = fw->fields_out, *iout = fw->index_out;
248
+ os_write_long(iout, os_pos(fout));
249
+ DocField *df;
250
+ char *data;
251
+
252
+ int stored_count = 0;
253
+ for (i = 0; i < doc->dfcnt; i++) {
254
+ if (doc->df_arr[i]->is_stored)
255
+ stored_count++;
256
+ }
257
+ os_write_vint(fout, stored_count);
258
+
259
+ for (i = 0; i < doc->dfcnt; i++) {
260
+ df = doc->df_arr[i];
261
+ if (df->is_stored) {
262
+ os_write_vint(fout, ((FieldInfo *)ht_get(fw->fis->by_name, df->name))->number);
263
+
264
+ bits = 0;
265
+ if (df->is_tokenized)
266
+ bits |= FIELD_IS_TOKENIZED;
267
+ if (df->is_binary)
268
+ bits |= FIELD_IS_BINARY;
269
+ if (df->is_compressed)
270
+ bits |= FIELD_IS_COMPRESSED;
271
+ os_write_byte(fout, bits);
272
+
273
+ data = NULL;
274
+ if (df->is_compressed) {
275
+ // Not compressing just yet but we'll save it anyway
276
+ if (df->is_binary) {
277
+ save_data(fout, df->data, df->blen);
278
+ } else {
279
+ os_write_string(fout, df->data);
280
+ }
281
+ } else {
282
+ if (df->is_binary) {
283
+ save_data(fout, df->data, df->blen);
284
+ } else {
285
+ os_write_string(fout, df->data);
286
+ }
287
+ }
288
+ }
289
+ }
290
+ }
291
+
292
+ /****************************************************************************
293
+ *
294
+ * FieldsReader
295
+ *
296
+ ****************************************************************************/
297
+
298
+ FieldsReader *fr_open(Store *store, char *segment, FieldInfos *fis)
299
+ {
300
+ char buf[100];
301
+ int slen = strlen(segment);
302
+ strcpy(buf, segment);
303
+
304
+ FieldsReader *fr = ALLOC(FieldsReader);
305
+ fr->fis = fis;
306
+ strcpy(buf+slen, ".fdt");
307
+ fr->fields_in = store->open_input(store, buf);
308
+ strcpy(buf+slen, ".fdx");
309
+ InStream *iin = fr->index_in = store->open_input(store, buf);
310
+ fr->len = iin->length_internal(iin)/8;
311
+ return fr;
312
+ }
313
+
314
+ void fr_close(FieldsReader *fr)
315
+ {
316
+ is_close(fr->fields_in);
317
+ is_close(fr->index_in);
318
+ free(fr);
319
+ }
320
+
321
+ Document *fr_get_doc(FieldsReader *fr, int doc_num)
322
+ {
323
+ int i, bits, dlen;
324
+ char *data;
325
+ int store, index, stv;
326
+ int is_compressed, is_tokenized, is_binary;
327
+ Document *doc = doc_create();
328
+ InStream *iin = fr->index_in;
329
+ InStream *fin = fr->fields_in;
330
+ is_seek(iin, doc_num * 8);
331
+ int position = is_read_long(iin);
332
+ is_seek(fin, position);
333
+ int field_cnt = is_read_vint(fin);
334
+ int field_number;
335
+ FieldInfo *fi;
336
+
337
+ for (i = 0; i < field_cnt; i++) {
338
+ field_number = is_read_vint(fin);
339
+ fi = fr->fis->by_number[field_number];
340
+
341
+ bits = is_read_byte(fin);
342
+
343
+ is_compressed = (bits & FIELD_IS_COMPRESSED) != 0;
344
+ is_tokenized = (bits & FIELD_IS_TOKENIZED) != 0;
345
+ is_binary = (bits & FIELD_IS_BINARY) != 0;
346
+
347
+ if (is_binary) {
348
+ dlen = is_read_vint(fin);
349
+ data = ALLOC_N(char, dlen);
350
+ is_read_bytes(fin, (uchar *)data, 0, dlen);
351
+ if (is_compressed) {
352
+ doc_add_field(doc, df_create_binary(fi->name, data, dlen, DF_STORE_COMPRESS));
353
+ } else {
354
+ doc_add_field(doc, df_create_binary(fi->name, data, dlen, DF_STORE_YES));
355
+ }
356
+ } else {
357
+ store = DF_STORE_YES;
358
+ if (!fi->is_indexed) {
359
+ index = DF_INDEX_NO;
360
+ } else if (is_tokenized) {
361
+ index = DF_INDEX_TOKENIZED;
362
+ } else if (fi->omit_norms) {
363
+ index = DF_INDEX_NO_NORMS;
364
+ } else {
365
+ index = DF_INDEX_UNTOKENIZED;
366
+ }
367
+ data = NULL;
368
+ if (is_compressed) {
369
+ store = DF_STORE_COMPRESS;
370
+ dlen = is_read_vint(fin);
371
+ data = ALLOC_N(char, (dlen + 1));
372
+ data[dlen] = '\0';
373
+ is_read_bytes(fin, (uchar *)data, 0, dlen);
374
+ } else {
375
+ data = is_read_string(fin);
376
+ }
377
+ stv = DF_TERM_VECTOR_NO;
378
+ if (fi->store_tv) {
379
+ if (fi->store_pos && fi->store_offset) {
380
+ stv = DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS;
381
+ } else if (fi->store_pos) {
382
+ stv = DF_TERM_VECTOR_WITH_POSITIONS;
383
+ } else if (fi->store_offset) {
384
+ stv = DF_TERM_VECTOR_WITH_OFFSETS;
385
+ } else {
386
+ stv = DF_TERM_VECTOR_YES;
387
+ }
388
+ }
389
+ doc_add_field(doc, df_create(fi->name, data, store, index, stv));
390
+ }
391
+ }
392
+
393
+ return doc;
394
+ }
395
+