ferret 0.3.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/r_index_io.c ADDED
@@ -0,0 +1,996 @@
1
+ #include "ferret.h"
2
+ #include "index.h"
3
+
4
+ VALUE cTVOffsetInfo;
5
+ VALUE cTermVector;
6
+ VALUE cTermDocEnum;
7
+ VALUE cIndexWriter;
8
+ VALUE cIndexReader;
9
+ VALUE cTermEnum;
10
+
11
+ VALUE ranalyzer_key;
12
+ VALUE rclose_dir_key;
13
+ VALUE rcreate_key;
14
+ VALUE rcreate_if_missing_key;
15
+ VALUE ruse_compound_file_key;
16
+ VALUE rmerge_factor_key;
17
+ VALUE rmin_merge_docs_key;
18
+ VALUE rmax_merge_docs_key;
19
+ VALUE rmax_field_length_key;
20
+ VALUE rterm_index_interval_key;
21
+
22
+ extern void frt_set_term(VALUE rterm, Term *t);
23
+ extern VALUE frt_get_rterm(char *field, char *text);
24
+
25
+ /****************************************************************************
26
+ *
27
+ * TermEnum Methods
28
+ *
29
+ ****************************************************************************/
30
+
31
+ static void
32
+ frt_te_free(void *p)
33
+ {
34
+ TermEnum *te = (TermEnum *)p;
35
+ te->close(te);
36
+ }
37
+
38
+ #define GET_TE TermEnum *te; Data_Get_Struct(self, TermEnum, te)
39
+ static VALUE
40
+ frt_te_next(VALUE self)
41
+ {
42
+ GET_TE;
43
+ return te->next(te) ? Qtrue : Qfalse;
44
+ }
45
+
46
+ static VALUE
47
+ frt_te_term(VALUE self)
48
+ {
49
+ GET_TE;
50
+ if (!te->tb_curr) return Qnil;
51
+ return frt_get_rterm(te->tb_curr->field, te->tb_curr->text);
52
+ }
53
+
54
+ static VALUE
55
+ frt_te_doc_freq(VALUE self)
56
+ {
57
+ GET_TE;
58
+ if (!te->tb_curr) return Qnil;
59
+ return INT2FIX(te->ti_curr->doc_freq);
60
+ }
61
+
62
+ static VALUE
63
+ frt_te_close(VALUE self)
64
+ {
65
+ GET_TE;
66
+ Frt_Unwrap_Struct(self);
67
+ te->close(te);
68
+ return Qnil;
69
+ }
70
+
71
+ static VALUE
72
+ frt_te_skip_to(VALUE self, VALUE rterm)
73
+ {
74
+ GET_TE;
75
+ Term t;
76
+ frt_set_term(rterm, &t);
77
+
78
+ return te_skip_to(te, &t) ? Qtrue : Qfalse;
79
+ }
80
+
81
+ /****************************************************************************
82
+ *
83
+ * TermVectorOffsetInfo Methods
84
+ *
85
+ ****************************************************************************/
86
+
87
+ void
88
+ frt_tvoi_free(void *p)
89
+ {
90
+ object_del(p);
91
+ tvoi_destroy(p);
92
+ }
93
+
94
+ static VALUE
95
+ frt_tvoi_init(VALUE self, VALUE rstart, VALUE rend)
96
+ {
97
+ TVOffsetInfo *tvoi = tvoi_create(FIX2INT(rstart), FIX2INT(rend));
98
+ Frt_Wrap_Struct(self, NULL, &frt_tvoi_free, tvoi);
99
+ object_add(tvoi, self);
100
+ return self;
101
+ }
102
+
103
+ #define GET_TVOI TVOffsetInfo *tvoi;\
104
+ Data_Get_Struct(self, TVOffsetInfo, tvoi)
105
+ static VALUE
106
+ frt_tvoi_set_start(VALUE self, VALUE rstart)
107
+ {
108
+ GET_TVOI;
109
+ tvoi->start = FIX2INT(rstart);
110
+ return Qnil;
111
+ }
112
+
113
+ static VALUE
114
+ frt_tvoi_get_start(VALUE self)
115
+ {
116
+ GET_TVOI;
117
+ return INT2FIX(tvoi->start);
118
+ }
119
+
120
+ static VALUE
121
+ frt_tvoi_set_end(VALUE self, VALUE rend)
122
+ {
123
+ GET_TVOI;
124
+ tvoi->end = FIX2INT(rend);
125
+ return Qnil;
126
+ }
127
+
128
+ static VALUE
129
+ frt_tvoi_get_end(VALUE self)
130
+ {
131
+ GET_TVOI;
132
+ return INT2FIX(tvoi->end);
133
+ }
134
+
135
+ static VALUE
136
+ frt_tvoi_eql(VALUE self, VALUE rother)
137
+ {
138
+ if (TYPE(rother) != T_DATA) return Qfalse;
139
+ TVOffsetInfo *other;
140
+ GET_TVOI;
141
+ Data_Get_Struct(rother, TVOffsetInfo, other);
142
+
143
+ return ((tvoi->start == other->start) && (tvoi->end == other->end))
144
+ ? Qtrue : Qfalse;
145
+ }
146
+
147
+ static VALUE
148
+ frt_tvoi_hash(VALUE self, VALUE rother)
149
+ {
150
+ GET_TVOI;
151
+ return INT2FIX(29 * tvoi->start + tvoi->end);
152
+ }
153
+
154
+ static VALUE
155
+ frt_tvoi_to_s(VALUE self)
156
+ {
157
+ char buf[60];
158
+ GET_TVOI;
159
+ sprintf(buf, "TermVectorOffsetInfo(%d:%d)", tvoi->start, tvoi->end);
160
+ return rb_str_new2(buf);
161
+ }
162
+
163
+ /****************************************************************************
164
+ *
165
+ * TermVector Methods
166
+ *
167
+ ****************************************************************************/
168
+
169
+ void
170
+ frt_tv_free(void *p)
171
+ {
172
+ int i;
173
+ TermVector *tv = (TermVector *)p;
174
+ for (i = 0; i < tv->tcnt; i++) {
175
+ free(tv->terms[i]);
176
+ }
177
+ free(tv->terms);
178
+ if (tv->positions) {
179
+ for (i = 0; i < tv->tcnt; i++) {
180
+ free(tv->positions[i]);
181
+ }
182
+ free(tv->positions);
183
+ }
184
+ if (tv->offsets) {
185
+ for (i = 0; i < tv->tcnt; i++) {
186
+ free(tv->offsets[i]);
187
+ }
188
+ free(tv->offsets);
189
+ }
190
+ free(tv->freqs);
191
+ object_del(p);
192
+ free(p);
193
+ }
194
+
195
+ void
196
+ frt_tv_mark(void *p)
197
+ {
198
+ int i, j;
199
+ TermVector *tv = (TermVector *)p;
200
+ if (tv->offsets != NULL) {
201
+ for (i = 0; i < tv->tcnt; i++) {
202
+ for (j = 0; j < tv->freqs[i]; j++) {
203
+ frt_gc_mark(tv->offsets[i][j]);
204
+ }
205
+ }
206
+ }
207
+ }
208
+
209
+ static VALUE
210
+ frt_get_tv(TermVector *tv)
211
+ {
212
+ VALUE self = Qnil;
213
+ if (tv) {
214
+ self = object_get(tv);
215
+ if (self == Qnil) {
216
+ self = Data_Wrap_Struct(cTermVector, &frt_tv_mark, &frt_tv_free, tv);
217
+ if (tv->offsets) {
218
+ TVOffsetInfo *tvoi;
219
+ VALUE rtvoi;
220
+ int i, j;
221
+ for (i = 0; i < tv->tcnt; i++) {
222
+ for (j = 0; j < tv->freqs[i]; j++) {
223
+ tvoi = tv->offsets[i][j];
224
+ if (object_get(tvoi) == Qnil) {
225
+ rtvoi = Data_Wrap_Struct(cTVOffsetInfo, NULL, &frt_tvoi_free, tvoi);
226
+ object_add(tvoi, rtvoi);
227
+ }
228
+ }
229
+ }
230
+ }
231
+ object_add(tv, self);
232
+ }
233
+ }
234
+ return self;
235
+ }
236
+
237
+ #define GET_TV TermVector *tv; Data_Get_Struct(self, TermVector, tv);
238
+ static VALUE
239
+ frt_tv_get_field(VALUE self)
240
+ {
241
+ GET_TV;
242
+ return rb_str_new2(tv->field);
243
+ }
244
+
245
+ static VALUE
246
+ frt_tv_get_terms(VALUE self)
247
+ {
248
+ int i;
249
+ GET_TV;
250
+ VALUE rterms = rb_ary_new2(tv->tcnt);
251
+ for (i = 0; i < tv->tcnt; i++) {
252
+ rb_ary_push(rterms, rb_str_new2(tv->terms[i]));
253
+ }
254
+ return rterms;
255
+ }
256
+
257
+ static VALUE
258
+ frt_tv_get_freqs(VALUE self)
259
+ {
260
+ int i;
261
+ GET_TV;
262
+ VALUE rfreqs = rb_ary_new2(tv->tcnt);
263
+ for (i = 0; i < tv->tcnt; i++) {
264
+ rb_ary_push(rfreqs, INT2FIX(tv->freqs[i]));
265
+ }
266
+ return rfreqs;
267
+ }
268
+
269
+ static VALUE
270
+ frt_tv_get_positions(VALUE self)
271
+ {
272
+ int i, j, freq;
273
+ GET_TV;
274
+ if (!tv->positions) return Qnil;
275
+ VALUE rpositions;
276
+ VALUE rpositionss = rb_ary_new2(tv->tcnt);
277
+ for (i = 0; i < tv->tcnt; i++) {
278
+ freq = tv->freqs[i];
279
+ rpositions = rb_ary_new2(freq);
280
+ for (j = 0; j < freq; j++) {
281
+ rb_ary_push(rpositions, INT2FIX(tv->positions[i][j]));
282
+ }
283
+ rb_ary_push(rpositionss, rpositions);
284
+ }
285
+ return rpositionss;
286
+ }
287
+
288
+ static VALUE
289
+ frt_tv_get_offsets(VALUE self)
290
+ {
291
+ int i, j, freq;
292
+ GET_TV;
293
+ if (!tv->offsets) return Qnil;
294
+ VALUE roffsets, roffset;
295
+ VALUE roffsetss = rb_ary_new2(tv->tcnt);
296
+ for (i = 0; i < tv->tcnt; i++) {
297
+ freq = tv->freqs[i];
298
+ roffsets = rb_ary_new2(freq);
299
+ for (j = 0; j < freq; j++) {
300
+ roffset = object_get(tv->offsets[i][j]);
301
+ rb_ary_push(roffsets, roffset);
302
+ }
303
+ rb_ary_push(roffsetss, roffsets);
304
+ }
305
+ return roffsetss;
306
+ }
307
+
308
+ /****************************************************************************
309
+ *
310
+ * TermDocEnum Methods
311
+ *
312
+ ****************************************************************************/
313
+
314
+ void
315
+ frt_tde_free(void *p)
316
+ {
317
+ TermDocEnum *tde = (TermDocEnum *)p;
318
+ tde->close(tde);
319
+ }
320
+
321
+ static VALUE
322
+ frt_get_tde(TermDocEnum *tde)
323
+ {
324
+ return Data_Wrap_Struct(cTermDocEnum, NULL, &frt_tde_free, tde);
325
+ }
326
+
327
+ #define GET_TDE TermDocEnum *tde; Data_Get_Struct(self, TermDocEnum, tde)
328
+ static VALUE
329
+ frt_tde_close(VALUE self)
330
+ {
331
+ GET_TDE;
332
+ Frt_Unwrap_Struct(self);
333
+ tde->close(tde);
334
+ return Qnil;
335
+ }
336
+
337
+ static VALUE
338
+ frt_tde_seek(VALUE self, VALUE rterm)
339
+ {
340
+ GET_TDE;
341
+ Term t;
342
+ frt_set_term(rterm, &t);
343
+ tde->seek(tde, &t);
344
+ return Qnil;
345
+ }
346
+
347
+ static VALUE
348
+ frt_tde_doc(VALUE self)
349
+ {
350
+ GET_TDE;
351
+ return INT2FIX(tde->doc_num(tde));
352
+ }
353
+
354
+ static VALUE
355
+ frt_tde_freq(VALUE self)
356
+ {
357
+ GET_TDE;
358
+ return INT2FIX(tde->freq(tde));
359
+ }
360
+
361
+ static VALUE
362
+ frt_tde_next(VALUE self)
363
+ {
364
+ GET_TDE;
365
+ return tde->next(tde) ? Qtrue : Qfalse;
366
+ }
367
+
368
+ static VALUE
369
+ frt_tde_next_position(VALUE self)
370
+ {
371
+ GET_TDE;
372
+ return INT2FIX(tde->next_position(tde));
373
+ }
374
+
375
+ static VALUE
376
+ frt_tde_read(VALUE self, VALUE rdocs, VALUE rfreqs)
377
+ {
378
+ int i;
379
+ GET_TDE;
380
+ Check_Type(rdocs, T_ARRAY);
381
+ Check_Type(rfreqs, T_ARRAY);
382
+ int req_num = MIN(RARRAY(rdocs)->len, RARRAY(rfreqs)->len);
383
+ int cnt = tde->read(tde, (int *)RARRAY(rdocs)->ptr,
384
+ (int *)RARRAY(rfreqs)->ptr, req_num);
385
+ for (i = 0; i < cnt; i++) {
386
+ RARRAY(rdocs)->ptr[i] = INT2FIX(RARRAY(rdocs)->ptr[i]);
387
+ RARRAY(rfreqs)->ptr[i] = INT2FIX(RARRAY(rfreqs)->ptr[i]);
388
+ }
389
+ return INT2FIX(cnt);
390
+ }
391
+
392
+ static VALUE
393
+ frt_tde_skip_to(VALUE self, VALUE rtarget)
394
+ {
395
+ GET_TDE;
396
+ return tde->skip_to(tde, FIX2INT(rtarget)) ? Qtrue : Qfalse;
397
+ }
398
+
399
+ /****************************************************************************
400
+ *
401
+ * IndexWriter Methods
402
+ *
403
+ ****************************************************************************/
404
+
405
+ void
406
+ frt_iw_free(void *p)
407
+ {
408
+ IndexWriter *iw = (IndexWriter *)p;
409
+ iw_close(iw);
410
+ }
411
+
412
+ void
413
+ frt_iw_mark(void *p)
414
+ {
415
+ IndexWriter *iw = (IndexWriter *)p;
416
+ frt_gc_mark(iw->analyzer);
417
+ frt_gc_mark(iw->store);
418
+ }
419
+
420
+ #define SET_INT_ATTR(attr) \
421
+ if (RTEST(rval = rb_hash_aref(roptions, r##attr##_key)))\
422
+ iw->attr = FIX2INT(rval);
423
+
424
+ static VALUE
425
+ frt_iw_init(int argc, VALUE *argv, VALUE self)
426
+ {
427
+ VALUE rdir, roptions, rval;
428
+ bool close_dir = false;
429
+ bool close_analyzer = true;
430
+ bool create = false;
431
+ bool use_compound_file = true;
432
+ Store *store;
433
+ Analyzer *analyzer = NULL;
434
+ IndexWriter *iw;
435
+ rb_scan_args(argc, argv, "02", &rdir, &roptions);
436
+ if (argc > 0) {
437
+ if (TYPE(rdir) == T_DATA) {
438
+ store = DATA_PTR(rdir);
439
+ } else {
440
+ rdir = rb_obj_as_string(rdir);
441
+ store = open_fs_store(RSTRING(rdir)->ptr);
442
+ close_dir = true;
443
+ }
444
+ } else {
445
+ store = open_ram_store();
446
+ close_dir = true;
447
+ }
448
+ if (argc == 2) {
449
+ Check_Type(roptions, T_HASH);
450
+ if (!close_dir) {
451
+ close_dir = RTEST(rb_hash_aref(roptions, rclose_dir_key));
452
+ }
453
+ /* use_compound_file defaults to true */
454
+ use_compound_file =
455
+ (rb_hash_aref(roptions, ruse_compound_file_key) == Qfalse) ? false : true;
456
+
457
+ rval = rb_hash_aref(roptions, ranalyzer_key);
458
+ if (rval == Qnil) {
459
+ analyzer = standard_analyzer_create();
460
+ } else {
461
+ Data_Get_Struct(rval, Analyzer, analyzer);
462
+ close_analyzer = false;
463
+ }
464
+ create = RTEST(rb_hash_aref(roptions, rcreate_key));
465
+ if (!create && RTEST(rb_hash_aref(roptions, rcreate_if_missing_key))) {
466
+ if (!store->exists(store, "segments")) {
467
+ create = true;
468
+ }
469
+ }
470
+ }
471
+ iw = iw_open(store, analyzer, create, close_dir, close_analyzer);
472
+ iw->use_compound_file = use_compound_file;
473
+
474
+ SET_INT_ATTR(merge_factor);
475
+ SET_INT_ATTR(min_merge_docs);
476
+ SET_INT_ATTR(max_merge_docs);
477
+ SET_INT_ATTR(max_field_length);
478
+ SET_INT_ATTR(term_index_interval);
479
+
480
+ Frt_Wrap_Struct(self, &frt_iw_mark, &frt_iw_free, iw);
481
+ return self;
482
+ }
483
+
484
+ #define GET_IW IndexWriter *iw; Data_Get_Struct(self, IndexWriter, iw)
485
+ static VALUE
486
+ frt_iw_close(VALUE self)
487
+ {
488
+ GET_IW;
489
+ Frt_Unwrap_Struct(self);
490
+ iw_close(iw);
491
+ return Qnil;
492
+ }
493
+
494
+ static VALUE
495
+ frt_iw_add_doc(VALUE self, VALUE rdoc)
496
+ {
497
+ GET_IW;
498
+ Document *doc;
499
+ Data_Get_Struct(rdoc, Document, doc);
500
+ iw_add_doc(iw, doc);
501
+ return Qnil;
502
+ }
503
+
504
+ static VALUE
505
+ frt_iw_set_merge_factor(VALUE self, VALUE val)
506
+ {
507
+ GET_IW;
508
+ iw->merge_factor = FIX2INT(val);
509
+ return Qnil;
510
+ }
511
+
512
+ static VALUE
513
+ frt_iw_set_min_merge_docs(VALUE self, VALUE val)
514
+ {
515
+ GET_IW;
516
+ iw->min_merge_docs = FIX2INT(val);
517
+ return Qnil;
518
+ }
519
+
520
+ static VALUE
521
+ frt_iw_set_max_merge_docs(VALUE self, VALUE val)
522
+ {
523
+ GET_IW;
524
+ iw->max_merge_docs = FIX2INT(val);
525
+ return Qnil;
526
+ }
527
+
528
+ static VALUE
529
+ frt_iw_set_max_field_length(VALUE self, VALUE val)
530
+ {
531
+ GET_IW;
532
+ iw->max_field_length = FIX2INT(val);
533
+ return Qnil;
534
+ }
535
+
536
+ static VALUE
537
+ frt_iw_set_term_index_interval(VALUE self, VALUE val)
538
+ {
539
+ GET_IW;
540
+ iw->term_index_interval = FIX2INT(val);
541
+ return Qnil;
542
+ }
543
+
544
+ static VALUE
545
+ frt_iw_set_use_compound_file(VALUE self, VALUE val)
546
+ {
547
+ GET_IW;
548
+ iw->use_compound_file = FIX2INT(val);
549
+ return Qnil;
550
+ }
551
+
552
+ static VALUE
553
+ frt_iw_get_doc_count(VALUE self)
554
+ {
555
+ GET_IW;
556
+ return INT2FIX(iw_doc_count(iw));
557
+ }
558
+
559
+ static VALUE
560
+ frt_iw_get_merge_factor(VALUE self)
561
+ {
562
+ GET_IW;
563
+ return INT2FIX(iw->merge_factor);
564
+ }
565
+
566
+ static VALUE
567
+ frt_iw_get_min_merge_docs(VALUE self)
568
+ {
569
+ GET_IW;
570
+ return INT2FIX(iw->min_merge_docs);
571
+ }
572
+
573
+ static VALUE
574
+ frt_iw_get_max_merge_docs(VALUE self)
575
+ {
576
+ GET_IW;
577
+ return INT2FIX(iw->max_merge_docs);
578
+ }
579
+
580
+ static VALUE
581
+ frt_iw_get_max_field_length(VALUE self)
582
+ {
583
+ GET_IW;
584
+ return INT2FIX(iw->max_field_length);
585
+ }
586
+
587
+ static VALUE
588
+ frt_iw_get_term_index_interval(VALUE self)
589
+ {
590
+ GET_IW;
591
+ return INT2FIX(iw->term_index_interval);
592
+ }
593
+
594
+ static VALUE
595
+ frt_iw_get_use_compound_file(VALUE self)
596
+ {
597
+ GET_IW;
598
+ return INT2FIX(iw->use_compound_file);
599
+ }
600
+
601
+ static VALUE
602
+ frt_iw_optimize(VALUE self)
603
+ {
604
+ GET_IW;
605
+ iw_optimize(iw);
606
+ return Qnil;
607
+ }
608
+
609
+ /****************************************************************************
610
+ *
611
+ * IndexReader Methods
612
+ *
613
+ ****************************************************************************/
614
+
615
+ void
616
+ frt_ir_free(void *p)
617
+ {
618
+ ir_close((IndexReader *)p);
619
+ }
620
+
621
+ void
622
+ frt_ir_mark(void *p)
623
+ {
624
+ IndexReader *ir = (IndexReader *)p;
625
+ frt_gc_mark(ir->store);
626
+ }
627
+
628
+ static VALUE
629
+ frt_ir_init(int argc, VALUE *argv, VALUE self)
630
+ {
631
+ VALUE rdir, rclose_dir;
632
+ bool close_dir = true;
633
+ Store *store = NULL;
634
+ IndexReader *ir;
635
+ switch (rb_scan_args(argc, argv, "11", &rdir, &rclose_dir)) {
636
+ case 2: close_dir = RTEST(rclose_dir);
637
+ case 1:
638
+ if (TYPE(rdir) == T_DATA) {
639
+ store = DATA_PTR(rdir);
640
+ } else {
641
+ rdir = rb_obj_as_string(rdir);
642
+ store = open_fs_store(RSTRING(rdir)->ptr);
643
+ close_dir = true;
644
+ }
645
+ }
646
+ ir = ir_open(store, close_dir);
647
+ Frt_Wrap_Struct(self, &frt_ir_mark, &frt_ir_free, ir);
648
+ return self;
649
+ }
650
+
651
+ static VALUE
652
+ frt_ir_open(int argc, VALUE *argv, VALUE klass)
653
+ {
654
+ VALUE self = Frt_Make_Struct(klass);
655
+ return frt_ir_init(argc, argv, self);
656
+ }
657
+
658
+ #define GET_IR IndexReader *ir; Data_Get_Struct(self, IndexReader, ir)
659
+ static VALUE
660
+ frt_ir_set_norm(VALUE self, VALUE rdoc_num, VALUE rfield, VALUE rval)
661
+ {
662
+ GET_IR;
663
+ rfield = rb_obj_as_string(rfield);
664
+ ir_set_norm(ir, FIX2INT(rdoc_num), RSTRING(rfield)->ptr, NUM2CHR(rval));
665
+ return Qnil;
666
+ }
667
+
668
+ static VALUE
669
+ frt_ir_get_norms(VALUE self, VALUE rfield)
670
+ {
671
+ GET_IR;
672
+ rfield = rb_obj_as_string(rfield);
673
+ uchar *norms = ir->get_norms(ir, RSTRING(rfield)->ptr);
674
+ if (norms) {
675
+ return rb_str_new((char *)norms, ir->max_doc(ir));
676
+ } else {
677
+ return Qnil;
678
+ }
679
+ }
680
+
681
+ static VALUE
682
+ frt_ir_get_norms_into(VALUE self, VALUE rfield, VALUE rnorms, VALUE roffset)
683
+ {
684
+ GET_IR;
685
+ rfield = rb_obj_as_string(rfield);
686
+ int offset = FIX2INT(roffset);
687
+ Check_Type(rnorms, T_STRING);
688
+ if (RSTRING(rnorms)->len < offset + ir->max_doc(ir)) {
689
+ rb_raise(rb_eArgError, "supplied a string of length:%d to IndexReader#get_norms_into but needed a string of length offset:%d + maxdoc:%d", RSTRING(rnorms)->len, offset, ir->max_doc(ir));
690
+ }
691
+
692
+ ir->get_norms_into(ir, RSTRING(rfield)->ptr, (uchar *)RSTRING(rnorms)->ptr, offset);
693
+ return Qnil;
694
+ }
695
+
696
+ static VALUE
697
+ frt_ir_commit(VALUE self)
698
+ {
699
+ GET_IR;
700
+ ir_commit(ir);
701
+ return Qnil;
702
+ }
703
+
704
+ static VALUE
705
+ frt_ir_close(VALUE self)
706
+ {
707
+ GET_IR;
708
+ Frt_Unwrap_Struct(self);
709
+ ir_close(ir);
710
+ return Qnil;
711
+ }
712
+
713
+ static VALUE
714
+ frt_ir_has_deletions(VALUE self)
715
+ {
716
+ GET_IR;
717
+ return ir->has_deletions(ir) ? Qtrue : Qfalse;
718
+ }
719
+
720
+ static VALUE
721
+ frt_ir_delete(VALUE self, VALUE rdoc_num)
722
+ {
723
+ GET_IR;
724
+ int doc_num = FIX2INT(rdoc_num);
725
+ ir_delete_doc(ir, doc_num);
726
+ return Qnil;
727
+ }
728
+
729
+ static VALUE
730
+ frt_ir_is_deleted(VALUE self, VALUE rdoc_num)
731
+ {
732
+ GET_IR;
733
+ int doc_num = FIX2INT(rdoc_num);
734
+ return ir->is_deleted(ir, doc_num) ? Qtrue : Qfalse;
735
+ }
736
+
737
+ static VALUE
738
+ frt_ir_max_doc(VALUE self)
739
+ {
740
+ GET_IR;
741
+ return INT2FIX(ir->max_doc(ir));
742
+ }
743
+
744
+ static VALUE
745
+ frt_ir_num_docs(VALUE self)
746
+ {
747
+ GET_IR;
748
+ return INT2FIX(ir->num_docs(ir));
749
+ }
750
+
751
+ static VALUE
752
+ frt_ir_undelete_all(VALUE self)
753
+ {
754
+ GET_IR;
755
+ ir_undelete_all(ir);
756
+ return Qnil;
757
+ }
758
+
759
+ static VALUE
760
+ frt_ir_get_doc(VALUE self, VALUE rdoc_num)
761
+ {
762
+ GET_IR;
763
+ Document *doc = ir->get_doc(ir, FIX2INT(rdoc_num));
764
+ return frt_get_doc(doc);
765
+ }
766
+
767
+ static VALUE
768
+ frt_ir_is_latest(VALUE self)
769
+ {
770
+ GET_IR;
771
+ return ir_is_latest(ir) ? Qtrue : Qfalse;
772
+ }
773
+
774
+ static VALUE
775
+ frt_ir_get_term_vector(VALUE self, VALUE rdoc_num, VALUE rfield)
776
+ {
777
+ GET_IR;
778
+ rfield = rb_obj_as_string(rfield);
779
+ TermVector *tv =
780
+ ir->get_term_vector(ir, FIX2INT(rdoc_num), RSTRING(rfield)->ptr);
781
+ return frt_get_tv(tv);
782
+ }
783
+
784
+ static VALUE
785
+ frt_ir_get_term_vectors(VALUE self, VALUE rdoc_num)
786
+ {
787
+ int i;
788
+ GET_IR;
789
+ Array *tvs = ir->get_term_vectors(ir, FIX2INT(rdoc_num));
790
+ VALUE rtvs = rb_ary_new2(tvs->size);
791
+ VALUE rtv;
792
+ for (i = 0; i < tvs->size; i++) {
793
+ rtv = frt_get_tv(tvs->elems[i]);
794
+ rb_ary_push(rtvs, rtv);
795
+ }
796
+ tvs->free_elem = NULL;
797
+ ary_destroy(tvs);
798
+
799
+ return rtvs;
800
+ }
801
+
802
+ static VALUE
803
+ frt_ir_term_docs(VALUE self)
804
+ {
805
+ GET_IR;
806
+ return frt_get_tde(ir->term_docs(ir));
807
+ }
808
+
809
+ static VALUE
810
+ frt_ir_term_docs_for(VALUE self, VALUE rterm)
811
+ {
812
+ GET_IR;
813
+ Term t;
814
+ frt_set_term(rterm, &t);
815
+ return frt_get_tde(ir_term_docs_for(ir, &t));
816
+ }
817
+
818
+ static VALUE
819
+ frt_ir_term_positions(VALUE self)
820
+ {
821
+ GET_IR;
822
+ return frt_get_tde(ir->term_positions(ir));
823
+ }
824
+
825
+ static VALUE
826
+ frt_ir_term_positions_for(VALUE self, VALUE rterm)
827
+ {
828
+ GET_IR;
829
+ Term t;
830
+ frt_set_term(rterm, &t);
831
+ return frt_get_tde(ir_term_positions_for(ir, &t));
832
+ }
833
+
834
+ static VALUE
835
+ frt_ir_doc_freq(VALUE self, VALUE rterm)
836
+ {
837
+ GET_IR;
838
+ Term t;
839
+ frt_set_term(rterm, &t);
840
+ return INT2FIX(ir->doc_freq(ir, &t));
841
+ }
842
+
843
+ static VALUE
844
+ frt_ir_terms(VALUE self)
845
+ {
846
+ TermEnum *te;
847
+ GET_IR;
848
+ te = ir->terms(ir);
849
+ return Data_Wrap_Struct(cTermEnum, NULL, &frt_te_free, te);
850
+ }
851
+
852
+ static VALUE
853
+ frt_ir_terms_from(VALUE self, VALUE rterm)
854
+ {
855
+ TermEnum *te;
856
+ Term t;
857
+ GET_IR;
858
+ frt_set_term(rterm, &t);
859
+ te = ir->terms_from(ir, &t);
860
+ return Data_Wrap_Struct(cTermEnum, NULL, &frt_te_free, te);
861
+ }
862
+
863
+ /****************************************************************************
864
+ *
865
+ * Init Function
866
+ *
867
+ ****************************************************************************/
868
+
869
+ void
870
+ Init_index_io(void)
871
+ {
872
+ ranalyzer_key = ID2SYM(rb_intern("analyzer"));
873
+ rclose_dir_key = ID2SYM(rb_intern("close_dir"));
874
+ rcreate_key = ID2SYM(rb_intern("create"));
875
+ rcreate_if_missing_key = ID2SYM(rb_intern("create_if_missing"));
876
+ ruse_compound_file_key = ID2SYM(rb_intern("use_compound_file"));
877
+ rmerge_factor_key = ID2SYM(rb_intern("merge_factor"));
878
+ rmin_merge_docs_key = ID2SYM(rb_intern("min_merge_docs"));
879
+ rmax_merge_docs_key = ID2SYM(rb_intern("max_merge_docs"));
880
+ rmax_field_length_key = ID2SYM(rb_intern("max_field_length"));
881
+ rterm_index_interval_key = ID2SYM(rb_intern("term_index_interval"));
882
+
883
+ /* TermEnum */
884
+ cTermEnum = rb_define_class_under(mIndex, "TermEnum", rb_cObject);
885
+ rb_define_alloc_func(cTermEnum, frt_data_alloc);
886
+
887
+ rb_define_method(cTermEnum, "next?", frt_te_next, 0);
888
+ rb_define_method(cTermEnum, "term", frt_te_term, 0);
889
+ rb_define_method(cTermEnum, "doc_freq", frt_te_doc_freq, 0);
890
+ rb_define_method(cTermEnum, "skip_to", frt_te_skip_to, 1);
891
+ rb_define_method(cTermEnum, "close", frt_te_close, 0);
892
+
893
+ /* TermVectorOffsetInfo */
894
+ cTVOffsetInfo = rb_define_class_under(mIndex, "TermVectorOffsetInfo", rb_cObject);
895
+ rb_define_alloc_func(cTVOffsetInfo, frt_data_alloc);
896
+
897
+ rb_define_method(cTVOffsetInfo, "initialize", frt_tvoi_init, 2);
898
+ rb_define_method(cTVOffsetInfo, "start=", frt_tvoi_set_start, 1);
899
+ rb_define_method(cTVOffsetInfo, "start", frt_tvoi_get_start, 0);
900
+ rb_define_method(cTVOffsetInfo, "end=", frt_tvoi_set_end, 1);
901
+ rb_define_method(cTVOffsetInfo, "end", frt_tvoi_get_end, 0);
902
+ rb_define_method(cTVOffsetInfo, "eql?", frt_tvoi_eql, 1);
903
+ rb_define_method(cTVOffsetInfo, "==", frt_tvoi_eql, 1);
904
+ rb_define_method(cTVOffsetInfo, "hash", frt_tvoi_hash, 0);
905
+ rb_define_method(cTVOffsetInfo, "to_s", frt_tvoi_to_s, 0);
906
+
907
+ /* TermVector */
908
+ cTermVector = rb_define_class_under(mIndex, "TermVector", rb_cObject);
909
+ rb_define_alloc_func(cTermVector, frt_data_alloc);
910
+ rb_define_method(cTermVector, "field", frt_tv_get_field, 0);
911
+ rb_define_method(cTermVector, "terms", frt_tv_get_terms, 0);
912
+ rb_define_method(cTermVector, "freqs", frt_tv_get_freqs, 0);
913
+ rb_define_method(cTermVector, "positions", frt_tv_get_positions, 0);
914
+ rb_define_method(cTermVector, "offsets", frt_tv_get_offsets, 0);
915
+
916
+ /* TermDocEnum */
917
+ cTermDocEnum = rb_define_class_under(mIndex, "TermDocEnum", rb_cObject);
918
+ rb_define_alloc_func(cTermDocEnum, frt_data_alloc);
919
+ rb_define_method(cTermDocEnum, "close", frt_tde_close, 0);
920
+ rb_define_method(cTermDocEnum, "seek", frt_tde_seek, 1);
921
+ rb_define_method(cTermDocEnum, "doc", frt_tde_doc, 0);
922
+ rb_define_method(cTermDocEnum, "freq", frt_tde_freq, 0);
923
+ rb_define_method(cTermDocEnum, "next?", frt_tde_next, 0);
924
+ rb_define_method(cTermDocEnum, "next_position", frt_tde_next_position, 0);
925
+ rb_define_method(cTermDocEnum, "read", frt_tde_read, 2);
926
+ rb_define_method(cTermDocEnum, "skip_to", frt_tde_skip_to, 1);
927
+
928
+ /* IndexWriter */
929
+ cIndexWriter = rb_define_class_under(mIndex, "IndexWriter", rb_cObject);
930
+ rb_define_alloc_func(cIndexWriter, frt_data_alloc);
931
+
932
+ rb_define_const(cIndexWriter, "WRITE_LOCK_TIMEOUT", INT2FIX(1));
933
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_TIMEOUT", INT2FIX(10));
934
+ rb_define_const(cIndexWriter, "WRITE_LOCK_NAME",
935
+ rb_str_new2(WRITE_LOCK_NAME));
936
+ rb_define_const(cIndexWriter, "COMMIT_LOCK_NAME",
937
+ rb_str_new2(COMMIT_LOCK_NAME));
938
+ rb_define_const(cIndexWriter, "DEFAULT_MERGE_FACTOR",
939
+ INT2FIX(config.merge_factor));
940
+ rb_define_const(cIndexWriter, "DEFAULT_MIN_MERGE_DOCS",
941
+ INT2FIX(config.min_merge_docs));
942
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_MERGE_DOCS",
943
+ INT2FIX(config.max_merge_docs));
944
+ rb_define_const(cIndexWriter, "DEFAULT_MAX_FIELD_LENGTH",
945
+ INT2FIX(config.max_field_length));
946
+ rb_define_const(cIndexWriter, "DEFAULT_TERM_INDEX_INTERVAL",
947
+ INT2FIX(config.term_index_interval));
948
+
949
+ rb_define_method(cIndexWriter, "initialize", frt_iw_init, -1);
950
+ rb_define_method(cIndexWriter, "close", frt_iw_close, 0);
951
+ rb_define_method(cIndexWriter, "add_document", frt_iw_add_doc, 1);
952
+ rb_define_method(cIndexWriter, "<<", frt_iw_add_doc, 1);
953
+ rb_define_method(cIndexWriter, "merge_factor", frt_iw_get_merge_factor, 0);
954
+ rb_define_method(cIndexWriter, "min_merge_docs", frt_iw_get_min_merge_docs, 0);
955
+ rb_define_method(cIndexWriter, "max_merge_docs", frt_iw_get_max_merge_docs, 0);
956
+ rb_define_method(cIndexWriter, "max_field_length", frt_iw_get_max_field_length, 0);
957
+ rb_define_method(cIndexWriter, "term_index_interval", frt_iw_get_term_index_interval, 0);
958
+ rb_define_method(cIndexWriter, "use_compound_file", frt_iw_get_use_compound_file, 0);
959
+ rb_define_method(cIndexWriter, "doc_count", frt_iw_get_doc_count, 0);
960
+ rb_define_method(cIndexWriter, "merge_factor=", frt_iw_set_merge_factor, 1);
961
+ rb_define_method(cIndexWriter, "min_merge_docs=", frt_iw_set_min_merge_docs, 1);
962
+ rb_define_method(cIndexWriter, "max_merge_docs=", frt_iw_set_max_merge_docs, 1);
963
+ rb_define_method(cIndexWriter, "max_field_length=", frt_iw_set_max_field_length, 1);
964
+ rb_define_method(cIndexWriter, "term_index_interval=", frt_iw_set_term_index_interval, 1);
965
+ rb_define_method(cIndexWriter, "use_compound_file=", frt_iw_set_use_compound_file, 1);
966
+ rb_define_method(cIndexWriter, "optimize", frt_iw_optimize, 0);
967
+
968
+ /* IndexReader */
969
+ cIndexReader = rb_define_class_under(mIndex, "IndexReader", rb_cObject);
970
+ rb_define_alloc_func(cIndexReader, frt_data_alloc);
971
+ rb_define_singleton_method(cIndexReader, "open", frt_ir_open, -1);
972
+ rb_define_method(cIndexReader, "initialize", frt_ir_init, -1);
973
+ rb_define_method(cIndexReader, "set_norm", frt_ir_set_norm, 3);
974
+ rb_define_method(cIndexReader, "get_norms", frt_ir_get_norms, 1);
975
+ rb_define_method(cIndexReader, "get_norms_into", frt_ir_get_norms_into, 3);
976
+ rb_define_method(cIndexReader, "commit", frt_ir_commit, 0);
977
+ rb_define_method(cIndexReader, "close", frt_ir_close, 0);
978
+ rb_define_method(cIndexReader, "has_deletions?", frt_ir_has_deletions, 0);
979
+ rb_define_method(cIndexReader, "delete", frt_ir_delete, 1);
980
+ rb_define_method(cIndexReader, "deleted?", frt_ir_is_deleted, 1);
981
+ rb_define_method(cIndexReader, "max_doc", frt_ir_max_doc, 0);
982
+ rb_define_method(cIndexReader, "num_docs", frt_ir_num_docs, 0);
983
+ rb_define_method(cIndexReader, "undelete_all", frt_ir_undelete_all, 0);
984
+ rb_define_method(cIndexReader, "latest?", frt_ir_is_latest, 0);
985
+ rb_define_method(cIndexReader, "get_document", frt_ir_get_doc, 1);
986
+ rb_define_method(cIndexReader, "[]", frt_ir_get_doc, 1);
987
+ rb_define_method(cIndexReader, "get_term_vector", frt_ir_get_term_vector, 2);
988
+ rb_define_method(cIndexReader, "get_term_vectors", frt_ir_get_term_vectors, 1);
989
+ rb_define_method(cIndexReader, "term_docs", frt_ir_term_docs, 0);
990
+ rb_define_method(cIndexReader, "term_positions", frt_ir_term_positions, 0);
991
+ rb_define_method(cIndexReader, "term_docs_for", frt_ir_term_docs_for, 1);
992
+ rb_define_method(cIndexReader, "term_positions_for", frt_ir_term_positions_for, 1);
993
+ rb_define_method(cIndexReader, "doc_freq", frt_ir_doc_freq, 1);
994
+ rb_define_method(cIndexReader, "terms", frt_ir_terms, 0);
995
+ rb_define_method(cIndexReader, "terms_from", frt_ir_terms_from, 1);
996
+ }