ferret 0.3.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. data/CHANGELOG +9 -0
  2. data/Rakefile +51 -25
  3. data/ext/analysis.c +553 -0
  4. data/ext/analysis.h +76 -0
  5. data/ext/array.c +83 -0
  6. data/ext/array.h +19 -0
  7. data/ext/bitvector.c +164 -0
  8. data/ext/bitvector.h +29 -0
  9. data/ext/compound_io.c +335 -0
  10. data/ext/document.c +336 -0
  11. data/ext/document.h +87 -0
  12. data/ext/ferret.c +88 -47
  13. data/ext/ferret.h +43 -109
  14. data/ext/field.c +395 -0
  15. data/ext/filter.c +103 -0
  16. data/ext/fs_store.c +352 -0
  17. data/ext/global.c +219 -0
  18. data/ext/global.h +73 -0
  19. data/ext/hash.c +446 -0
  20. data/ext/hash.h +80 -0
  21. data/ext/hashset.c +141 -0
  22. data/ext/hashset.h +37 -0
  23. data/ext/helper.c +11 -0
  24. data/ext/helper.h +5 -0
  25. data/ext/inc/lang.h +41 -0
  26. data/ext/ind.c +389 -0
  27. data/ext/index.h +884 -0
  28. data/ext/index_io.c +269 -415
  29. data/ext/index_rw.c +2543 -0
  30. data/ext/lang.c +31 -0
  31. data/ext/lang.h +41 -0
  32. data/ext/priorityqueue.c +228 -0
  33. data/ext/priorityqueue.h +44 -0
  34. data/ext/q_boolean.c +1331 -0
  35. data/ext/q_const_score.c +154 -0
  36. data/ext/q_fuzzy.c +287 -0
  37. data/ext/q_match_all.c +142 -0
  38. data/ext/q_multi_phrase.c +343 -0
  39. data/ext/q_parser.c +2180 -0
  40. data/ext/q_phrase.c +657 -0
  41. data/ext/q_prefix.c +75 -0
  42. data/ext/q_range.c +247 -0
  43. data/ext/q_span.c +1566 -0
  44. data/ext/q_term.c +308 -0
  45. data/ext/q_wildcard.c +146 -0
  46. data/ext/r_analysis.c +255 -0
  47. data/ext/r_doc.c +578 -0
  48. data/ext/r_index_io.c +996 -0
  49. data/ext/r_qparser.c +158 -0
  50. data/ext/r_search.c +2321 -0
  51. data/ext/r_store.c +263 -0
  52. data/ext/r_term.c +219 -0
  53. data/ext/ram_store.c +447 -0
  54. data/ext/search.c +524 -0
  55. data/ext/search.h +1065 -0
  56. data/ext/similarity.c +143 -39
  57. data/ext/sort.c +661 -0
  58. data/ext/store.c +35 -0
  59. data/ext/store.h +152 -0
  60. data/ext/term.c +704 -143
  61. data/ext/termdocs.c +599 -0
  62. data/ext/vector.c +594 -0
  63. data/lib/ferret.rb +9 -10
  64. data/lib/ferret/analysis/analyzers.rb +2 -2
  65. data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
  66. data/lib/ferret/analysis/token.rb +14 -14
  67. data/lib/ferret/analysis/token_filters.rb +3 -3
  68. data/lib/ferret/document/field.rb +16 -17
  69. data/lib/ferret/index/document_writer.rb +4 -4
  70. data/lib/ferret/index/index.rb +39 -23
  71. data/lib/ferret/index/index_writer.rb +2 -2
  72. data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
  73. data/lib/ferret/index/segment_term_vector.rb +4 -4
  74. data/lib/ferret/index/term.rb +5 -1
  75. data/lib/ferret/index/term_vector_offset_info.rb +6 -6
  76. data/lib/ferret/index/term_vectors_io.rb +5 -5
  77. data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
  78. data/lib/ferret/search.rb +1 -1
  79. data/lib/ferret/search/boolean_query.rb +2 -1
  80. data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
  81. data/lib/ferret/search/fuzzy_query.rb +2 -1
  82. data/lib/ferret/search/index_searcher.rb +3 -0
  83. data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
  84. data/lib/ferret/search/multi_phrase_query.rb +6 -5
  85. data/lib/ferret/search/phrase_query.rb +3 -6
  86. data/lib/ferret/search/prefix_query.rb +4 -4
  87. data/lib/ferret/search/sort.rb +3 -1
  88. data/lib/ferret/search/sort_field.rb +9 -9
  89. data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
  90. data/lib/ferret/search/spans/span_near_query.rb +1 -1
  91. data/lib/ferret/search/spans/span_weight.rb +1 -1
  92. data/lib/ferret/search/spans/spans_enum.rb +7 -7
  93. data/lib/ferret/store/fs_store.rb +10 -6
  94. data/lib/ferret/store/ram_store.rb +3 -3
  95. data/lib/rferret.rb +36 -0
  96. data/test/functional/thread_safety_index_test.rb +2 -2
  97. data/test/test_helper.rb +16 -2
  98. data/test/unit/analysis/c_token.rb +25 -0
  99. data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
  100. data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
  101. data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
  102. data/test/unit/document/c_field.rb +98 -0
  103. data/test/unit/document/tc_field.rb +0 -66
  104. data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
  105. data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
  106. data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
  107. data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
  108. data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
  109. data/test/unit/index/tc_segment_term_vector.rb +2 -2
  110. data/test/unit/index/tc_term_vectors_io.rb +4 -4
  111. data/test/unit/query_parser/c_query_parser.rb +138 -0
  112. data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
  113. data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
  114. data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
  115. data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
  116. data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
  117. data/test/unit/search/c_sort_field.rb +27 -0
  118. data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
  119. data/test/unit/search/tc_sort_field.rb +7 -20
  120. data/test/unit/store/c_fs_store.rb +76 -0
  121. data/test/unit/store/c_ram_store.rb +35 -0
  122. data/test/unit/store/m_store.rb +34 -0
  123. data/test/unit/store/m_store_lock.rb +68 -0
  124. data/test/unit/store/tc_fs_store.rb +0 -53
  125. data/test/unit/store/tc_ram_store.rb +0 -20
  126. data/test/unit/store/tm_store.rb +0 -30
  127. data/test/unit/store/tm_store_lock.rb +0 -66
  128. metadata +84 -31
  129. data/ext/Makefile +0 -140
  130. data/ext/ferret_ext.so +0 -0
  131. data/ext/priority_queue.c +0 -232
  132. data/ext/ram_directory.c +0 -321
  133. data/ext/segment_merge_queue.c +0 -37
  134. data/ext/segment_term_enum.c +0 -326
  135. data/ext/string_helper.c +0 -42
  136. data/ext/tags +0 -344
  137. data/ext/term_buffer.c +0 -230
  138. data/ext/term_infos_reader.c +0 -54
  139. data/ext/terminfo.c +0 -160
  140. data/ext/token.c +0 -93
  141. data/ext/util.c +0 -12
data/ext/r_analysis.c ADDED
@@ -0,0 +1,255 @@
1
+ #include "ferret.h"
2
+ #include "analysis.h"
3
+
4
+ static VALUE cToken;
5
+ static VALUE cLetterTokenizer;
6
+
7
+ static VALUE cAnalyzer;
8
+ static VALUE cLetterAnalyzer;
9
+ static VALUE cWhiteSpaceAnalyzer;
10
+ static VALUE cStandardAnalyzer;
11
+
12
+ /****************************************************************************
13
+ *
14
+ * Token Methods
15
+ *
16
+ ****************************************************************************/
17
+
18
+ typedef struct RToken {
19
+ VALUE text;
20
+ int start;
21
+ int end;
22
+ int pos_inc;
23
+ } RToken;
24
+
25
+ static void
26
+ frt_token_free(void *p)
27
+ {
28
+ free(p);
29
+ }
30
+
31
+ static void
32
+ frt_token_mark(void *p)
33
+ {
34
+ RToken *token = (RToken *)p;
35
+ rb_gc_mark(token->text);
36
+ }
37
+
38
+ static VALUE
39
+ frt_token_alloc(VALUE klass)
40
+ {
41
+ return Data_Wrap_Struct(klass, &frt_token_mark, &frt_token_free, ALLOC(RToken));
42
+ }
43
+
44
+ #define GET_TK RToken *token; Data_Get_Struct(self, RToken, token);
45
+ static VALUE
46
+ frt_token_init(int argc, VALUE *argv, VALUE self)
47
+ {
48
+ GET_TK;
49
+ VALUE rtext, rstart, rend, rpos_inc, rtype;
50
+ token->pos_inc = 1;
51
+ switch (rb_scan_args(argc, argv, "32", &rtext, &rstart, &rend, &rpos_inc, &rtype)) {
52
+ case 5: /* type gets ignored at this stage */
53
+ case 4: token->pos_inc = FIX2INT(rpos_inc);
54
+ }
55
+ token->text = rb_obj_as_string(rtext);
56
+ token->start = FIX2INT(rstart);
57
+ token->end = FIX2INT(rend);
58
+ return self;
59
+ }
60
+
61
+ static VALUE
62
+ frt_token_cmp(VALUE self, VALUE rother)
63
+ {
64
+ RToken *other;
65
+ int cmp;
66
+ GET_TK;
67
+ Data_Get_Struct(rother, RToken, other);
68
+ if (token->start > other->start) {
69
+ cmp = 1;
70
+ } else if (token->start < other->start) {
71
+ cmp = -1;
72
+ } else {
73
+ if (token->end > other->end) {
74
+ cmp = 1;
75
+ } else if (token->end < other->end) {
76
+ cmp = -1;
77
+ } else {
78
+ cmp = strcmp(RSTRING(token->text)->ptr, RSTRING(other->text)->ptr);
79
+ }
80
+ }
81
+ return INT2FIX(cmp);
82
+ }
83
+
84
+ static VALUE
85
+ frt_token_get_text(VALUE self)
86
+ {
87
+ GET_TK;
88
+ return token->text;
89
+ }
90
+
91
+ static VALUE
92
+ frt_token_set_text(VALUE self, VALUE rtext)
93
+ {
94
+ GET_TK;
95
+ token->text = rtext;
96
+ return rtext;
97
+ }
98
+
99
+ static VALUE
100
+ frt_token_get_start_offset(VALUE self)
101
+ {
102
+ GET_TK;
103
+ return INT2FIX(token->start);
104
+ }
105
+
106
+ static VALUE
107
+ frt_token_get_end_offset(VALUE self)
108
+ {
109
+ GET_TK;
110
+ return INT2FIX(token->end);
111
+ }
112
+
113
+ static VALUE
114
+ frt_token_get_pos_inc(VALUE self)
115
+ {
116
+ GET_TK;
117
+ return INT2FIX(token->pos_inc);
118
+ }
119
+
120
+ static VALUE
121
+ frt_token_to_s(VALUE self)
122
+ {
123
+ GET_TK;
124
+ char *buf = alloca(RSTRING(token->text)->len + 80);
125
+ sprintf(buf, "token[\"%s\":%d:%d:%d]", RSTRING(token->text)->ptr, token->start,
126
+ token->end, token->pos_inc);
127
+ return rb_str_new2(buf);
128
+ }
129
+
130
+ /****************************************************************************
131
+ *
132
+ * Tokenizer Methods
133
+ *
134
+ ****************************************************************************/
135
+
136
+ static void
137
+ frt_tokenizer_free(void *p)
138
+ {
139
+ TokenStream *ts = (TokenStream *)p;
140
+ object_del(p);
141
+ ts->destroy(ts);
142
+ }
143
+
144
+ static VALUE
145
+ frt_letter_tokenizer_init(VALUE self, VALUE rstr)
146
+ {
147
+ TokenStream *ts = letter_tokenizer_create();
148
+ Frt_Wrap_Struct(self, NULL, &frt_tokenizer_free, ts);
149
+ return self;
150
+ }
151
+
152
+ /****************************************************************************
153
+ *
154
+ * Analyzer Methods
155
+ *
156
+ ****************************************************************************/
157
+
158
+ static void
159
+ frt_analyzer_free(void *p)
160
+ {
161
+ Analyzer *a = (Analyzer *)p;
162
+ object_del(a);
163
+ a->destroy(a);
164
+ }
165
+
166
+ VALUE
167
+ frt_get_analyzer(Analyzer *a)
168
+ {
169
+ VALUE self = Data_Wrap_Struct(cAnalyzer, NULL, &frt_analyzer_free, a);
170
+ object_add(a, self);
171
+ return self;
172
+ }
173
+
174
+ /*** WhiteSpaceAnalyzer ***/
175
+ static VALUE
176
+ frt_white_space_analyzer_init(VALUE self)
177
+ {
178
+ Analyzer *a = whitespace_analyzer_create();
179
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
180
+ object_add(a, self);
181
+ return self;
182
+ }
183
+
184
+ /*** LetterAnalyzer ***/
185
+ static VALUE
186
+ frt_letter_analyzer_init(VALUE self)
187
+ {
188
+ Analyzer *a = letter_analyzer_create();
189
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
190
+ object_add(a, self);
191
+ return self;
192
+ }
193
+
194
+ /*** StandardAnalyzer ***/
195
+ static VALUE
196
+ frt_standard_analyzer_init(VALUE self)
197
+ {
198
+ Analyzer *a = standard_analyzer_create();
199
+ Frt_Wrap_Struct(self, NULL, &frt_analyzer_free, a);
200
+ object_add(a, self);
201
+ return self;
202
+ }
203
+
204
+ /****************************************************************************
205
+ *
206
+ * Init Function
207
+ *
208
+ ****************************************************************************/
209
+
210
+ void
211
+ Init_analysis(void)
212
+ {
213
+ cToken = rb_define_class_under(mAnalysis, "Token", rb_cObject);
214
+ rb_define_alloc_func(cToken, frt_token_alloc);
215
+ rb_include_module(cToken, rb_mComparable);
216
+
217
+ rb_define_method(cToken, "initialize", frt_token_init, -1);
218
+ rb_define_method(cToken, "<=>", frt_token_cmp, 1);
219
+ rb_define_method(cToken, "text", frt_token_get_text, 0);
220
+ rb_define_method(cToken, "text=", frt_token_set_text, 1);
221
+ rb_define_method(cToken, "start_offset", frt_token_get_start_offset, 0);
222
+ rb_define_method(cToken, "end_offset", frt_token_get_end_offset, 0);
223
+ rb_define_method(cToken, "pos_inc", frt_token_get_pos_inc, 0);
224
+ rb_define_method(cToken, "to_s", frt_token_to_s, 0);
225
+
226
+ cLetterTokenizer =
227
+ rb_define_class_under(mAnalysis, "LetterTokenizer", rb_cObject);
228
+ rb_define_alloc_func(cLetterTokenizer, frt_data_alloc);
229
+ rb_define_method(cLetterTokenizer, "initialize",
230
+ frt_letter_tokenizer_init, 1);
231
+
232
+ cAnalyzer =
233
+ rb_define_class_under(mAnalysis, "Analyzer", rb_cObject);
234
+ rb_define_alloc_func(cAnalyzer, frt_data_alloc);
235
+ rb_define_method(cAnalyzer, "initialize",
236
+ frt_letter_analyzer_init, 0);
237
+
238
+ cLetterAnalyzer =
239
+ rb_define_class_under(mAnalysis, "LetterAnalyzer", cAnalyzer);
240
+ rb_define_alloc_func(cLetterAnalyzer, frt_data_alloc);
241
+ rb_define_method(cAnalyzer, "initialize",
242
+ frt_letter_analyzer_init, 0);
243
+
244
+ cWhiteSpaceAnalyzer =
245
+ rb_define_class_under(mAnalysis, "WhiteSpaceAnalyzer", cAnalyzer);
246
+ rb_define_alloc_func(cWhiteSpaceAnalyzer, frt_data_alloc);
247
+ rb_define_method(cWhiteSpaceAnalyzer, "initialize",
248
+ frt_white_space_analyzer_init, 0);
249
+
250
+ cStandardAnalyzer =
251
+ rb_define_class_under(mAnalysis, "StandardAnalyzer", cAnalyzer);
252
+ rb_define_alloc_func(cStandardAnalyzer, frt_data_alloc);
253
+ rb_define_method(cStandardAnalyzer, "initialize",
254
+ frt_standard_analyzer_init, 0);
255
+ }
data/ext/r_doc.c ADDED
@@ -0,0 +1,578 @@
1
+ #include "ferret.h"
2
+ #include "document.h"
3
+
4
+ VALUE cDocument;
5
+ VALUE cField;
6
+ VALUE cFieldStore;
7
+ VALUE cFieldIndex;
8
+ VALUE cFieldTermVector;
9
+
10
+ /****************************************************************************
11
+ *
12
+ * Field Methods
13
+ *
14
+ ****************************************************************************/
15
+
16
+ void
17
+ frt_field_free(void *p)
18
+ {
19
+ object_del(p);
20
+ df_destroy_data(p);
21
+ }
22
+
23
+ static VALUE
24
+ frt_field_alloc(VALUE klass)
25
+ {
26
+ VALUE self;
27
+ DocField *df = ALLOC(DocField);
28
+ df->name = NULL;
29
+ df->data = NULL;
30
+ self = Data_Wrap_Struct(klass, NULL, &frt_field_free, df);
31
+ object_add(df, self);
32
+ return self;
33
+ }
34
+
35
+ #define GET_DF DocField *df; Data_Get_Struct(self, DocField, df)
36
+ static VALUE
37
+ frt_field_init(int argc, VALUE *argv, VALUE self)
38
+ {
39
+ GET_DF;
40
+ VALUE rname, rdata, rstored, rindexed, rstore_tv, rbinary, rboost;
41
+ float boost = 1.0;
42
+ int stored = 0, indexed = 0, store_tv = 0;
43
+ bool binary = false;
44
+ switch (rb_scan_args(argc, argv, "25", &rname, &rdata, &rstored,
45
+ &rindexed, &rstore_tv, &rbinary, &rboost)) {
46
+ case 7: boost = (float)rb_num2dbl(rboost);
47
+ case 6: binary = RTEST(rbinary);
48
+ case 5: store_tv = FIX2INT(rstore_tv);
49
+ case 4: indexed = FIX2INT(rindexed);
50
+ case 3: stored = FIX2INT(rstored);
51
+ case 2:
52
+ rname = rb_obj_as_string(rname);
53
+ rdata = rb_obj_as_string(rdata);
54
+ break;
55
+ }
56
+ char *name = RSTRING(rname)->ptr;
57
+ int len = RSTRING(rdata)->len;
58
+ char *data = ALLOC_N(char, len + 1);
59
+ MEMCPY(data, RSTRING(rdata)->ptr, char, len);
60
+ data[len] = 0;
61
+ df_set(df, name, data, stored, indexed, store_tv);
62
+ df->blen = len;
63
+ df->is_binary = binary;
64
+ df->boost = boost;
65
+ return Qnil;
66
+ }
67
+
68
+ static VALUE
69
+ frt_field_get_name(VALUE self)
70
+ {
71
+ GET_DF;
72
+ return rb_str_new2(df->name);
73
+ }
74
+
75
+ static VALUE
76
+ frt_field_set_name(VALUE self, VALUE rname)
77
+ {
78
+ int len;
79
+ GET_DF;
80
+ rname = rb_obj_as_string(rname);
81
+ len = RSTRING(rname)->len;
82
+ REALLOC_N(df->name, char, len);
83
+ MEMCPY(df->name, RSTRING(rname)->ptr, char, len);
84
+ return Qnil;
85
+ }
86
+
87
+ static VALUE
88
+ frt_field_get_data(VALUE self)
89
+ {
90
+ GET_DF;
91
+ return rb_str_new(df->data, df->blen);
92
+ }
93
+
94
+ static VALUE
95
+ frt_field_set_data(VALUE self, VALUE rdata)
96
+ {
97
+ int len;
98
+ GET_DF;
99
+ rdata = rb_obj_as_string(rdata);
100
+ len = RSTRING(rdata)->len;
101
+ REALLOC_N(df->data, char, len);
102
+ MEMCPY(df->data, RSTRING(rdata)->ptr, char, len);
103
+ df->blen = len;
104
+ return Qnil;
105
+ }
106
+
107
+ static VALUE
108
+ frt_field_get_boost(VALUE self)
109
+ {
110
+ GET_DF;
111
+ return rb_float_new((double)df->boost);
112
+ }
113
+
114
+ static VALUE
115
+ frt_field_set_boost(VALUE self, VALUE rboost)
116
+ {
117
+ GET_DF;
118
+ df->boost = (float)rb_num2dbl(rboost);
119
+ return Qnil;
120
+ }
121
+
122
+ static VALUE
123
+ frt_field_is_stored(VALUE self)
124
+ {
125
+ GET_DF;
126
+ return df->is_stored ? Qtrue : Qfalse;
127
+ }
128
+
129
+ static VALUE
130
+ frt_field_is_indexed(VALUE self)
131
+ {
132
+ GET_DF;
133
+ return df->is_indexed ? Qtrue : Qfalse;
134
+ }
135
+
136
+ static VALUE
137
+ frt_field_is_tokenized(VALUE self)
138
+ {
139
+ GET_DF;
140
+ return df->is_tokenized ? Qtrue : Qfalse;
141
+ }
142
+
143
+ static VALUE
144
+ frt_field_is_binary(VALUE self)
145
+ {
146
+ GET_DF;
147
+ return df->is_binary ? Qtrue : Qfalse;
148
+ }
149
+
150
+ static VALUE
151
+ frt_field_is_compressed(VALUE self)
152
+ {
153
+ GET_DF;
154
+ return df->is_compressed ? Qtrue : Qfalse;
155
+ }
156
+
157
+ static VALUE
158
+ frt_field_store_tv(VALUE self)
159
+ {
160
+ GET_DF;
161
+ return df->store_tv ? Qtrue : Qfalse;
162
+ }
163
+
164
+ static VALUE
165
+ frt_field_store_pos(VALUE self)
166
+ {
167
+ GET_DF;
168
+ return df->store_pos ? Qtrue : Qfalse;
169
+ }
170
+
171
+ static VALUE
172
+ frt_field_store_offset(VALUE self)
173
+ {
174
+ GET_DF;
175
+ return df->store_offset ? Qtrue : Qfalse;
176
+ }
177
+
178
+ static VALUE
179
+ frt_field_omit_norms(VALUE self)
180
+ {
181
+ GET_DF;
182
+ return df->omit_norms ? Qtrue : Qfalse;
183
+ }
184
+
185
+ static VALUE
186
+ frt_field_to_s(VALUE self)
187
+ {
188
+ VALUE rstr;
189
+ char *str;
190
+ GET_DF;
191
+
192
+ str = df_to_s(df);
193
+ rstr = rb_str_new2(str);
194
+ free(str);
195
+ return rstr;
196
+ }
197
+
198
+ static VALUE
199
+ frt_field_new_binary(VALUE klass, VALUE rname, VALUE rdata, VALUE rstore)
200
+ {
201
+ char *data;
202
+ int len;
203
+ DocField *df;
204
+ int store = FIX2INT(rstore);
205
+ rname = rb_obj_as_string(rname);
206
+ rdata = rb_obj_as_string(rdata);
207
+ len = RSTRING(rdata)->len;
208
+ data = ALLOC_N(char, len);
209
+ MEMCPY(data, RSTRING(rdata)->ptr, char, len);
210
+
211
+ df = df_create_binary(RSTRING(rname)->ptr, data, len, store);
212
+ return Data_Wrap_Struct(klass, NULL, &df_destroy_data, df);
213
+ }
214
+
215
+ static VALUE
216
+ frt_field_set_store(VALUE self, VALUE rstore)
217
+ {
218
+ GET_DF;
219
+ int store = FIX2INT(rstore);
220
+ df_set_store(df, store);
221
+ return Qnil;
222
+ }
223
+
224
+ static VALUE
225
+ frt_field_set_term_vector(VALUE self, VALUE rterm_vector)
226
+ {
227
+ GET_DF;
228
+ int term_vector = FIX2INT(rterm_vector);
229
+ df_set_term_vector(df, term_vector);
230
+ return Qnil;
231
+ }
232
+
233
+ static VALUE
234
+ frt_field_set_index(VALUE self, VALUE rindex)
235
+ {
236
+ GET_DF;
237
+ int index = FIX2INT(rindex);
238
+ df_set_index(df, index);
239
+ return Qnil;
240
+ }
241
+
242
+ /****************************************************************************
243
+ *
244
+ * Document Methods
245
+ *
246
+ ****************************************************************************/
247
+
248
+ void
249
+ frt_doc_free(void *p)
250
+ {
251
+ object_del(p);
252
+ doc_destroy(p);
253
+ }
254
+
255
+ void
256
+ frt_doc_mark(void *p)
257
+ {
258
+ int i;
259
+ DocField *df;
260
+ Document *doc = (Document *)p;
261
+ for (i = 0; i < doc->dfcnt; i++) {
262
+ df = doc->df_arr[i];
263
+ frt_gc_mark(df);
264
+ }
265
+ }
266
+
267
+ static VALUE
268
+ frt_doc_alloc(VALUE klass)
269
+ {
270
+ Document *doc = doc_create();
271
+ doc->free_data = NULL;
272
+ VALUE self = Data_Wrap_Struct(klass, &frt_doc_mark, &frt_doc_free, doc);
273
+ object_add(doc, self);
274
+ return self;
275
+ }
276
+
277
+ VALUE
278
+ frt_get_doc(Document *doc)
279
+ {
280
+ VALUE rfield, self;
281
+ DocField *df;
282
+ int i;
283
+ HshEntry *he;
284
+ if (!doc || (self = object_get(doc)) != Qnil) return Qnil;
285
+
286
+ doc->free_data = NULL;
287
+ /* Set all fields to not free their data */
288
+ for (i = 0; i <= doc->fields->mask; i++) {
289
+ he = &doc->fields->table[i];
290
+ if (he->key != NULL && he->key != dummy_key) {
291
+ ((Array *)he->value)->free_elem = NULL;
292
+ }
293
+ }
294
+ for (i = 0; i < doc->dfcnt; i++) {
295
+ df = doc->df_arr[i];
296
+ rfield = Data_Wrap_Struct(cField, NULL, &frt_field_free, df);
297
+ object_add(df, rfield);
298
+ }
299
+ self = Data_Wrap_Struct(cDocument, &frt_doc_mark, &frt_doc_free, doc);
300
+ object_add(doc, self);
301
+ return self;
302
+ }
303
+
304
+ #define GET_DOC Document *doc; Data_Get_Struct(self, Document, doc)
305
+ static VALUE
306
+ frt_doc_init(VALUE self)
307
+ {
308
+ return self;
309
+ }
310
+
311
+ static VALUE
312
+ frt_doc_all_fields(VALUE self)
313
+ {
314
+ int i;
315
+ GET_DOC;
316
+ VALUE values = rb_ary_new2(doc->dfcnt);
317
+ for (i = 0; i < doc->dfcnt; i++) {
318
+ rb_ary_push(values, object_get(doc->df_arr[i]));
319
+ }
320
+ return values;
321
+ }
322
+
323
+ static VALUE
324
+ frt_doc_field_count(VALUE self)
325
+ {
326
+ GET_DOC;
327
+ return INT2FIX(doc->fcnt);
328
+ }
329
+
330
+ static VALUE
331
+ frt_doc_entry_count(VALUE self)
332
+ {
333
+ GET_DOC;
334
+ return INT2FIX(doc->dfcnt);
335
+ }
336
+
337
+ static VALUE
338
+ frt_doc_add_field(VALUE self, VALUE rfield)
339
+ {
340
+ DocField *df;
341
+ GET_DOC;
342
+ Data_Get_Struct(rfield, DocField, df);
343
+ doc_add_field(doc, df);
344
+ return Qnil;
345
+ }
346
+
347
+ /* TODO: return the removed fields as an array */
348
+ static VALUE
349
+ frt_doc_remove_fields(VALUE self, VALUE rname)
350
+ {
351
+ Array *fields;
352
+ GET_DOC;
353
+ rname = rb_obj_as_string(rname);
354
+ fields = doc_remove_fields(doc, RSTRING(rname)->ptr);
355
+ ary_destroy(fields);
356
+ return Qnil;
357
+ }
358
+
359
+ static VALUE
360
+ frt_doc_remove_field(VALUE self, VALUE rname)
361
+ {
362
+ DocField *df;
363
+ GET_DOC;
364
+ rname = rb_obj_as_string(rname);
365
+ df = doc_remove_field(doc, RSTRING(rname)->ptr);
366
+ return object_get(df);
367
+ }
368
+
369
+ static VALUE
370
+ frt_doc_field(VALUE self, VALUE rname)
371
+ {
372
+ GET_DOC;
373
+ DocField *df;
374
+ rname = rb_obj_as_string(rname);
375
+ df = doc_get_field(doc, RSTRING(rname)->ptr);
376
+ return object_get(df);
377
+ }
378
+
379
+ static VALUE
380
+ frt_doc_fields(VALUE self, VALUE rname)
381
+ {
382
+ int i;
383
+ VALUE fields;
384
+ GET_DOC;
385
+ Array *dfs;
386
+ rname = rb_obj_as_string(rname);
387
+ dfs = doc_get_fields(doc, RSTRING(rname)->ptr);
388
+ if (!dfs) return Qnil;
389
+ fields = rb_ary_new2(dfs->size);
390
+ for (i = 0; i < dfs->size; i++) {
391
+ rb_ary_push(fields, object_get(dfs->elems[i]));
392
+ }
393
+
394
+ return fields;
395
+ }
396
+
397
+ static VALUE
398
+ frt_doc_values(VALUE self, VALUE rname)
399
+ {
400
+ int i, len = 0, vindex = 0;
401
+ VALUE rvalues;
402
+ char *values = NULL;
403
+ GET_DOC;
404
+ Array *dfs;
405
+ DocField *df;
406
+ rname = rb_obj_as_string(rname);
407
+ dfs = doc_get_fields(doc, RSTRING(rname)->ptr);
408
+ if (!dfs) return Qnil;
409
+
410
+ for (i = 0; i < dfs->size; i++) {
411
+ df = (DocField *)dfs->elems[i];
412
+ if (df->is_binary) continue;
413
+ len += df->blen + 1;
414
+ REALLOC_N(values, char, len);
415
+ MEMCPY(values + vindex, df->data, char, df->blen);
416
+ vindex = len;
417
+ values[vindex-1] = ' ';
418
+ }
419
+ if (len) {
420
+ values[len-1] = '\0';
421
+ rvalues = rb_str_new(values, len-1);
422
+ free(values);
423
+ } else {
424
+ rvalues = Qnil;
425
+ }
426
+
427
+ return rvalues;
428
+ }
429
+
430
+ static VALUE
431
+ frt_doc_binaries(VALUE self, VALUE rname)
432
+ {
433
+ int i;
434
+ VALUE rvalues;
435
+ GET_DOC;
436
+ Array *dfs;
437
+ DocField *df;
438
+ rname = rb_obj_as_string(rname);
439
+ dfs = doc_get_fields(doc, RSTRING(rname)->ptr);
440
+ if (!dfs) return Qnil;
441
+
442
+ rvalues = rb_ary_new2(dfs->size);
443
+ for (i = 0; i < dfs->size; i++) {
444
+ df = (DocField *)dfs->elems[i];
445
+ if (!df->is_binary) continue;
446
+ rb_ary_push(rvalues, rb_str_new(df->data, df->blen));
447
+ }
448
+ return rvalues;
449
+ }
450
+
451
+ static VALUE
452
+ frt_doc_set(VALUE self, VALUE rname, VALUE rdata)
453
+ {
454
+ DocField *df;
455
+ GET_DOC;
456
+ VALUE rfield;
457
+ rname = rb_obj_as_string(rname);
458
+ rdata = rb_obj_as_string(rdata);
459
+
460
+ df = doc_get_field(doc, RSTRING(rname)->ptr);
461
+ if (df) {
462
+ free(df->data);
463
+ df->data = estrdup(RSTRING(rdata)->ptr);
464
+ rfield = object_get(df);
465
+ } else {
466
+ rfield = rb_funcall(cField, id_new, 2, rname, rdata);
467
+ Data_Get_Struct(rfield, DocField, df);
468
+ doc_add_field(doc, df);
469
+ }
470
+ return rfield;
471
+ }
472
+ static VALUE
473
+ frt_doc_to_s(VALUE self)
474
+ {
475
+ char *str;
476
+ VALUE rstr;
477
+ GET_DOC;
478
+ str = doc_to_s(doc);
479
+ rstr = rb_str_new2(str);
480
+ free(str);
481
+ return rstr;
482
+ }
483
+
484
+ static VALUE
485
+ frt_doc_get_boost(VALUE self)
486
+ {
487
+ GET_DOC;
488
+ return rb_float_new((double)doc->boost);
489
+ }
490
+
491
+ static VALUE
492
+ frt_doc_set_boost(VALUE self, VALUE rboost)
493
+ {
494
+ GET_DOC;
495
+ doc->boost = (float)rb_num2dbl(rboost);
496
+ return Qnil;
497
+ }
498
+
499
+
500
+ /****************************************************************************
501
+ *
502
+ * Init Function
503
+ *
504
+ ****************************************************************************/
505
+
506
+ void
507
+ Init_doc(void)
508
+ {
509
+ /* Field */
510
+ cField = rb_define_class_under(mDocument, "Field", rb_cObject);
511
+ rb_define_alloc_func(cField, frt_field_alloc);
512
+
513
+ rb_define_method(cField, "initialize", frt_field_init, -1);
514
+ rb_define_singleton_method(cField, "new_binary_field",
515
+ frt_field_new_binary, 3);
516
+ rb_define_method(cField, "name", frt_field_get_name, 0);
517
+ rb_define_method(cField, "name=", frt_field_set_name, 1);
518
+ rb_define_method(cField, "data", frt_field_get_data, 0);
519
+ rb_define_method(cField, "data=", frt_field_set_data, 1);
520
+ rb_define_method(cField, "boost", frt_field_get_boost, 0);
521
+ rb_define_method(cField, "boost=", frt_field_set_boost, 1);
522
+ rb_define_method(cField, "stored?", frt_field_is_stored, 0);
523
+ rb_define_method(cField, "indexed?", frt_field_is_indexed, 0);
524
+ rb_define_method(cField, "tokenized?", frt_field_is_tokenized, 0);
525
+ rb_define_method(cField, "binary?", frt_field_is_binary, 0);
526
+ rb_define_method(cField, "compressed?", frt_field_is_compressed, 0);
527
+ rb_define_method(cField, "store_term_vector?", frt_field_store_tv, 0);
528
+ rb_define_method(cField, "store_positions?", frt_field_store_pos, 0);
529
+ rb_define_method(cField, "store_offsets?", frt_field_store_offset, 0);
530
+ rb_define_method(cField, "omit_norms?", frt_field_omit_norms, 0);
531
+ rb_define_method(cField, "to_s", frt_field_to_s, 0);
532
+ rb_define_method(cField, "store=", frt_field_set_store, 1);
533
+ rb_define_method(cField, "index=", frt_field_set_index, 1);
534
+ rb_define_method(cField, "term_vector=", frt_field_set_term_vector, 1);
535
+
536
+ /* Field Constants */
537
+ cFieldStore = rb_define_class_under(cField, "Store", rb_cObject);
538
+ rb_define_const(cFieldStore, "YES", INT2FIX(DF_STORE_YES));
539
+ rb_define_const(cFieldStore, "NO", INT2FIX(DF_STORE_NO));
540
+ rb_define_const(cFieldStore, "COMPRESS", INT2FIX(DF_STORE_COMPRESS));
541
+ cFieldIndex = rb_define_class_under(cField, "Index", rb_cObject);
542
+ rb_define_const(cFieldIndex, "UNTOKENIZED", INT2FIX(DF_INDEX_UNTOKENIZED));
543
+ rb_define_const(cFieldIndex, "TOKENIZED", INT2FIX(DF_INDEX_TOKENIZED));
544
+ rb_define_const(cFieldIndex, "NO", INT2FIX(DF_INDEX_NO));
545
+ rb_define_const(cFieldIndex, "NO_NORMS", INT2FIX(DF_INDEX_NO_NORMS));
546
+ cFieldTermVector = rb_define_class_under(cField, "TermVector", rb_cObject);
547
+ rb_define_const(cFieldTermVector, "NO", INT2FIX(DF_TERM_VECTOR_NO));
548
+ rb_define_const(cFieldTermVector, "YES", INT2FIX(DF_TERM_VECTOR_YES));
549
+ rb_define_const(cFieldTermVector, "WITH_POSITIONS",
550
+ INT2FIX(DF_TERM_VECTOR_WITH_POSITIONS));
551
+ rb_define_const(cFieldTermVector, "WITH_OFFSETS",
552
+ INT2FIX(DF_TERM_VECTOR_WITH_OFFSETS));
553
+ rb_define_const(cFieldTermVector, "WITH_POSITIONS_OFFSETS",
554
+ INT2FIX(DF_TERM_VECTOR_WITH_POSITIONS_OFFSETS));
555
+
556
+ /* Document */
557
+ cDocument = rb_define_class_under(mDocument, "Document", rb_cObject);
558
+ rb_define_alloc_func(cDocument, frt_doc_alloc);
559
+
560
+ rb_define_method(cDocument, "initialize", frt_doc_init, 0);
561
+ rb_define_method(cDocument, "all_fields", frt_doc_all_fields, 0);
562
+ rb_define_method(cDocument, "field_count", frt_doc_field_count, 0);
563
+ rb_define_method(cDocument, "entry_count", frt_doc_entry_count, 0);
564
+ rb_define_method(cDocument, "add_field", frt_doc_add_field, 1);
565
+ rb_define_method(cDocument, "<<", frt_doc_add_field, 1);
566
+ rb_define_method(cDocument, "remove_fields", frt_doc_remove_fields, 1);
567
+ rb_define_method(cDocument, "remove_field", frt_doc_remove_field, 1);
568
+ rb_define_method(cDocument, "field", frt_doc_field, 1);
569
+ rb_define_method(cDocument, "fields", frt_doc_fields, 1);
570
+ rb_define_method(cDocument, "values", frt_doc_values, 1);
571
+ rb_define_method(cDocument, "binaries", frt_doc_binaries, 1);
572
+ rb_define_method(cDocument, "[]", frt_doc_values, 1);
573
+ rb_define_method(cDocument, "set", frt_doc_set, 2);
574
+ rb_define_method(cDocument, "[]=", frt_doc_set, 2);
575
+ rb_define_method(cDocument, "to_s", frt_doc_to_s, 0);
576
+ rb_define_method(cDocument, "boost", frt_doc_get_boost, 0);
577
+ rb_define_method(cDocument, "boost=", frt_doc_set_boost, 1);
578
+ }